def setUp(self):
    """Build a synthetic multi-exon transcript on chr1 and index it.

    Nine exons of 100 bases are laid out on the "+" strand, starting at
    offset 1000 and separated by 900-base introns.  For every exon that
    does not start in frame 0, the positions of the codon split across
    the preceding intron are recorded in ``mSplitCodonsPrev`` and
    ``mSplitCodonsNext``.  The matching sequence (coding bases "123...",
    introns "GT" + T's + "AG", flanked by C's and G's) is written to an
    indexed fasta database that is opened as ``self.mFasta``.
    """
    # geometry of the artificial gene model
    self.mSpliceSize = 4
    self.mExonSize = 100
    self.mIntronSize = 900
    self.mNExons = 9
    self.mOffset = 1000
    self.mIncrement = self.mIntronSize + self.mExonSize
    self.strand = "+"
    self.frame = 0

    self.mExons = []
    self.mSplitCodonsNext = {}
    self.mSplitCodonsPrev = {}

    # start with the coding bases only; introns are spliced in below
    bases = list("123" * int((self.mNExons * self.mExonSize) / 3))

    coding_length = 0
    exon_start = self.mOffset
    for exon_index in range(self.mNExons):
        exon = GTF.Entry()
        exon.contig = "chr1"
        exon.strand = "+"
        exon.gene_id = "gene1"
        exon.transcript_id = "trans1"
        exon.start = exon_start
        exon.end = exon_start + self.mExonSize
        exon.frame = (3 - (coding_length % 3)) % 3
        coding_length += exon.end - exon.start
        self.mExons.append(exon)

        if exon.frame != 0:
            # the codon straddles the intron that ends at exon_start
            upstream = exon_start - self.mIntronSize
            for shift in range(exon.frame):
                self.mSplitCodonsPrev[exon_start + shift] = upstream
            for shift in range(3 - exon.frame):
                self.mSplitCodonsNext[upstream - shift - 1] = exon_start

        exons_done = exon_index + 1
        if exons_done < self.mNExons:
            # insert one intron directly after the exon just processed
            insert_at = (exons_done * self.mExonSize +
                         self.mIntronSize * (exons_done - 1))
            intron = "GT" + "T" * (self.mIntronSize - 4) + "AG"
            bases[insert_at:insert_at] = list(intron)

        exon_start += self.mIncrement

    tmpfile = tempfile.NamedTemporaryFile()
    tmpfile.close()

    # flank the gene with mOffset C's upstream and mOffset G's downstream
    sequence = ("C" * self.mOffset) + "".join(bases) + ("G" * self.mOffset)
    self.mSequence = sequence
    self.contigSize = len(sequence)

    IndexedFasta.createDatabase(tmpfile.name, iter([("chr1", sequence), ]))
    self.mFasta = IndexedFasta.IndexedFasta(tmpfile.name)
def setUp(self):
    """Create a scratch directory holding an indexed poly-A genome.

    Sets the paths used by the tests (``outfile_genome``, ``outfile_gtf``,
    ``outfile_output``), builds a database for a single contig "chr1" of
    ``self.length`` A's and initialises ``self.reference`` with the same
    number of "g" entries.
    """
    scratch = tempfile.mkdtemp()
    self.tmpdir = scratch
    (self.outfile_genome,
     self.outfile_gtf,
     self.outfile_output) = [os.path.join(scratch, leaf)
                             for leaf in ("genome_in", "exons.gtf", "output")]

    self.length = 1000

    records = [("chr1", "A" * self.length)]
    IndexedFasta.createDatabase(self.outfile_genome, iter(records))

    self.reference = list("g" * self.length)
def setUp(self):
    """Prepare a temporary working directory and a one-contig genome.

    The genome consists of contig "chr1" made of ``self.length`` (1000)
    A's and is written as a database to ``self.outfile_genome``.
    ``self.reference`` starts out as a list of "g" of the same length.
    """
    self.tmpdir = tempfile.mkdtemp()

    def in_tmpdir(filename):
        # all fixture files live below the freshly created directory
        return os.path.join(self.tmpdir, filename)

    self.outfile_genome = in_tmpdir("genome_in")
    self.outfile_gtf = in_tmpdir("exons.gtf")
    self.outfile_output = in_tmpdir("output")

    self.length = 1000
    contig_sequence = "A" * self.length
    IndexedFasta.createDatabase(self.outfile_genome,
                                iter((("chr1", contig_sequence), )))

    self.reference = ["g" for _ in range(self.length)]
def main(argv=None):
    """Command line entry point for building and querying a fasta index.

    Depending on the options, one of the following is done with the
    database named by the first positional argument:

    * ``--extract``: write one region in fasta format to stdout.
    * ``--benchmark``: time random fragment retrieval.
    * ``--verify``: compare the database against another one, in both
      directions, and report the number of errors.
    * ``--compress-index``: compress the index of the database.
    * otherwise: create the database from the fasta files given as the
      remaining positional arguments.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.Start`` in this block.
    :raises ValueError: if an unknown translator name is supplied.

    Exits with status 1 after printing the module documentation if
    fewer than two positional arguments are given in build mode.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is contig:strand:from:to. "
        "The default coordinates are 0-based open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.")

    parser.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug"),
        help="compress database [default=%default].")

    parser.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="save random access points every # number of nucleotides [default=%default].")

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("one-forward-open", "zero-both-open"),
        help="coordinate format of input [default=%default].")

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]")

    parser.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access [default=%default].")

    parser.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark [default=%default].")

    parser.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")

    parser.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")

    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"),
        help="file format of input. Supply if data comes from stdin [default=%default].")

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence",
        action="store_true",
        help="remove X/x from DNA sequences - they cause errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the identifier from fasta description line [default=%default].")

    parser.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index [default=%default].")

    parser.add_option(
        "--force", dest="force", action="store_true",
        help="force overwriting of existing files [default=%default].")

    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=("solexa", "phred", "bytes", "range200"),
        help="translate numerical quality scores [default=%default].")

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    # "a=b,a=c" becomes {"a": ["b", "c"]}
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, alias = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(alias.strip())
    else:
        synonyms = None

    # replace the translator name by a translator instance
    if options.translator:
        if options.translator == "phred":
            options.translator = TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = TranslatorBytes()
        elif options.translator == "range200":
            options.translator = TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        # the timed statement runs in its own namespace, so the setup
        # code re-imports what it needs from __main__
        timer = timeit.Timer(
            stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup="""from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" %
            (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size,
                              t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # NOTE(review): the first call uses a bare ``verify`` and the
        # second ``IndexedFasta.verify``; kept as found - confirm which
        # name is in scope.
        nerrors1 = verify(fasta1, fasta2,
                          options.verify_num_iterations,
                          options.verify_fragment_size,
                          stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Validate the positional arguments before they are used: the
        # logging below indexes args[0] and would otherwise fail with an
        # IndexError instead of showing the usage.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Quality scores are read from stdin through ``FromFastaIterator``.
    With ``--output-format=fasta`` the records are either written to
    stdout or, if ``--build-index`` is given, stored in an indexed
    database.  With ``--output-format=fastq`` each quality record is
    paired with the next record of ``--filename-sequences`` and written
    as a fastq entry.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.Start`` in this block.
    :raises ValueError: if fastq output is requested without a sequence
        file.
    :raises AssertionError: if a quality and a sequence record carry
        different identifiers.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-f", "--format", dest="format", type="choice",
        choices=("fasta", ),
        help="input format [%default].")

    parser.add_option(
        "-o", "--output-format", dest="output_format", type="choice",
        choices=("fasta", "fastq"),
        help="output format - if fastq is chosen, also supply a sequence file [%default].")

    parser.add_option(
        "-a", "--alphabet", dest="alphabet", type="choice",
        choices=("fastq", "solexa", "printable"),
        help="characters to use for quality scores [%default].")

    parser.add_option(
        "-e", "--encoding", dest="encoding", type="choice",
        choices=("phred", "solexa"),
        help="encoding of quality scores [%default].")

    parser.add_option(
        "-i", "--build-index", dest="build_index", type="string",
        help="build an index. Supply the database name [%default].")

    parser.add_option(
        "-s", "--filename-sequences", dest="filename_sequences",
        type="string",
        help="input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default].")

    parser.add_option(
        "-d", "--set-to-default", dest="default_value", type="int",
        help="set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default].")

    parser.set_defaults(
        format="fasta",
        output_format="fasta",
        build_index=None,
        filename_sequences=None,
        alphabet="fastq",
        encoding="phred",
        default_value=None,
    )

    (options, args) = E.Start(parser)

    if options.format == "fasta":
        iterator = FromFastaIterator(sys.stdin,
                                     alphabet=options.alphabet,
                                     default=options.default_value)

    if options.output_format == "fasta":
        if options.build_index:
            IndexedFasta.createDatabase(options.build_index, iterator)
        else:
            while True:
                try:
                    title, scores = iterator.next()
                except StopIteration:
                    break
                options.stdout.write(">%s\n%s\n" % (title, scores))

    elif options.output_format == "fastq":
        if not options.filename_sequences:
            # string exceptions are not valid; raise a real exception
            raise ValueError("please supply a filename with sequences.")

        # the sequence file stays open only while records are paired
        with open(options.filename_sequences, "r") as sequence_file:
            iterator_sequence = FastaIterator.FastaIterator(sequence_file)

            while True:
                # Advance both inputs independently so that the shorter
                # one can be identified once either is exhausted.
                try:
                    qual = iterator.next()
                except StopIteration:
                    qual = None
                try:
                    seq = iterator_sequence.next()
                except StopIteration:
                    seq = None

                if qual is None or seq is None:
                    if qual is not None:
                        options.stdlog.write("# sequence file incomplete\n")
                    elif seq is not None:
                        options.stdlog.write("# quality file incomplete\n")
                    # stop instead of unpacking a missing record
                    break

                qt, qs = qual
                st, ss = seq.title, seq.sequence
                assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (qt, st)
                options.stdout.write("@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
            (iterator.mNInput,
             iterator.mNOutput,
             iterator.mNOverFlow,
             iterator.mNUnderFlow))

    E.Stop()
def main(argv=None):
    """Command line entry point for building and querying a fasta index.

    The first positional argument names the database.  Depending on the
    options the script extracts a region (``--extract``), benchmarks
    random access (``--benchmark``), verifies the database against a
    second one (``--verify``), compresses the index
    (``--compress-index``) or, by default, builds the database from the
    fasta files given as the remaining positional arguments.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.Start`` in this block.
    :raises ValueError: if an unknown translator name is supplied.

    Exits with status 1 after printing the module documentation if
    fewer than two positional arguments are given in build mode.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=input_format_choices,
        help="coordinate format of input. Valid choices are "
        "%s. See --extract. [default=%%default]." %
        ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access "
        "[default=%default].")
    group.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark "
        "[default=%default].")
    group.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")
    group.add_option(
        "--verify-iterations", dest="verify_num_iterations", type="int",
        help="number of iterations for verification "
        "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=file_format_choices,
        help="file format of input. Supply if data comes "
        "from stdin "
        "Valid choices are %s [default=%%default]." %
        ", ".join(file_format_choices))

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence",
        action="store_true",
        help="remove X/x from DNA sequences - they cause "
        "errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances "
        "of an identifier are suffixed by an '_%i' "
        "[default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the "
        "identifier from fasta description line "
        "[default=%default].")

    parser.add_option(
        "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files "
        "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=translator_choices,
        help="translate numerical quality scores. "
        "Valid choices are %s [default=%%default]." %
        ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=compression_choices,
        help="compress database, using specified compression "
        "method. "
        "Valid choices are %s, but depend on availability on the "
        "system "
        "[default=%%default]." % ", ".join(compression_choices))
    group.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="set random access points every # number "
        "of nucleotides for block compression schemes "
        "[default=%default].")
    group.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")
    parser.add_option_group(group)

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    # "a=b,a=c" becomes {"a": ["b", "c"]}
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, alias = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(alias.strip())
    else:
        synonyms = None

    # replace the translator name by a translator instance
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        # the timed statement runs in its own namespace, so the setup
        # code re-imports what it needs from __main__
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from __main__ import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size,
                              t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # verification is run in both directions
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Validate the positional arguments before they are used: the
        # logging below indexes args[0] and would otherwise fail with an
        # IndexError instead of showing the usage.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
def main(argv=None):
    """Command line entry point for building and querying a fasta index.

    The first positional argument names the database.  Depending on the
    options the script extracts a region (``--extract``), benchmarks
    random access (``--benchmark``), verifies the database against a
    second one (``--verify``), compresses the index
    (``--compress-index``) or, by default, builds the database from the
    fasta files given as the remaining positional arguments.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.Start`` in this block.
    :raises ValueError: if an unknown translator name is supplied.

    Exits with status 1 after printing the module documentation if
    fewer than two positional arguments are given in build mode.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return "
        "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    parser.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=compression_choices,
        help="compress database, using specied compression. "
        "Valid choices are %s. "
        "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="save random access points every # number "
        "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=input_format_choices,
        help="coordinate format of input. Valid choices are "
        "%s [default=%%default]." % ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms, comma separated with =, "
        "for example, chr1=chr1b [default=%default]")

    parser.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access "
        "[default=%default].")

    parser.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark "
        "[default=%default].")

    parser.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")

    parser.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")

    parser.add_option(
        "--verify-iterations", dest="verify_num_iterations", type="int",
        help="number of iterations for verification "
        "[default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=file_format_choices,
        help="file format of input. Supply if data comes "
        "from stdin "
        "Valid choices are %s [default=%%default]." %
        ", ".join(file_format_choices))

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence",
        action="store_true",
        help="remove X/x from DNA sequences - they cause "
        "errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances "
        "of an identifier are suffixed by an '_%i' "
        "[default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the "
        "identifier from fasta description line "
        "[default=%default].")

    parser.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index [default=%default].")

    parser.add_option(
        "--force", dest="force", action="store_true",
        help="force overwriting of existing files "
        "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=translator_choices,
        help="translate numerical quality scores. "
        "Valid choices are %s [default=%%default]." %
        ", ".join(translator_choices))

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.Start(parser)

    # "a=b,a=c" becomes {"a": ["b", "c"]}
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, alias = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(alias.strip())
    else:
        synonyms = None

    # replace the translator name by a translator instance
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        # the timed statement runs in its own namespace, so the setup
        # code re-imports what it needs from __main__
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" %
            (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size,
                              t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # verification is run in both directions
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Validate the positional arguments before they are used: the
        # logging below indexes args[0] and would otherwise fail with an
        # IndexError instead of showing the usage.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
alphabet = "fastq", encoding = "phred", default_value = None, ) (options, args) = E.Start( parser ) ninput, noutput = 0, 0 if options.format == "fasta": iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value ) if options.output_format == "fasta": if options.build_index: IndexedFasta.createDatabase( options.build_index, iterator ) else: while 1: try: r = iterator.next() except StopIteration: break t,s = r options.stdout.write( ">%s\n%s\n" % (t,s)) elif options.output_format == "fastq": if not options.filename_sequences: raise "please supply a filename with sequences." iterator_sequence = FastaIterator.FastaIterator( open( options.filename_sequences, "r" ) )
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Quality scores are read from stdin through ``FromFastaIterator``.
    With ``--output-format=fasta`` the records are either written to
    stdout or, if ``--build-index`` is given, stored in an indexed
    database.  With ``--output-format=fastq`` each quality record is
    paired with the next record of ``--filename-sequences`` and written
    as a fastq entry.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.start`` in this block.
    :raises ValueError: if fastq output is requested without a sequence
        file.
    :raises AssertionError: if a quality and a sequence record carry
        different identifiers.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-f", "--format", dest="format", type="choice",
        choices=("fasta", ),
        help="input format [%default].")

    parser.add_option(
        "-o", "--output-format", dest="output_format", type="choice",
        choices=("fasta", "fastq"),
        help="output format - if fastq is chosen, also supply a sequence file [%default].")

    parser.add_option(
        "-a", "--alphabet", dest="alphabet", type="choice",
        choices=("fastq", "solexa", "printable"),
        help="characters to use for quality scores [%default].")

    parser.add_option(
        "-e", "--encoding", dest="encoding", type="choice",
        choices=("phred", "solexa"),
        help="encoding of quality scores [%default].")

    parser.add_option(
        "-i", "--build-index", dest="build_index", type="string",
        help="build an index. Supply the database name [%default].")

    parser.add_option(
        "-s", "--filename-sequences", dest="filename_sequences",
        type="string",
        help="input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default].")

    parser.add_option(
        "-d", "--set-to-default", dest="default_value", type="int",
        help="set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default].")

    parser.set_defaults(
        format="fasta",
        output_format="fasta",
        build_index=None,
        filename_sequences=None,
        alphabet="fastq",
        encoding="phred",
        default_value=None,
    )

    (options, args) = E.start(parser)

    if options.format == "fasta":
        iterator = FromFastaIterator(sys.stdin,
                                     alphabet=options.alphabet,
                                     default=options.default_value)

    if options.output_format == "fasta":
        if options.build_index:
            IndexedFasta.createDatabase(options.build_index, iterator)
        else:
            for title, scores in iter(lambda: next(iterator, None), None):
                options.stdout.write(">%s\n%s\n" % (title, scores))

    elif options.output_format == "fastq":
        if not options.filename_sequences:
            raise ValueError("please supply a filename with sequences")

        # the sequence file stays open only while records are paired
        with IOTools.open_file(options.filename_sequences, "r") as sequence_file:
            iterator_sequence = FastaIterator.FastaIterator(sequence_file)

            while True:
                # Advance both inputs independently so that the shorter
                # one can be identified once either is exhausted.
                qual = next(iterator, None)
                seq = next(iterator_sequence, None)

                if qual is None or seq is None:
                    if qual is not None:
                        options.stdlog.write("# sequence file incomplete\n")
                    elif seq is not None:
                        options.stdlog.write("# quality file incomplete\n")
                    # stop instead of unpacking a missing record
                    break

                qt, qs = qual
                st, ss = seq.title, seq.sequence
                assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (
                    qt, st)
                options.stdout.write("@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
            (iterator.mNInput,
             iterator.mNOutput,
             iterator.mNOverFlow,
             iterator.mNUnderFlow))

    E.stop()