Пример #1
0
    def setUp(self):
        """Build a synthetic single-transcript gene and an indexed genome for it.

        Creates ``mNExons`` exons of ``mExonSize`` bases each on contig
        "chr1", forward strand, separated by introns of ``mIntronSize``
        bases, with the first exon starting at ``mOffset``.  The exon
        entries are stored in ``self.mExons``, the codons that straddle an
        intron are recorded in ``self.mSplitCodonsPrev`` /
        ``self.mSplitCodonsNext``, and the matching genomic sequence is
        written to an indexed fasta database opened as ``self.mFasta``.
        """

        self.mExons = []

        # Genomic position -> exon boundary on the other side of the intron,
        # filled in below for positions that belong to a split codon.
        self.mSplitCodonsNext = {}
        self.mSplitCodonsPrev = {}

        self.mSpliceSize = 4
        self.mExonSize = 100
        self.mIntronSize = 900
        self.strand = "+"
        self.mNExons = 9
        self.mOffset = 1000
        # Running total of exon bases seen so far; used to derive the frame.
        length = 0
        self.frame = 0
        # Distance between the start coordinates of two consecutive exons.
        self.mIncrement = self.mIntronSize + self.mExonSize

        # Exonic sequence only: the codon "123" repeated, so every base
        # reveals its codon position.  Introns are spliced in inside the loop.
        seq = list("123" * int((self.mNExons * self.mExonSize) / 3))

        exon_id = 0

        start = self.mOffset
        for x in range(self.mNExons):

            e = GTF.Entry()
            e.contig, e.strand, e.gene_id, e.transcript_id = "chr1", "+", "gene1", "trans1"
            e.start, e.end = start, start + self.mExonSize
            # Number of leading bases in this exon that complete a codon
            # begun in the previous exon (0 when the exon starts in frame).
            e.frame = (3 - (length % 3)) % 3
            length += e.end - e.start
            self.mExons.append(e)
            if e.frame != 0:
                # The first e.frame bases of this exon map back to
                # start - mIntronSize, i.e. the end of the previous exon.
                for y in range(0, e.frame):
                    self.mSplitCodonsPrev[start + y] = start - self.mIntronSize
                # The last 3 - e.frame bases of the previous exon map
                # forward to the start of this exon.
                for y in range(0, 3 - e.frame):
                    self.mSplitCodonsNext[
                        start - self.mIntronSize - y - 1] = start

            exon_id += 1
            if exon_id < self.mNExons:
                # Index in seq just after the exon_id-th exon, counting the
                # introns already inserted (the leading offset is added later).
                p = exon_id * self.mExonSize + self.mIntronSize * (exon_id - 1)
                # Each insertion goes in at the same index p, so the pieces
                # are added in reverse; the intron reads "GT" + T-run + "AG".
                seq[p:p] = list("AG")
                # The 4 accounts for the GT and AG bases, keeping the intron
                # exactly mIntronSize long.
                seq[p:p] = list("T" * (self.mIntronSize - 4))
                seq[p:p] = list("GT")

            start += self.mIncrement
            # print str(e)
        # print self.mSplitCodonsNext
        # print self.mSplitCodonsPrev
        # Pad the gene with mOffset bases of "C" in front and "G" behind.
        seq[0:0] = "C" * self.mOffset
        # Appended as one multi-character element; the join below flattens it.
        seq.append("G" * self.mOffset)
        # The temporary file is closed straight away; only its name is
        # reused below as the path for the fasta database.
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.close()

        seq = "".join(seq)
        self.mSequence = seq
        self.contigSize = len(seq)
        IndexedFasta.createDatabase(tmpfile.name, iter([("chr1", seq), ]))
        self.mFasta = IndexedFasta.IndexedFasta(tmpfile.name)
Пример #2
0
    def setUp(self):
        """Prepare a scratch directory holding an indexed 1000-base genome.

        Sets ``self.tmpdir`` plus the genome, GTF and output paths inside
        it, builds an indexed fasta database with a single contig "chr1"
        consisting only of "A", and initialises ``self.reference`` to a
        list of "g" with one entry per base.
        """
        self.tmpdir = tempfile.mkdtemp()

        # All files used by the test live inside the scratch directory.
        self.outfile_genome, self.outfile_gtf, self.outfile_output = [
            os.path.join(self.tmpdir, basename)
            for basename in ("genome_in", "exons.gtf", "output")]

        self.length = 1000

        # One contig, chr1, made up of self.length "A" bases.
        contigs = [("chr1", "A" * self.length)]
        IndexedFasta.createDatabase(self.outfile_genome, iter(contigs))

        self.reference = self.length * ["g"]
Пример #3
0
    def setUp(self):
        """Create the temporary work area and the indexed test genome.

        A fresh temporary directory is stored in ``self.tmpdir`` together
        with the paths of the genome database, the GTF file and the output
        file within it.  The genome is one contig, "chr1", of
        ``self.length`` "A" bases; ``self.reference`` starts out as a list
        of "g" of the same length.
        """
        workdir = tempfile.mkdtemp()
        self.tmpdir = workdir

        self.outfile_genome = os.path.join(workdir, "genome_in")
        self.outfile_gtf = os.path.join(workdir, "exons.gtf")
        self.outfile_output = os.path.join(workdir, "output")

        n_bases = 1000
        self.length = n_bases

        # Index a single poly-A contig as the genome.
        records = iter([("chr1", "A" * n_bases)])
        IndexedFasta.createDatabase(self.outfile_genome, records)

        self.reference = ["g"] * n_bases
Пример #4
0
def main(argv=None):
    """Command-line entry point for building and querying an indexed fasta database.

    Depending on the options given, exactly one of the following is done
    (checked in this order):

    * ``--extract``: print one region of the database in ``args[0]``.
    * ``--benchmark``: time random fragment retrieval from ``args[0]``.
    * ``--verify``: compare the database in ``args[0]`` against another
      one, in both directions, and print the error counts.
    * ``--compress-index``: compress the index of ``args[0]``.
    * otherwise: create the database ``args[0]`` from the fasta files in
      ``args[1:]``.

    :param argv: argument vector; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here, so the
        parser presumably reads ``sys.argv`` regardless - confirm.
    :raises ValueError: if a synonym is not of the form ``a=b`` or the
        translator name is unknown.

    Exits with status 1 after printing the module documentation when the
    database is to be created but fewer than two positional arguments
    were supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is contig:strand:from:to. "
        "The default coordinates are 0-based open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.")

    parser.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug"),
        help="compress database [default=%default].")

    parser.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="save random access points every # number of nucleotides [default=%default].")

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("one-forward-open", "zero-both-open"),
        help="coordinate format of input [default=%default].")

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]")

    parser.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access [default=%default].")

    parser.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark [default=%default].")

    parser.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")

    parser.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")

    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"),
        help="file format of input. Supply if data comes from stdin [default=%default].")

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence", action="store_true",
        help="remove X/x from DNA sequences - they cause errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the identifier from fasta description line [default=%default].")

    parser.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index [default=%default].")

    parser.add_option(
        "--force", dest="force", action="store_true",
        help="force overwriting of existing files [default=%default].")

    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=("solexa", "phred", "bytes", "range200"),
        help="translate numerical quality scores [default=%default].")

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    # Parse "a=b,a=c" into {"a": ["b", "c"]}.
    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            synonyms.setdefault(a.strip(), []).append(b.strip())
    else:
        synonyms = None

    # Replace the translator name by an instance of the matching class.
    if options.translator:
        if options.translator == "phred":
            options.translator = TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = TranslatorBytes()
        elif options.translator == "range200":
            options.translator = TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))
    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup="""from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )"""
            % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # NOTE: %i truncates the elapsed time to whole seconds.
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))
    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # Verify in both directions; both passes use the same
        # IndexedFasta.verify so that neither depends on a bare name
        # being present in this module.
        nerrors1 = IndexedFasta.verify(fasta1,
                                       fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        # Validate the positional arguments before touching args[0], so a
        # missing database name yields the usage text, not an IndexError.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
Пример #5
0
def main(argv=None):
    """script main.

    Reads quality scores in fasta format from stdin and either

    * writes them as fasta to stdout, or builds an indexed database from
      them when ``--build-index`` is given (output format "fasta"), or
    * merges them with the sequences in ``--filename-sequences`` and
      writes fastq records to stdout (output format "fastq").

    :param argv: argument vector; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here, so the
        parser presumably reads ``sys.argv`` regardless - confirm.
    :raises ValueError: if fastq output is requested without a sequence
        file.
    :raises AssertionError: if a quality record and the sequence record
        at the same position carry different identifiers.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("fasta", ),
                      help="input format [%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("fasta", "fastq"),
                      help="output format - if fastq is chosen, also supply a sequence file [%default].")

    parser.add_option("-a", "--alphabet", dest="alphabet", type="choice",
                      choices=("fastq", "solexa", "printable"),
                      help="characters to use for quality scores [%default].")

    parser.add_option("-e", "--encoding", dest="encoding", type="choice",
                      choices=("phred", "solexa"),
                      help="encoding of quality scores [%default].")

    parser.add_option("-i", "--build-index", dest="build_index", type="string",
                      help="build an index. Supply the database name [%default].")

    parser.add_option("-s", "--filename-sequences", dest="filename_sequences", type="string",
                      help="input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default].")

    parser.add_option("-d", "--set-to-default", dest="default_value", type="int",
                      help="set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default].")

    parser.set_defaults(
        format="fasta",
        output_format="fasta",
        build_index=None,
        filename_sequences=None,
        alphabet="fastq",
        encoding="phred",
        default_value=None,
    )

    (options, args) = E.Start(parser)

    # "fasta" is the only choice the parser accepts, so iterator is
    # always bound after this point.
    if options.format == "fasta":
        iterator = FromFastaIterator(sys.stdin,
                                     alphabet=options.alphabet,
                                     default=options.default_value)

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase(options.build_index,
                                        iterator)
        else:
            while True:
                try:
                    t, s = iterator.next()
                except StopIteration:
                    break
                options.stdout.write(">%s\n%s\n" % (t, s))

    elif options.output_format == "fastq":

        if not options.filename_sequences:
            # String exceptions are not valid; raise a real exception.
            raise ValueError("please supply a filename with sequences.")

        # The context manager closes the sequence file once merging is done.
        with open(options.filename_sequences, "r") as infile:
            iterator_sequence = FastaIterator.FastaIterator(infile)

            while True:
                # Advance both inputs independently so that a shortfall on
                # either side can be detected and reported.
                try:
                    qual = iterator.next()
                except StopIteration:
                    qual = None
                try:
                    seq = iterator_sequence.next()
                except StopIteration:
                    seq = None

                if qual is None or seq is None:
                    if qual is not None:
                        options.stdlog.write("# sequence file incomplete\n")
                    elif seq is not None:
                        options.stdlog.write("# quality file incomplete\n")
                    # Stop as soon as either input is exhausted.
                    break

                qt, qs = qual
                st, ss = seq.title, seq.sequence
                assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (qt, st)
                options.stdout.write("@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
            (iterator.mNInput,
             iterator.mNOutput,
             iterator.mNOverFlow,
             iterator.mNUnderFlow))

    E.Stop()
Пример #6
0
def main(argv=None):
    """Command-line entry point for building and querying an indexed fasta database.

    Depending on the options given, exactly one of the following is done
    (checked in this order):

    * ``--extract``: print one region of the database in ``args[0]``.
    * ``--benchmark``: time random fragment retrieval from ``args[0]``.
    * ``--verify``: compare the database in ``args[0]`` against another
      one, in both directions, and print the error counts.
    * ``--compress-index``: compress the index of ``args[0]``.
    * otherwise: create the database ``args[0]`` from the fasta files in
      ``args[1:]``.

    :param argv: argument vector; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here, so the
        parser presumably reads ``sys.argv`` regardless - confirm.
    :raises ValueError: if a synonym is not of the form ``a=b`` or the
        translator name is unknown.

    Exits with status 1 after printing the module documentation when the
    database is to be created but fewer than two positional arguments
    were supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option("-b",
                     "--benchmark",
                     dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations",
                     type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size",
                     type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify",
                     dest="verify",
                     type="string",
                     help="verify against other database [default=%default].")

    group.add_option("--verify-iterations",
                     dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a",
                      "--clean-sequence",
                      dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier",
                      dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t",
                      "--translator",
                      dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c",
                     "--compression",
                     dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability on the "
                     "system "
                     "[default=%%default]." % ", ".join(compression_choices))

    group.add_option("--random-access-points",
                     dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")

    group.add_option(
        "--compress-index",
        dest="compress_index",
        action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")

    parser.add_option_group(group)

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    # Parse "a=b,a=c" into {"a": ["b", "c"]}.
    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            synonyms.setdefault(a.strip(), []).append(b.strip())
    else:
        synonyms = None

    # Replace the translator name by an instance of the matching class.
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from __main__ import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # NOTE: %i truncates the elapsed time to whole seconds.
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # Verify in both directions and report each error count.
        nerrors1 = IndexedFasta.verify(fasta1,
                                       fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        # Validate the positional arguments before touching args[0], so a
        # missing database name yields the usage text, not an IndexError.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
Пример #7
0
def main(argv=None):
    """Build, query, benchmark or verify an indexed fasta database.

    Command line options are parsed from *argv*; ``sys.argv`` is used
    when *argv* is not given.  The first positional argument is always
    the database name.  Depending on the options, the script then

    * extracts a region (``--extract``),
    * benchmarks random read access (``--benchmark``),
    * verifies the database against another one (``--verify``),
    * compresses the index (``--compress-index``), or
    * creates a new database from the fasta files given as the
      remaining positional arguments (default action).

    When creating a database and fewer than two positional arguments
    are supplied, the module documentation is printed and the script
    exits with status 1.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--extract", dest="extract", type="string",
                      help="extract region for testing purposes. Format is "
                      "contig:strand:from:to. "
                      "The default coordinates are 0-based "
                      "open/closed coordinates on both strands. "
                      "For example, chr1:+:10:12 will return "
                      "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    parser.add_option("-c", "--compression", dest="compression", type="choice",
                      choices=compression_choices,
                      help="compress database, using specied compression. "
                      "Valid choices are %s. "
                      "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option("--random-access-points", dest="random_access_points",
                      type="int",
                      help="save random access points every # number "
                      "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option("-s", "--synonyms", dest="synonyms", type="string",
                      help="list of synonyms, comma separated with =, "
                      "for example, chr1=chr1b [default=%default]")

    parser.add_option("-b", "--benchmark", dest="benchmark",
                      action="store_true",
                      help="benchmark time for read access "
                      "[default=%default].")

    parser.add_option("--benchmark-num-iterations",
                      dest="benchmark_num_iterations",
                      type="int",
                      help="number of iterations for benchmark "
                      "[default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify", dest="verify", type="string",
                      help="verify against other database [default=%default].")

    parser.add_option("--verify-iterations", dest="verify_num_iterations",
                      type="int",
                      help="number of iterations for verification "
                      "[default=%default].")

    # verify_fragment_size has a default and is used by the verify
    # branch below; expose it so that it can be set on the command line.
    parser.add_option("--verify-fragment-size", dest="verify_fragment_size",
                      type="int",
                      help="verify: fragment size [default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--compress-index", dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    # Forward argv so that an explicitly supplied argument list is
    # honoured instead of always falling back to sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    # Build a mapping of identifier -> list of synonyms from the
    # comma-separated "a=b" pairs.
    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    # Replace the translator name by a translator instance.
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand,
                                     start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" %
                             (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" % (
                options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" % (args[0] ) )

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # NOTE(review): "%i" truncates the float returned by timeit to
        # whole seconds; kept as is to preserve the output format.
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations, options.benchmark_fragment_size, t))

    elif options.verify:
        # Verification is run in both directions.
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        # Check the argument count before touching args[0]: the logging
        # below would otherwise fail with an IndexError when no
        # arguments are given, instead of showing the usage message.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
Пример #8
0
        alphabet = "fastq",
        encoding = "phred",
        default_value = None,
        )
    
    (options, args) = E.Start( parser )

    ninput, noutput = 0, 0
    
    if options.format == "fasta":
        iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value )

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase( options.build_index,
                                         iterator )
        else:
            while 1:
                try:
                    r = iterator.next()
                except StopIteration:
                    break
                t,s = r
                options.stdout.write( ">%s\n%s\n" % (t,s))

    elif options.output_format == "fastq":
        
        if not options.filename_sequences:
            raise "please supply a filename with sequences."

        iterator_sequence = FastaIterator.FastaIterator( open( options.filename_sequences, "r" ) )
Пример #9
0
def main(argv=None):
    """script main.

    parses command line options in *argv*, falling back to sys.argv
    when *argv* is not given.

    Quality scores are read from stdin through a ``FromFastaIterator``.
    With ``--output-format=fasta`` the records are either written to
    stdout or, if ``--build-index`` is given, stored in an indexed
    database.  With ``--output-format=fastq`` each quality record is
    paired with the next record of ``--filename-sequences`` and written
    as a fastq entry.

    Raises ValueError if fastq output is requested without a sequence
    file, and AssertionError if the identifiers of a quality record and
    its sequence record differ.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("fasta", ),
                      help="input format [%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("fasta", "fastq"),
        help=
        "output format - if fastq is chosen, also supply a sequence file [%default]."
    )

    parser.add_option("-a",
                      "--alphabet",
                      dest="alphabet",
                      type="choice",
                      choices=("fastq", "solexa", "printable"),
                      help="characters to use for quality scores [%default].")

    parser.add_option("-e",
                      "--encoding",
                      dest="encoding",
                      type="choice",
                      choices=("phred", "solexa"),
                      help="encoding of quality scores [%default].")

    parser.add_option(
        "-i",
        "--build-index",
        dest="build_index",
        type="string",
        help="build an index. Supply the database name [%default].")

    parser.add_option(
        "-s",
        "--filename-sequences",
        dest="filename_sequences",
        type="string",
        help=
        "input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default]."
    )

    parser.add_option(
        "-d",
        "--set-to-default",
        dest="default_value",
        type="int",
        help=
        "set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default]."
    )

    parser.set_defaults(
        format="fasta",
        output_format="fasta",
        build_index=None,
        filename_sequences=None,
        alphabet="fastq",
        encoding="phred",
        default_value=None,
    )

    # Forward argv so that an explicitly supplied argument list is
    # honoured instead of always falling back to sys.argv.
    (options, args) = E.start(parser, argv=argv)

    # "fasta" is the only permitted choice for --format, so the
    # iterator is always defined after this point.
    if options.format == "fasta":
        iterator = FromFastaIterator(sys.stdin,
                                     alphabet=options.alphabet,
                                     default=options.default_value)

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase(options.build_index, iterator)
        else:
            for t, s in iterator:
                options.stdout.write(">%s\n%s\n" % (t, s))

    elif options.output_format == "fastq":

        if not options.filename_sequences:
            raise ValueError("please supply a filename with sequences")

        # The context manager closes the sequence file once all
        # records have been paired.
        with IOTools.open_file(options.filename_sequences, "r") as infile:
            iterator_sequence = FastaIterator.FastaIterator(infile)

            while True:
                # Advance both inputs in lock-step; None marks an
                # exhausted input.
                qual = next(iterator, None)
                seq = next(iterator_sequence, None)

                if qual is None or seq is None:
                    # Stop at the end of either input.  If only one of
                    # them ran out, report which file was too short.
                    if qual is not None:
                        options.stdlog.write("# sequence file incomplete\n")
                    elif seq is not None:
                        options.stdlog.write("# quality file incomplete\n")
                    break

                qt, qs = qual
                st, ss = seq.title, seq.sequence
                assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (
                    qt, st)
                options.stdout.write("@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
            (iterator.mNInput, iterator.mNOutput, iterator.mNOverFlow,
             iterator.mNUnderFlow))

    E.stop()