Python IndexedFasta.createDatabase примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: IndexedFasta

Метод/Функция: createDatabase

Примеров на hotexamples.com: 9

Python IndexedFasta.createDatabase — найдено 9 примеров. Это лучшие примеры кода на Python для CGAT.IndexedFasta.createDatabase, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

IndexedFasta(30)

getConverter(8)

createDatabase(5)

TranslatorBytes(3)

MultipleFastaIterator(2)

parseCoordinates(2)

verify(2)

TranslatorPhred(1)

TranslatorRange200(1)

TranslatorSolexa(1)

getContigSizes(1)

getSequence(1)

Пример #1

Показать файл

Файл: snp2counts_test.py Проект: logust79/cgat-apps

    def setUp(self):
        """Build a synthetic multi-exon transcript on chr1 and index it.

        The fixture creates ``mNExons`` GTF entries of ``mExonSize`` bases,
        one every ``mIncrement`` bases starting at ``mOffset``.  The
        matching sequence consists of the repeated codon ``"123"`` split by
        introns of the form ``GT`` + ``T...`` + ``AG``, flanked by
        ``mOffset`` ``C`` bases upstream and ``mOffset`` ``G`` bases
        downstream.  The sequence is written to an indexed fasta database
        that is opened as ``self.mFasta``.

        ``mSplitCodonsPrev`` / ``mSplitCodonsNext`` record, for exons whose
        frame is not 0, the positions of codons that straddle an intron.
        """
        self.mExons = []
        self.mSplitCodonsNext = {}
        self.mSplitCodonsPrev = {}

        self.mSpliceSize = 4
        self.mExonSize = 100
        self.mIntronSize = 900
        self.mIncrement = self.mIntronSize + self.mExonSize
        self.mNExons = 9
        self.mOffset = 1000
        self.strand = "+"
        self.frame = 0

        # Coding sequence before any intron is spliced in.
        bases = list("123" * int((self.mNExons * self.mExonSize) / 3))

        # One intron: donor "GT", poly-T filler, acceptor "AG".
        intron = list("GT" + "T" * (self.mIntronSize - 4) + "AG")

        coding_length = 0
        for index in range(self.mNExons):
            exon_start = self.mOffset + index * self.mIncrement

            e = GTF.Entry()
            e.contig = "chr1"
            e.strand = "+"
            e.gene_id = "gene1"
            e.transcript_id = "trans1"
            e.start = exon_start
            e.end = exon_start + self.mExonSize
            # Frame is derived from the coding bases preceding this exon.
            e.frame = (3 - (coding_length % 3)) % 3
            coding_length += e.end - e.start
            self.mExons.append(e)

            if e.frame != 0:
                previous_exon_start = exon_start - self.mIntronSize
                for y in range(e.frame):
                    self.mSplitCodonsPrev[exon_start + y] = previous_exon_start
                for y in range(3 - e.frame):
                    self.mSplitCodonsNext[previous_exon_start - y - 1] = \
                        exon_start

            # Splice an intron after every exon except the last one.
            n_done = index + 1
            if n_done < self.mNExons:
                p = n_done * self.mExonSize + self.mIntronSize * (n_done - 1)
                bases[p:p] = intron

        # Add the flanking regions and collapse into a single string.
        seq = "C" * self.mOffset + "".join(bases) + "G" * self.mOffset
        self.mSequence = seq
        self.contigSize = len(seq)

        # Only the unique file name is needed; the temporary file itself is
        # closed again before the database is created under that name.
        with tempfile.NamedTemporaryFile() as handle:
            database_name = handle.name

        IndexedFasta.createDatabase(database_name, iter([("chr1", seq)]))
        self.mFasta = IndexedFasta.IndexedFasta(database_name)

Пример #2

Показать файл

Файл: gtf2fasta_test.py Проект: Charlie-George/cgat

    def setUp(self):
        """Create a scratch directory holding a one-contig indexed genome.

        Sets the paths used by the tests (``outfile_genome``,
        ``outfile_gtf``, ``outfile_output``), builds an indexed fasta
        database with a single contig ``chr1`` made of ``self.length``
        ``A`` bases, and initialises ``self.reference`` to a list of
        ``self.length`` lower-case ``"g"`` entries.
        """
        workdir = tempfile.mkdtemp()
        self.tmpdir = workdir

        self.outfile_genome = os.path.join(workdir, "genome_in")
        self.outfile_gtf = os.path.join(workdir, "exons.gtf")
        self.outfile_output = os.path.join(workdir, "output")

        self.length = 1000

        contigs = [("chr1", "A" * self.length)]
        IndexedFasta.createDatabase(self.outfile_genome, iter(contigs))

        self.reference = ["g"] * self.length

Пример #3

Показать файл

Файл: gtf2fasta_test.py Проект: yangjl/cgat

    def setUp(self):
        """Prepare a temporary directory and a one-contig indexed genome.

        The genome database is written to ``self.outfile_genome`` and
        contains one contig, ``chr1``, consisting of ``self.length`` ``A``
        bases.  ``self.reference`` starts out as ``self.length`` copies of
        the lower-case string ``"g"``.
        """
        self.length = 1000
        self.tmpdir = tempfile.mkdtemp()

        def in_tmpdir(basename):
            # All fixture files live inside the scratch directory.
            return os.path.join(self.tmpdir, basename)

        self.outfile_genome = in_tmpdir("genome_in")
        self.outfile_gtf = in_tmpdir("exons.gtf")
        self.outfile_output = in_tmpdir("output")

        sequence = "A" * self.length
        IndexedFasta.createDatabase(self.outfile_genome,
                                    iter((("chr1", sequence), )))

        self.reference = ["g"] * self.length

Пример #4

Показать файл

def main(argv=None):
    """Command line entry point: build or query an indexed fasta database.

    After option parsing exactly one action is performed, chosen in this
    order of precedence:

    * ``--extract``: write one region of the database ``args[0]`` to
      ``options.stdout`` in fasta format.
    * ``--benchmark``: time random fragment retrieval with :mod:`timeit`.
    * ``--verify``: compare ``args[0]`` with a second database in both
      directions and report the error counts.
    * ``--compress-index``: call ``compressIndex()`` on the database.
    * otherwise: create the database ``args[0]`` from the input files
      ``args[1:]``.  At least two positional arguments are required; if
      fewer are given the module docstring is printed and the process
      exits with status 1.

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): ``argv`` is not forwarded to ``E.Start`` here --
        confirm that ``E.Start`` reads ``sys.argv`` on its own.
    :raises ValueError: if the translator name is not recognised.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help=
        "extract region for testing purposes. Format is contig:strand:from:to. "
        "The default coordinates are 0-based open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.")

    parser.add_option("-c",
                      "--compression",
                      dest="compression",
                      type="choice",
                      choices=("lzo", "zlib", "gzip", "dictzip", "bzip2",
                               "debug"),
                      help="compress database [default=%default].")

    parser.add_option(
        "--random-access-points",
        dest="random_access_points",
        type="int",
        help=
        "save random access points every # number of nucleotides [default=%default]."
    )

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("one-forward-open", "zero-both-open"),
                      help="coordinate format of input [default=%default].")

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help=
        "list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]"
    )

    parser.add_option(
        "-b",
        "--benchmark",
        dest="benchmark",
        action="store_true",
        help="benchmark time for read access [default=%default].")

    parser.add_option(
        "--benchmark-num-iterations",
        dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark [default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify",
                      dest="verify",
                      type="string",
                      help="verify against other database [default=%default].")

    parser.add_option(
        "--file-format",
        dest="file_format",
        type="choice",
        choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"),
        help=
        "file format of input. Supply if data comes from stdin [default=%default]."
    )

    parser.add_option(
        "-a",
        "--clean-sequence",
        dest="clean_sequence",
        action="store_true",
        help=
        "remove X/x from DNA sequences - they cause errors in exonerate [default=%default]."
    )

    parser.add_option(
        "--allow-duplicates",
        dest="allow_duplicates",
        action="store_true",
        help=
        "allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default]."
    )

    parser.add_option(
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help=
        "regular expression for extracting the identifier from fasta description line [default=%default]."
    )

    parser.add_option("--compress-index",
                      dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help="force overwriting of existing files [default=%default].")

    parser.add_option(
        "-t",
        "--translator",
        dest="translator",
        type="choice",
        choices=("solexa", "phred", "bytes", "range200"),
        help="translate numerical quality scores [default=%default].")

    # verify_num_iterations and verify_fragment_size have no command line
    # option in this script; they are only available through these defaults.
    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    # Parse "a=b,a=c" into {"a": ["b", "c"]}.
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, alias = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(alias.strip())
    else:
        synonyms = None

    # Replace the translator name by a translator instance.
    if options.translator:
        if options.translator == "phred":
            options.translator = TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = TranslatorBytes()
        elif options.translator == "range200":
            options.translator = TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))
    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup=
            """from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )"""
            % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # The elapsed time is written with %i, i.e. truncated to an integer.
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))
    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # NOTE(review): this first call uses the bare name ``verify`` while
        # the second uses ``IndexedFasta.verify`` -- confirm both resolve to
        # the same function.
        nerrors1 = verify(fasta1,
                          fasta2,
                          options.verify_num_iterations,
                          options.verify_fragment_size,
                          stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        # Check the argument count before args[0] is used below; otherwise
        # a call without positional arguments fails with an IndexError
        # instead of showing the usage message.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()

Пример #5

Показать файл

Файл: quality2fasta.py Проект: BioinformaticsArchive/cgat

def main( argv = None ):
    """Script main: convert quality scores read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Quality records are read from ``sys.stdin`` through
    ``FromFastaIterator``.  Depending on ``--output-format``:

    * ``fasta``: records are written to ``options.stdout`` as fasta, or,
      if ``--build-index`` is given, stored in an indexed database of
      that name.
    * ``fastq``: each quality record is paired with the next record from
      ``--filename-sequences`` and written as a fastq entry.  Output
      stops at the end of the shorter input; a note is written to
      ``options.stdlog`` if the other input still has records.

    :raises ValueError: if fastq output is requested without
        ``--filename-sequences``.
    :raises AssertionError: if a quality record and its sequence record
        carry different identifiers.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("fasta", ),
                      help="input format [%default]."  )

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("fasta", "fastq" ),
                      help="output format - if fastq is chosen, also supply a sequence file [%default]."  )

    parser.add_option("-a", "--alphabet", dest="alphabet", type="choice",
                      choices=("fastq", "solexa", "printable" ),
                      help="characters to use for quality scores [%default]."  )

    parser.add_option("-e", "--encoding", dest="encoding", type="choice",
                      choices=("phred", "solexa" ),
                      help="encoding of quality scores [%default]."  )

    parser.add_option("-i", "--build-index", dest="build_index", type="string",
                      help="build an index. Supply the database name [%default]."  )

    parser.add_option("-s", "--filename-sequences", dest="filename_sequences", type="string",
                      help="input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default]."  )

    parser.add_option( "-d", "--set-to-default", dest="default_value", type="int",
                       help="set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default]." )

    parser.set_defaults(
        format = "fasta",
        output_format = "fasta",
        build_index = None,
        filename_sequences = None,
        alphabet = "fastq",
        encoding = "phred",
        default_value = None,
        )

    (options, args) = E.Start( parser )

    # "fasta" is the only permitted choice for --format, so the iterator
    # is always defined after this point.
    if options.format == "fasta":
        iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value )

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase( options.build_index,
                                         iterator )
        else:
            while 1:
                try:
                    r = iterator.next()
                except StopIteration:
                    break
                t,s = r
                options.stdout.write( ">%s\n%s\n" % (t,s))

    elif options.output_format == "fastq":

        if not options.filename_sequences:
            # String exceptions are not valid; raise a proper exception.
            raise ValueError( "please supply a filename with sequences." )

        infile = open( options.filename_sequences, "r" )
        try:
            iterator_sequence = FastaIterator.FastaIterator( infile )

            while 1:
                # Advance both inputs independently so that it is known
                # which of the two ran out first.
                try:
                    qual = iterator.next()
                except StopIteration:
                    qual = None
                try:
                    seq = iterator_sequence.next()
                except StopIteration:
                    seq = None

                if qual is None or seq is None:
                    if qual is not None:
                        options.stdlog.write( "# sequence file incomplete\n" )
                    elif seq is not None:
                        options.stdlog.write( "# quality file incomplete\n" )
                    # At least one input is exhausted: stop pairing.
                    break

                qt, qs = qual
                st, ss = seq.title, seq.sequence
                assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (qt, st)
                options.stdout.write( "@%s\n%s\n+\n%s\n" % (qt, ss, qs))
        finally:
            infile.close()

    if options.loglevel >= 1:
        options.stdlog.write( "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
                              (iterator.mNInput,
                               iterator.mNOutput,
                               iterator.mNOverFlow,
                               iterator.mNUnderFlow ))

    E.Stop()

Пример #6

Показать файл

Файл: index_fasta.py Проект: zpeng1989/cgat

def main(argv=None):
    """Command line entry point: build or query an indexed fasta database.

    After option parsing exactly one action is performed, chosen in this
    order of precedence:

    * ``--extract``: write one region of the database ``args[0]`` to
      ``options.stdout`` in fasta format.
    * ``--benchmark``: time random fragment retrieval with :mod:`timeit`.
    * ``--verify``: compare ``args[0]`` with a second database in both
      directions and report the error counts.
    * ``--compress-index``: call ``compressIndex()`` on the database.
    * otherwise: create the database ``args[0]`` from the input files
      ``args[1:]``.  At least two positional arguments are required; if
      fewer are given the module docstring is printed and the process
      exits with status 1.

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): ``argv`` is not forwarded to ``E.Start`` here --
        confirm that ``E.Start`` reads ``sys.argv`` on its own.
    :raises ValueError: if the translator name is not recognised.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option("-b",
                     "--benchmark",
                     dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations",
                     type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size",
                     type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify",
                     dest="verify",
                     type="string",
                     help="verify against other database [default=%default].")

    group.add_option("--verify-iterations",
                     dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a",
                      "--clean-sequence",
                      dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier",
                      dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t",
                      "--translator",
                      dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c",
                     "--compression",
                     dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability on the "
                     "system "
                     "[default=%%default]." % ", ".join(compression_choices))

    group.add_option("--random-access-points",
                     dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")

    group.add_option(
        "--compress-index",
        dest="compress_index",
        action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")

    parser.add_option_group(group)

    # verify_fragment_size has no command line option in this script; it
    # is only available through this default.
    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    # Parse "a=b,a=c" into {"a": ["b", "c"]}.
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, alias = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(alias.strip())
    else:
        synonyms = None

    # Replace the translator name by a translator instance.
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from __main__ import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # The elapsed time is written with %i, i.e. truncated to an integer.
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # Verification is run in both directions.
        nerrors1 = IndexedFasta.verify(fasta1,
                                       fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        # Check the argument count before args[0] is used below; otherwise
        # a call without positional arguments fails with an IndexError
        # instead of showing the usage message.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()

Пример #7

Показать файл

Файл: index_fasta.py Проект: Charlie-George/cgat

def main(argv=None):
    """Build, query or check an indexed fasta database from the command line.

    The first positional argument is always the database name.  Exactly
    one action is run, chosen in this order of precedence:

    * ``--extract``: write one region of the database to stdout in
      fasta format.
    * ``--benchmark``: time ``IndexedFasta.benchmarkRandomFragment``
      with ``timeit`` and write a small table to stdout.
    * ``--verify``: compare the database against a second one in both
      directions and write the two error counts to stdout.
    * ``--compress-index``: call ``compressIndex()`` on the database.
    * otherwise: create the database from the input files given as the
      remaining positional arguments.

    If the database name is missing (or, when creating a database, no
    input file is given) the module docstring is printed and the script
    exits with status 1.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` here, so
        it currently has no effect -- confirm whether ``E.Start`` should
        receive it.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--extract", dest="extract", type="string",
                      help="extract region for testing purposes. Format is "
                      "contig:strand:from:to. "
                      "The default coordinates are 0-based "
                      "open/closed coordinates on both strands. "
                      "For example, chr1:+:10:12 will return "
                      "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    parser.add_option("-c", "--compression", dest="compression", type="choice",
                      choices=compression_choices,
                      help="compress database, using specied compression. "
                      "Valid choices are %s. "
                      "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option("--random-access-points", dest="random_access_points",
                      type="int",
                      help="save random access points every # number "
                      "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option("-s", "--synonyms", dest="synonyms", type="string",
                      help="list of synonyms, comma separated with =, "
                      "for example, chr1=chr1b [default=%default]")

    parser.add_option("-b", "--benchmark", dest="benchmark",
                      action="store_true",
                      help="benchmark time for read access "
                      "[default=%default].")

    parser.add_option("--benchmark-num-iterations",
                      dest="benchmark_num_iterations",
                      type="int",
                      help="number of iterations for benchmark "
                      "[default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify", dest="verify", type="string",
                      help="verify against other database [default=%default].")

    parser.add_option("--verify-iterations", dest="verify_num_iterations",
                      type="int",
                      help="number of iterations for verification "
                      "[default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--compress-index", dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    # verify_fragment_size has no command line switch of its own; it is
    # only configurable through this default.
    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.Start(parser)

    # Every action below reads the database name from args[0]; without
    # it show the usage text instead of failing with an IndexError.
    if not args:
        print(globals()["__doc__"])
        sys.exit(1)

    # Turn "a=b,a=c,d=e" into {"a": ["b", "c"], "d": ["e"]}.
    synonyms = None
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            name, alias = pair.split("=")
            synonyms.setdefault(name.strip(), []).append(alias.strip())

    # Replace the translator name by an instance of the matching class.
    if options.translator:
        translators = {
            "phred": IndexedFasta.TranslatorPhred,
            "solexa": IndexedFasta.TranslatorSolexa,
            "bytes": IndexedFasta.TranslatorBytes,
            "range200": IndexedFasta.TranslatorRange200,
        }
        if options.translator not in translators:
            raise ValueError("unknown translator %s" % options.translator)
        options.translator = translators[options.translator]()

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand,
                                     start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" %
                             (options.extract, sequence))

    elif options.benchmark:
        import timeit
        # timeit runs the statement in its own namespace, so the setup
        # code has to import IndexedFasta and open the database itself.
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" % (
                options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # NOTE(review): "%i" truncates the float returned by timeit to
        # whole seconds; kept to preserve the existing output format.
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size,
            t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        # Check in both directions and report each error count.
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Creating a database needs at least one input file after the
        # database name.  Check this before anything is logged.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()

Пример #8

Показать файл

Файл: quality2fasta.py Проект: siping/cgat

        alphabet = "fastq",
        encoding = "phred",
        default_value = None,
        )
    
    (options, args) = E.Start( parser )

    ninput, noutput = 0, 0
    
    if options.format == "fasta":
        iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value )

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase( options.build_index,
                                         iterator )
        else:
            while 1:
                try:
                    r = iterator.next()
                except StopIteration:
                    break
                t,s = r
                options.stdout.write( ">%s\n%s\n" % (t,s))

    elif options.output_format == "fastq":
        
        if not options.filename_sequences:
            raise "please supply a filename with sequences."

        iterator_sequence = FastaIterator.FastaIterator( open( options.filename_sequences, "r" ) )

Пример #9

Показать файл

def main(argv=None):
    """Convert a fasta-formatted quality file read from stdin.

    With ``--output-format=fasta`` (the default) the converted records
    are either written to stdout as fasta or, if ``--build-index`` is
    given, passed to ``IndexedFasta.createDatabase``.

    With ``--output-format=fastq`` the quality records are paired, in
    order, with the records of the fasta file given by
    ``--filename-sequences`` and written to stdout as fastq.  Reading
    stops as soon as either input is exhausted; if only one of them
    ended, a note is written to the log.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.start`` here, so
        it currently has no effect -- confirm whether ``E.start`` should
        receive it.
    :raises ValueError: if fastq output is requested without
        ``--filename-sequences``.
    :raises AssertionError: if a quality record and its sequence record
        carry different identifiers.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("fasta", ),
                      help="input format [%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("fasta", "fastq"),
        help=
        "output format - if fastq is chosen, also supply a sequence file [%default]."
    )

    parser.add_option("-a",
                      "--alphabet",
                      dest="alphabet",
                      type="choice",
                      choices=("fastq", "solexa", "printable"),
                      help="characters to use for quality scores [%default].")

    parser.add_option("-e",
                      "--encoding",
                      dest="encoding",
                      type="choice",
                      choices=("phred", "solexa"),
                      help="encoding of quality scores [%default].")

    parser.add_option(
        "-i",
        "--build-index",
        dest="build_index",
        type="string",
        help="build an index. Supply the database name [%default].")

    parser.add_option(
        "-s",
        "--filename-sequences",
        dest="filename_sequences",
        type="string",
        help=
        "input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default]."
    )

    parser.add_option(
        "-d",
        "--set-to-default",
        dest="default_value",
        type="int",
        help=
        "set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default]."
    )

    parser.set_defaults(
        format="fasta",
        output_format="fasta",
        build_index=None,
        filename_sequences=None,
        alphabet="fastq",
        encoding="phred",
        default_value=None,
    )

    (options, args) = E.start(parser)

    # "fasta" is the only choice --format accepts, so the iterator is
    # always bound after this point.
    if options.format == "fasta":
        iterator = FromFastaIterator(sys.stdin,
                                     alphabet=options.alphabet,
                                     default=options.default_value)

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase(options.build_index, iterator)
        else:
            while True:
                try:
                    title, quality = next(iterator)
                except StopIteration:
                    break
                options.stdout.write(">%s\n%s\n" % (title, quality))

    elif options.output_format == "fastq":

        if not options.filename_sequences:
            raise ValueError("please supply a filename with sequences")

        iterator_sequence = FastaIterator.FastaIterator(
            IOTools.open_file(options.filename_sequences, "r"))

        while True:
            # Always advance both inputs so that a short quality file is
            # noticed as well as a short sequence file.
            qual = next(iterator, None)
            seq = next(iterator_sequence, None)

            if qual is None or seq is None:
                if qual is not None:
                    options.stdlog.write("# sequence file incomplete\n")
                elif seq is not None:
                    options.stdlog.write("# quality file incomplete\n")
                # At least one input is exhausted: stop instead of
                # trying to unpack a missing record.
                break

            qt, qs = qual
            st, ss = seq.title, seq.sequence
            assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (
                qt, st)
            options.stdout.write("@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
            (iterator.mNInput, iterator.mNOutput, iterator.mNOverFlow,
             iterator.mNUnderFlow))

    E.stop()