Python IndexedFasta.getConverter примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: IndexedFasta

Метод/Функция: getConverter

Примеров на hotexamples.com: 15

Python IndexedFasta.getConverter - 15 примеров найдено. Это лучшие примеры Python кода для CGAT.IndexedFasta.getConverter, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

IndexedFasta(30)

getConverter(8)

createDatabase(5)

TranslatorBytes(3)

MultipleFastaIterator(2)

parseCoordinates(2)

verify(2)

TranslatorPhred(1)

TranslatorRange200(1)

TranslatorSolexa(1)

getContigSizes(1)

getSequence(1)

Пример #1

Показать файл

Файл: bed2fasta.py Проект: CGATOxford/cgat

def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()

Пример #2

Показать файл

def main(argv=None):

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help=
        "extract region for testing purposes. Format is contig:strand:from:to. "
        "The default coordinates are 0-based open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.")

    parser.add_option("-c",
                      "--compression",
                      dest="compression",
                      type="choice",
                      choices=("lzo", "zlib", "gzip", "dictzip", "bzip2",
                               "debug"),
                      help="compress database [default=%default].")

    parser.add_option(
        "--random-access-points",
        dest="random_access_points",
        type="int",
        help=
        "save random access points every # number of nucleotides [default=%default]."
    )

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("one-forward-open", "zero-both-open"),
                      help="coordinate format of input [default=%default].")

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help=
        "list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]"
    )

    parser.add_option(
        "-b",
        "--benchmark",
        dest="benchmark",
        action="store_true",
        help="benchmark time for read access [default=%default].")

    parser.add_option(
        "--benchmark-num-iterations",
        dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark [default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify",
                      dest="verify",
                      type="string",
                      help="verify against other database [default=%default].")

    parser.add_option(
        "--file-format",
        dest="file_format",
        type="choice",
        choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"),
        help=
        "file format of input. Supply if data comes from stdin [default=%default]."
    )

    parser.add_option(
        "-a",
        "--clean-sequence",
        dest="clean_sequence",
        action="store_true",
        help=
        "remove X/x from DNA sequences - they cause errors in exonerate [default=%default]."
    )

    parser.add_option(
        "--allow-duplicates",
        dest="allow_duplicates",
        action="store_true",
        help=
        "allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default]."
    )

    parser.add_option(
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help=
        "regular expression for extracting the identifier from fasta description line [default=%default]."
    )

    parser.add_option("--compress-index",
                      dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help="force overwriting of existing files [default=%default].")

    parser.add_option(
        "-t",
        "--translator",
        dest="translator",
        type="choice",
        choices=("solexa", "phred", "bytes", "range200"),
        help="translate numerical quality scores [default=%default].")

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms: synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = TranslatorBytes()
        elif options.translator == "range200":
            options.translator = TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write( ">%s\n%s\n" % \
                              ( options.extract, sequence ) )
    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup=
            """from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )"""
            % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))
    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = verify(fasta1,
                          fasta2,
                          options.verify_num_iterations,
                          options.verify_fragment_size,
                          stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %\
                                 (" \n# ".join( args[1:] ) ))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()
        if len(args) < 2:
            print globals()["__doc__"]
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()

Пример #3

Показать файл

Файл: extractseq.py Проект: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: extractseq.py 2861 2010-02-23 17:36:32Z andreas $"
    )

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option("-d",
                      "--identifier",
                      dest="identifier",
                      type="string",
                      help="identifier(s).")

    parser.add_option(
        "-o",
        "--output-coordinate-format",
        dest="output_coordinate_format",
        type="choice",
        choices=("full", "long"),
        help=
        """output format of coordinates. Output format is contig:strand:from:to in zero based
/forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field"""
    )

    parser.add_option("--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("list", "id-compressed"),
                      help="input format.")

    parser.add_option("-i",
                      "--input-coordinate-format",
                      dest="input_coordinate_format",
                      type="choice",
                      choices=("zero-both", "zero-forward"),
                      help="coordinate format.")

    parser.add_option(
        "-e",
        "--extend-region",
        dest="extend_region",
        type="int",
        help="regions are extended by this margin at either end.")

    parser.add_option(
        "-r",
        "--shorten-region",
        dest="shorten_region",
        type="int",
        help="regions are shortened by this margin at either end.")

    parser.set_defaults(
        genome_file=None,
        identifier=None,
        input_coordinate_format="zero-both",
        output_coordinate_format="full",
        input_format="list",
        extend_region=0,
        shorten_region=0,
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    lines = []
    if options.identifier:
        lines += map(lambda x: x.split(":"), options.identifier.split(","))

    if args:
        lines += map(lambda x: x.split(":"), args)

    if len(lines) == 0:
        lines = map(lambda x: x[:-1].split("\t"),
                    (filter(lambda x: x[0] != "#", options.stdin.readlines())))

    ninput, nskipped, noutput = 0, 0, 0
    for data in lines:

        if options.input_format == "list":
            if len(data) < 4:
                sbjct_token = data[0]
                sbjct_from, sbjct_to = "0", "0"
                sbjct_strand = "+"
            else:
                sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[:4]
            id = None
        elif options.input_format == "id-compressed":
            id = data[0]
            sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[1].split(
                ":")

        ninput += 1

        try:
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
        except ValueError:
            E.warn("skipping line %s" % data)
            nskipped += 1
            continue

        sbjct_from -= (options.extend_region - options.shorten_region)
        sbjct_from = max(0, sbjct_from)
        lcontig = fasta.getLength(sbjct_token)
        if sbjct_to != 0:
            sbjct_to += (options.extend_region - options.shorten_region)
            sbjct_to = min(sbjct_to, lcontig)
        else:
            sbjct_to = lcontig

        if sbjct_to - sbjct_from <= 0:
            nskipped += 1
            continue

        sequence = fasta.getSequence(sbjct_token,
                                     sbjct_strand,
                                     sbjct_from,
                                     sbjct_to,
                                     converter=IndexedFasta.getConverter(
                                         options.input_coordinate_format))

        if options.output_coordinate_format == "full":
            coordinates = "%s:%s:%i:%i" % (sbjct_token, sbjct_strand,
                                           sbjct_from, sbjct_to)

        elif options.output_coordinate_format == "long":
            coordinates = "%s:%s:%i:%i:%i" % (sbjct_token, sbjct_strand,
                                              sbjct_from, sbjct_to, lcontig)

        if id:
            options.stdout.write(">%s %s\n%s\n" % (id, coordinates, sequence))
        else:
            options.stdout.write(">%s\n%s\n" % (coordinates, sequence))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()

Пример #4

Показать файл

Файл: PipelineDeNovoMotifs_python3.py Проект: sudlab/pipeline_denovo_motifs

def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None,
                               stranded=False):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and save to
    *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with
    the shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''
    cc = dbhandle.cursor()

    orderby = ""
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order != "random":
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand, peakcenter 
                       FROM %(tablename)s 
                       ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        P.touch(filename)
        return

    data = truncateList(data, track, proportion, min_sequences, num_sequences,
                        order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline retrieves from the DB the bed regions and they will
    # always be in the positive strand but if this were to change. The regions retrieved from
    # the negative strand will be counted from the end of the chromosome and not the beginning without this.
    # This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    c = E.Counter()
    outs = IOTools.open_file(filename, "w")
    for masker in masker:
        if masker not in ("unmasked", "none", None):
            ids, sequences = zip(*[(x.title, x.sequence) for x in sequences])
            sequences = maskSequences(sequences, masker)
            sequences = (FastaRecord(id, seq)
                         for id, seq in zip(ids, sequences))

    with IOTools.open_file(filename, "w") as outs:

        for sequence in sequences:
            c.input += 1
            if len(sequence.sequence) == 0:
                c.empty += 1
                continue
            if len(sequence.sequence) < 0:
                c.too_short += 1
                continue

            outs.write(">%s\n%s\n" % (sequence.title, sequence.sequence))
            c.output += 1
        outs.close()

    E.info("%s" % c)

    return c.output

Пример #5

Показать файл

Файл: index_fasta.py Проект: Charlie-George/cgat

def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--extract", dest="extract", type="string",
                      help="extract region for testing purposes. Format is "
                      "contig:strand:from:to. "
                      "The default coordinates are 0-based "
                      "open/closed coordinates on both strands. "
                      "For example, chr1:+:10:12 will return "
                      "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    parser.add_option("-c", "--compression", dest="compression", type="choice",
                      choices=compression_choices,
                      help="compress database, using specied compression. "
                      "Valid choices are %s. "
                      "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option("--random-access-points", dest="random_access_points",
                      type="int",
                      help="save random access points every # number "
                      "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option("-s", "--synonyms", dest="synonyms", type="string",
                      help="list of synonyms, comma separated with =, "
                      "for example, chr1=chr1b [default=%default]")

    parser.add_option("-b", "--benchmark", dest="benchmark",
                      action="store_true",
                      help="benchmark time for read access "
                      "[default=%default].")

    parser.add_option("--benchmark-num-iterations",
                      dest="benchmark_num_iterations",
                      type="int",
                      help="number of iterations for benchmark "
                      "[default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify", dest="verify", type="string",
                      help="verify against other database [default=%default].")

    parser.add_option("--verify-iterations", dest="verify_num_iterations",
                      type="int",
                      help="number of iterations for verification "
                      "[default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--compress-index", dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.Start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand,
                                     start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" %
                             (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" % (
                options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" % (args[0] ) )

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations, options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()
        if len(args) < 2:
            print globals()["__doc__"]
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()

Пример #6

Показать файл

Файл: index_fasta.py Проект: zpeng1989/cgat

def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option("-b",
                     "--benchmark",
                     dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations",
                     type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size",
                     type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify",
                     dest="verify",
                     type="string",
                     help="verify against other database [default=%default].")

    group.add_option("--verify-iterations",
                     dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a",
                      "--clean-sequence",
                      dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier",
                      dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t",
                      "--translator",
                      dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c",
                     "--compression",
                     dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability on the "
                     "system "
                     "[default=%%default]." % ", ".join(compression_choices))

    group.add_option("--random-access-points",
                     dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")

    group.add_option(
        "--compress-index",
        dest="compress_index",
        action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")

    parser.add_option_group(group)

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from __main__ import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1,
                                       fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()
        if len(args) < 2:
            print globals()["__doc__"]
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()

Пример #7

Показать файл

Файл: extractseq.py Проект: SCV/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: extractseq.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option("-d", "--identifier", dest="identifier", type="string",
                      help="identifier(s).")

    parser.add_option("-o", "--output-coordinate-format", dest="output_coordinate_format", type="choice",
                      choices=("full", "long"),
                      help="""output format of coordinates. Output format is contig:strand:from:to in zero based
/forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field"""  )

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("list", "id-compressed"),
                      help="input format.")

    parser.add_option("-i", "--input-coordinate-format", dest="input_coordinate_format", type="choice",
                      choices=("zero-both", "zero-forward"),
                      help="coordinate format.")

    parser.add_option("-e", "--extend-region", dest="extend_region", type="int",
                      help="regions are extended by this margin at either end.")

    parser.add_option("-r", "--shorten-region", dest="shorten_region", type="int",
                      help="regions are shortened by this margin at either end.")

    parser.set_defaults(
        genome_file=None,
        identifier=None,
        input_coordinate_format="zero-both",
        output_coordinate_format="full",
        input_format="list",
        extend_region=0,
        shorten_region=0,
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    lines = []
    if options.identifier:
        lines += map(lambda x: x.split(":"), options.identifier.split(","))

    if args:
        lines += map(lambda x: x.split(":"), args)

    if len(lines) == 0:
        lines = map(lambda x: x[
                    :-1].split("\t"), (filter(lambda x: x[0] != "#", options.stdin.readlines())))

    ninput, nskipped, noutput = 0, 0, 0
    for data in lines:

        if options.input_format == "list":
            if len(data) < 4:
                sbjct_token = data[0]
                sbjct_from, sbjct_to = "0", "0"
                sbjct_strand = "+"
            else:
                sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[:4]
            id = None
        elif options.input_format == "id-compressed":
            id = data[0]
            sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[
                1].split(":")

        ninput += 1

        try:
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
        except ValueError:
            E.warn("skipping line %s" % data)
            nskipped += 1
            continue

        sbjct_from -= (options.extend_region - options.shorten_region)
        sbjct_from = max(0, sbjct_from)
        lcontig = fasta.getLength(sbjct_token)
        if sbjct_to != 0:
            sbjct_to += (options.extend_region - options.shorten_region)
            sbjct_to = min(sbjct_to, lcontig)
        else:
            sbjct_to = lcontig

        if sbjct_to - sbjct_from <= 0:
            nskipped += 1
            continue

        sequence = fasta.getSequence(sbjct_token, sbjct_strand,
                                     sbjct_from, sbjct_to,
                                     converter=IndexedFasta.getConverter(options.input_coordinate_format))

        if options.output_coordinate_format == "full":
            coordinates = "%s:%s:%i:%i" % (sbjct_token,
                                           sbjct_strand,
                                           sbjct_from,
                                           sbjct_to)

        elif options.output_coordinate_format == "long":
            coordinates = "%s:%s:%i:%i:%i" % (sbjct_token,
                                              sbjct_strand,
                                              sbjct_from,
                                              sbjct_to,
                                              lcontig)

        if id:
            options.stdout.write(">%s %s\n%s\n" % (id, coordinates, sequence))
        else:
            options.stdout.write(">%s\n%s\n" % (coordinates, sequence))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()

Пример #8

Показать файл

Файл: gtf2exons.py Проект: siping/cgat

        extract_id = None )

    (options, args) = E.Start( parser )
    
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile( options.extract_id )
    else:
        extract_id = None

    converter = IndexedFasta.getConverter( options.coordinate_format )

    exons = Exons.ReadExonBoundaries( sys.stdin,
                                      contig_sizes = contig_sizes,
                                      converter = converter,
                                      do_invert = True,
                                      format = "gtf",
                                      gtf_extract_id = extract_id )

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":

Пример #9

Показать файл

Файл: bed2fasta.py Проект: Charlie-George/cgat

def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or 5only are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand", action="store_false",
                      help="use strand information and return reverse complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()

Пример #10

Показать файл

Файл: gff2predictions.py Проект: radaniba/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"],
    )

    parser.add_option("-t", "--trans", dest="trans", help="input is translated DNA.", action="store_true")

    parser.add_option(
        "-f", "--format", dest="format", help="input format.", type="choice", choices=("exons", "psl", "gff")
    )

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        help="output format",
        type="choice",
        choices=("exontable", "exons", "predictions", "cds", "fasta"),
    )

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)."
    )

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help="filename with predictions. Use gene structures from this file if available.",
    )

    parser.add_option(
        "-i",
        "--gff-field-id",
        dest="gff_field_id",
        type="string",
        help="field for the feature id in the gff info section.",
    )

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.",
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.",
    )

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.",
    )

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help="specify input format for input coordinates [forward|both-zero|one-closed|open].",
    )

    parser.set_defaults(
        trans=False,
        output_format="predictions",
        format="psl",
        gff_field_id="id",
        input_coordinates="both-zero-open",
        filename_peptides=None,
        genome_file=None,
        do_realignment=True,
        predictions_file=None,
        remove_unaligned=False,
    )

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n"
                % (parser.GetNumInput(), parser.GetNumOutput(), parser.GetNumErrors())
            )

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n"
                        % (
                            entry.mPredictionId,
                            entry.mQueryToken,
                            entry.mSbjctToken,
                            entry.mSbjctStrand,
                            ninput,
                            len(results),
                        )
                    )
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome),
                    )

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" % (entry.mPredictionId, entry.mSbjctToken)
                        )
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n"
                            % (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand)
                        )
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString), entry.mQueryFrom, 0, genomic_sequence
                )

                entry.score = entry.mMapPeptide2Translation.getColTo() - entry.mMapPeptide2Translation.getColFrom() + 1

                (
                    entry.mNIntrons,
                    entry.mNFrameShifts,
                    entry.mNGaps,
                    entry.mNSplits,
                    entry.mNStopCodons,
                    entry.mNDisruptions,
                ) = Genomics.CountGeneFeatures(0, entry.mMapPeptide2Genome, genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)
                                    )
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId,
                                        reference,
                                        entry.mSbjctToken,
                                        genomic_sequence,
                                        "--subopt FALSE --score '%s'" % str(80),
                                    )
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n" % (entry.mPredictionId)
                                            )
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                % (
                                                    entry.mPredictionId,
                                                    entry.mSbjctToken,
                                                    entry.mSbjctStrand,
                                                    entry.mSbjctGenomeFrom,
                                                    entry.mSbjctGenomeTo,
                                                    reference,
                                                    entry.mTranslation,
                                                    translation,
                                                )
                                            )
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (entry.mPredictionId, entry.mSbjctToken)
                                    )
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference, translation)
                                        )
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")

Пример #11

Показать файл

Файл: bed2fasta.py Проект: AndreasHegerGenomics/cgat-apps

def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m",
                      "--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode",
                      dest="output_mode",
                      type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length",
                      dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length",
                      dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at",
        dest="extend_at",
        type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand",
        dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [
                fasta.getSequence(bed.contig, strand, start, end)
                for start, end in bed.toIntervals()
            ]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals"
              or options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()

Пример #12

Показать файл

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed)." )

    parser.add_option("--coordinate-format", dest="coordinate_format", type="string",
                      help="input type of coordinates." )

    parser.add_option("--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help="output forward coordinates." )

    parser.add_option("-e", "--extract-id", dest="extract_id", type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""" )

    parser.set_defaults(
        coordinate_format = "zero-forward",
        forward_coordinates = False,
        genome_file = None,
        extract_id = None )

    (options, args) = E.Start( parser )
    
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile( options.extract_id )
    else:
        extract_id = None

    converter = IndexedFasta.getConverter( options.coordinate_format )

    exons = Exons.ReadExonBoundaries( sys.stdin,
                                      contig_sizes = contig_sizes,
                                      converter = converter,
                                      do_invert = True,
                                      format = "gtf",
                                      gtf_extract_id = extract_id )

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write( "# Error: %s\n" % str(e) )
                break

            options.stdout.write( str(e) + "\n" )
            nexons += 1
                
        if has_error:
            nerrors += 1
            continue
    
    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" % (ntranscripts, nexons, nerrors))
    
    E.Stop()

Пример #13

Показать файл

Файл: gff2predictions.py Проект: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--trans",
                      dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help=
        "filename with predictions. Use gene structures from this file if available."
    )

    parser.add_option("-i",
                      "--gff-field-id",
                      dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help=
        "Filename with peptide sequences. If given, it is used to check the predicted translated sequences."
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help=
        "specify input format for input coordinates [forward|both-zero|one-closed|open]."
    )

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (
                options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand, ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo(
                ) - entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions ) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(
                            entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId,
                                           entry.mSbjctGenomeFrom,
                                           entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId, reference,
                                        entry.mSbjctToken, genomic_sequence,
                                        "--subopt FALSE --score '%s'" %
                                        str(80))
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(
                                            reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n"
                                                % (entry.mPredictionId))
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                %
                                                (entry.mPredictionId,
                                                 entry.mSbjctToken,
                                                 entry.mSbjctStrand,
                                                 entry.mSbjctGenomeFrom,
                                                 entry.mSbjctGenomeTo,
                                                 reference, entry.mTranslation,
                                                 translation))
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference,
                                               translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")

Пример #14

Показать файл

def main(argv=None):
    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-m",
                      "--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o",
                      "--mode",
                      dest="mode",
                      type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at",
        dest="extend_at",
        type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help=
        "extend at no, 3', 5' or both ends. If 3only or 5only are set, only the added sequence is returned [default=%default]"
    )

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand",
        dest="ignore_strand",
        action="store_false",
        help=
        "use strand information and return reverse complement [default=%default]"
    )

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()

Пример #15

Показать файл

Файл: gff2predictions.py Проект: siping/cgat

    
    fasta = IndexedFasta.IndexedFasta( options.genome_file ) 
    contig_sizes = fasta.getContigSizes()
    
    ninput, noutput, nskipped = 0,0,0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0
    
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( IOTools.openFile( options.filename_peptides, "r"))
        predictor = PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None
        
    converter = IndexedFasta.getConverter( options.input_coordinates )

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions( IOTools.openFile( options.predictions_file, "r") )
        for p in parser:
            predictions[p.mPredictionId] = p
        
    if options.output_format == "predictions":
        
        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()