Пример #1
0
def main(argv=None):
    """Annotate gene models read from stdin with the selected counters.

    Parses the command line, opens the optional genome, quality, BAM and
    bigwig inputs, instantiates one or more counter objects per requested
    ``--counter`` and then writes one tab-separated row per gene or
    transcript (see ``--reporter``) to ``options.stdout``.

    Arguments
    ---------
    argv : list, optional
        Command line arguments. ``sys.argv`` is used when this is empty
        or ``None``.

    Raises
    ------
    ValueError
        If the number of ``--column-prefix`` values differs from the
        number of counters, or if a counter is requested without the
        input it requires (genome sequence or quality scores).
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-q",
                      "--quality-file",
                      dest="quality_file",
                      type="string",
                      help="filename with genomic base quality "
                      "information [default=%default].")

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_files",
                      type="string",
                      metavar="bam",
                      help="filename with read mapping information. "
                      "Multiple files can be submitted in a "
                      "comma-separated list [default=%default].")

    parser.add_option("-i",
                      "--bigwig-file",
                      dest="bigwig_file",
                      type="string",
                      metavar="bigwig",
                      help="filename with bigwig information "
                      "[default=%default].")

    parser.add_option("-f",
                      "--gff-file",
                      dest="filename_gff",
                      type="string",
                      action="append",
                      metavar='bed',
                      help="filename with extra gff files. The order "
                      "is important [default=%default].")

    parser.add_option("--filename-format",
                      dest="filename_format",
                      type="choice",
                      choices=("bed", "gff", "gtf"),
                      help="format of secondary stream [default=%default].")

    parser.add_option("--restrict-source",
                      dest="gff_sources",
                      type="string",
                      action="append",
                      help="restrict input to this 'source' in extra "
                      "gff file (for counter: overlap) [default=%default].")

    parser.add_option("--restrict-feature",
                      dest="gff_features",
                      type="string",
                      action="append",
                      help="restrict input to this 'feature' in extra gff "
                      "file (for counter: overlap) [default=%default].")

    parser.add_option("-r",
                      "--reporter",
                      dest="reporter",
                      type="choice",
                      choices=("genes", "transcripts"),
                      help="report results for 'genes' or 'transcripts' "
                      "[default=%default].")

    parser.add_option("-s",
                      "--section",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("exons", "introns"),
                      help="select range on which counters will operate "
                      "[default=%default].")

    parser.add_option(
        "-c",
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input "
        "[default=%default].")

    parser.add_option("--add-gtf-source",
                      dest="add_gtf_source",
                      action="store_true",
                      help="add gtf field of source to output "
                      "[default=%default].")

    parser.add_option("--proximal-distance",
                      dest="proximal_distance",
                      type="int",
                      help="distance to be considered proximal to "
                      "an interval [default=%default].")

    parser.add_option("--multi-mapping-method",
                      dest="multi_mapping",
                      type="choice",
                      choices=('all', 'ignore', 'weight'),
                      help="how to treat multi-mapping reads in "
                      "bam-files. Requires "
                      "the NH flag to be set by the mapper "
                      "[default=%default].")

    parser.add_option("--use-barcodes",
                      dest="use_barcodes",
                      action="store_true",
                      help="Use barcodes to count unique umi's. "
                      "UMI's are specified in the read identifier "
                      "as the last field, where fields are separated "
                      "by underscores, e.g. "
                      "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                      "When true, unique counts are returned. "
                      "Currently only compatible with count-reads")

    # The adjacent string literals previously lacked separators and
    # rendered as "anygiven ... countedCurrently" in the help output.
    parser.add_option("--sample-probability",
                      dest="sample_probability",
                      type="float",
                      help="Specify the probability of whether any "
                      "given read or read pair in a file bam is counted. "
                      "Currently only compatible with count-reads")

    parser.add_option("--column-prefix",
                      dest="prefixes",
                      type="string",
                      action="append",
                      help="add prefix to column headers - prefixes "
                      "are used in the same order as the counters "
                      "[default=%default].")

    parser.add_option("--library-type",
                      dest="library_type",
                      type="choice",
                      choices=("unstranded", "firststrand", "secondstrand",
                               "fr-unstranded", "fr-firststrand",
                               "fr-secondstrand"),
                      help="library type of reads in bam file. "
                      "[default=%default]")

    parser.add_option("--min-mapping-quality",
                      dest="minimum_mapping_quality",
                      type="float",
                      help="minimum mapping quality. Reads with a quality "
                      "score of less will be ignored. "
                      "[default=%default]")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    (options, args) = E.Start(parser, add_output_options=True, argv=argv)

    # Prefixes are matched to counters by position, so the two lists must
    # have the same length whenever any prefix is supplied.
    if options.prefixes:
        if len(options.prefixes) != len(options.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.quality_file:
        quality = IndexedFasta.IndexedFasta(options.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if options.bam_files:
        bam_files = [pysam.AlignmentFile(bamfile, "rb")
                     for bamfile in options.bam_files.split(",")]
    else:
        bam_files = None

    if options.bigwig_file:
        bigwig_file = pyBigWig.open(options.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    # A single ``None`` entry keeps the nested loops below running exactly
    # once when no section/source/feature restriction was given.
    if not options.sections:
        E.info("counters will use the default section (exons)")
        options.sections.append(None)

    if not options.gff_sources:
        options.gff_sources.append(None)
    if not options.gff_features:
        options.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(options.counters):
        if options.prefixes:
            prefix = options.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in options.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(section=section,
                                                      options=options,
                                                      prefix=prefix))
        elif c == "length":
            for section in options.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(section=section,
                                                     options=options,
                                                     prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            # This counter is fed the quality-score database, so that is
            # the input that must be present (not the genome sequence).
            if quality is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality, prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=options.filename_gff,
                    options=options,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(bam_files,
                                                      options=options,
                                                      prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=options.filename_gff,
                    options=options,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=options.multi_mapping,
                    minimum_mapping_quality=options.minimum_mapping_quality,
                    options=options,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=options.multi_mapping,
                    use_barcodes=options.use_barcodes,
                    sample_probability=options.sample_probability,
                    minimum_mapping_quality=options.minimum_mapping_quality,
                    options=options,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=options.multi_mapping,
                    sample_probability=options.sample_probability,
                    minimum_mapping_quality=options.minimum_mapping_quality,
                    options=options,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=options.multi_mapping,
                    sample_probability=options.sample_probability,
                    library_type=options.library_type,
                    minimum_mapping_quality=options.minimum_mapping_quality,
                    options=options,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=options.multi_mapping,
                    sample_probability=options.sample_probability,
                    minimum_mapping_quality=options.minimum_mapping_quality,
                    options=options,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(bigwig_file,
                                                      options=options,
                                                      prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=options.filename_gff,
                    feature=None,
                    source=None,
                    options=options,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in options.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=options,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in options.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(fasta=fasta,
                                                            section=section,
                                                            options=options,
                                                            prefix=prefix))

        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            # All of these counters share one constructor signature and
            # are instantiated per section/source/feature combination.
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            elif c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in options.sections:
                for source in options.gff_sources:
                    for feature in options.gff_features:
                        counters.append(
                            template(filename_gff=options.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=options,
                                     prefix=prefix))

        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(filename_gff=options.filename_gff,
                                             fasta=fasta,
                                             options=options,
                                             prefix=prefix))

        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=options.filename_gff,
                    fasta=fasta,
                    options=options,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=options.filename_gff,
                    fasta=fasta,
                    options=options,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=options.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=options,
                    prefix=prefix))
        # NOTE: a further "binding-pattern" branch used to follow here; it
        # could never run because that counter is handled by the shared
        # template branch above, so it has been dropped.

    if options.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif options.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if options.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    options.stdout.write("\t".join(header + [x.getHeader()
                                             for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(options.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        # A row is dropped only when every counter flags it as skipped.
        if all(x.skip for x in counters):
            cc.skipped += 1
            continue

        options.stdout.write("\t".join(
            fheader(gffs) + ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.Stop()
Пример #2
0
def main(argv=None):
    """Build, query, benchmark or verify an indexed fasta database.

    Depending on the options, the script runs in exactly one mode:

    * ``--extract``: write one region of the database given as the first
      positional argument to ``options.stdout``.
    * ``--benchmark``: time random fragment retrieval with ``timeit``.
    * ``--verify``: compare the database against a second one, in both
      directions, and report the error counts.
    * ``--compress-index``: compress the index of an existing database.
    * otherwise: create the database named by the first positional
      argument from the fasta files given as remaining arguments.

    Arguments
    ---------
    argv : list, optional
        Command line arguments. ``sys.argv`` is used when ``None``.

    Raises
    ------
    ValueError
        If the translator name is not one of the known translators.
    SystemExit
        With status 1, after printing the module documentation, when
        database creation is requested with fewer than two positional
        arguments.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option("-b",
                     "--benchmark",
                     dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations",
                     type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size",
                     type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify",
                     dest="verify",
                     type="string",
                     help="verify against other database [default=%default].")

    group.add_option("--verify-iterations",
                     dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a",
                      "--clean-sequence",
                      dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier",
                      dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t",
                      "--translator",
                      dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c",
                     "--compression",
                     dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability on the "
                     "system "
                     "[default=%%default]." % ", ".join(compression_choices))

    group.add_option("--random-access-points",
                     dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")

    group.add_option(
        "--compress-index",
        dest="compress_index",
        action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")

    parser.add_option_group(group)

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    # Hand the resolved argument list to the parser so that callers who
    # invoke main(argv) explicitly are honoured.
    (options, args) = E.Start(parser, argv=argv)

    # Parse "a=b,c=d" into {a: [b], c: [d]}; repeated keys accumulate.
    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    # Replace the translator name by the corresponding translator object.
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from __main__ import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # NOTE(review): the elapsed time is written with %i, so fractional
        # seconds are truncated - confirm this is intended.
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))

    elif options.verify:
        # Verification is run in both directions and reported separately.
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1,
                                       fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        # Validate the positional arguments before anything indexes into
        # them; the log messages below read args[0] and args[1:].
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
Пример #3
0
def main(argv=None):
    """Report alignment regions that overlap low-quality genomic bases.

    The gene-to-genome matches are read from ``options.filename_map``
    via ``Blat.iterator`` and keyed by query id.  Quality scores are
    looked up in the indexed file ``options.quality_file``.

    Each line on ``options.stdin`` holds three tab-separated fields:
    ``cluster_id``, ``gene_id`` and ``alignment`` (a header line starting
    with ``cluster_id`` is skipped).  For every aligned, non-gap column
    whose quality score is below ``options.quality_threshold`` the
    column is marked; if ``options.frame`` is greater than 1 the whole
    frame-sized window containing the column is marked instead.  The
    marked columns are grouped with ``Iterators.group_by_distance`` and
    written to ``options.stdout`` as ``cluster_id``, ``start``, ``end``.

    With ``--random`` the quality scores of the aligned positions are
    shuffled among themselves before masking.

    Arguments
    ---------
    argv :
        Accepted for interface compatibility; it is not forwarded to
        ``E.Start`` in this function.

    Raises
    ------
    ValueError
        If ``options.filename_map`` is not set.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    # NOTE(review): random_proportion is declared but never read below.
    parser.add_option(
        "--random-proportion",
        dest="random_proportion",
        type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random",
        dest="random",
        action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    # Fail with a clear message instead of the opaque TypeError that
    # open(None) would raise.
    if options.filename_map is None:
        raise ValueError(
            "no map file given: options.filename_map is not set")

    map_genes2genome = {}
    # The context manager closes the file even if parsing or the
    # duplicate check raises.
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, \
                "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        # Strip only the line terminator: slicing off the last character
        # would truncate the alignment when the final line of the input
        # has no trailing newline.
        cluster_id, gene_id, alignment = line.rstrip("\n").split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom, match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali,
                                          map_gene2genome, alignlib_lite.py_RR)

        # Collect, once per line, every non-gap column together with its
        # offset into quality_scores.  Columns whose offset is negative
        # are skipped.
        aligned = []
        for fp, c in enumerate(alignment):
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            aligned.append((fp, c, y))

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = [y for _, _, y in aligned]
            scores = [quality_scores[y] for y in positions]
            random.shuffle(scores)
            for y, q in zip(positions, scores):
                quality_scores[y] = q

        to_mask = []
        # index of the last column; used to mirror positions back into
        # the orientation of the input row for negative-strand matches
        last_column = len(alignment) - 1
        for fp, c, y in aligned:
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = last_column - fp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i"
                    % (cluster_id, p, c, match.mSbjctId, match.strand,
                       y + match.mSbjctFrom, quality_scores[y]))
                if options.frame > 1:
                    # mask the complete frame-sized window around p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()