示例#1
0
def main(argv=None):
    """Script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads matches from ``options.stdin`` and writes one tab-separated row
    per match to ``options.stdout``.  Each row starts with either the
    full match (``--with-match``, the default) or only its query id
    (``--without-match``), followed by the columns of every property
    counter selected with ``--method``.

    Methods "counts", "query-counts", "sbjct-counts" and "baseml" work on
    the aligned query/sbjct sequences and therefore switch the input
    parser to ``Blat.iterator_pslx``; "match" works on the match object
    itself.  Matches whose query and sbjct sequences differ in length are
    reported on ``options.stdlog`` and skipped without emitting a row.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase",
        dest="mask_lowercase",
        action="store_true",
        help=
        "mask lowercase characters before computing properties [default=%default]"
    )

    parser.add_option("--with-match",
                      dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match",
        dest="with_match",
        action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # Hand the caller-supplied argument vector to the option parser;
    # without it the *argv* parameter would be silently ignored.
    (options, args) = E.Start(parser, argv)

    # counters: called with the (query, sbjct) sequence strings.
    # counters_plain: called with the match object itself.
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # Sequence-based counters need the sequence-carrying input variant.
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([header] +
                  ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # The patterns are loop-invariant, so compile them once up front.
    rx_non_alpha = re.compile("[^a-zA-Z]")
    rx_lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the row in memory and write it only once it is complete.
        # Writing the leading column first and then skipping the match
        # would leave a partial, unterminated line in the output.
        # The leading column is rendered before the sequences are masked.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [rx_non_alpha.sub("N", x) for x in qseq]
            sseq = [rx_non_alpha.sub("N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [rx_lowercase.sub("N", x) for x in qseq]
                sseq = [rx_lowercase.sub("N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n"
                        % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.extend([str(counter) for counter in counters])

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            fields.extend([str(counter) for counter in counters_plain])

        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
示例#2
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--read-length-mean", dest="read_length_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--read-length-std", dest="read_length_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-mean", dest="coverage_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-std", dest="coverage_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-mean", dest="ds_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-std", dest="ds_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-mean", dest="error_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-std", dest="error_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--min-read-length", dest="min_read_length", type="int",
                      help="minimum read length [default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="randomly sample from selected transcripts [default=%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test with # first entries [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("genes", "transcripts"),
                      help="use genes or transcripts [default=%default].")

    parser.set_defaults(
        genome_file=None,
        read_length_mean=200.0,
        read_length_stddev=20.0,
        coverage_mean=2.0,
        coverage_stddev=1.0,
        ds_mean=None,
        ds_stddev=None,
        error_mean=None,
        error_stddev=None,
        min_read_length=50,
        test=None,
        mode="transcripts",
        output_filename_pattern=None,
        output_format_id="%010i",
        sample_size=0,
    )

    (options, args) = E.Start(parser, argv)

    assert options.genome_file, "please supply an indexed genome."

    if options.output_filename_pattern:
        outfile_stats = open(options.output_filename_pattern % "stats", "w")
        outfile_stats.write(
            "id\tlen\tnreads\tlen_mean\tlen_std\tcov_mean\tcov_std\n")
        outfile_map = open(options.output_filename_pattern % "map", "w")
        outfile_map.write("id\ttranscript\n")
    else:
        outfile_stats = None
        outfile_map = None

    genome = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped = 0, 0, 0

    total_counts, total_read_lengths, total_len = [], [], 0
    total_pids = []
    total_error_pids = []

    if options.mode == "transcripts":
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.transcript_id
    elif options.mode == "genes":
        iterator = GTF.flat_gene_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.gene_id

    if options.sample_size:
        iterator = Iterators.sample(iterator)

    if options.ds_mean:
        do_mutate = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_mutate = False

    if options.error_mean:
        do_error = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_error = False

    for gtfs in iterator:

        id = getId(gtfs[0])

        try:
            sequence = GTF.toSequence(gtfs, genome)
        except KeyError, msg:
            if options.loglevel >= 2:
                options.stdlog.write("# skipping %s: %s\n" % (id, msg))
            nskipped += 1
            continue

        lsequence = len(sequence)

        if lsequence <= options.min_read_length * 2:
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# skipping %s - sequence is too short: %i\n" % (id, lsequence))
            nskipped += 1
            continue

        ninput += 1

        if do_mutate:
            new_sequence = getMutatedSequence(sequence, options.ds_mean)
            pid_calc.loadPair(sequence, new_sequence)
            pid = pid_calc.mPID
            total_pids.append(pid)
            sequence = new_sequence
        else:
            pid = 100.0

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing %s - len=%i\n" % (id, lsequence))
            options.stdlog.flush()

        total_len += lsequence
        lvsequence = lsequence * \
            random.gauss(options.coverage_mean, options.coverage_stddev)

        covered = 0
        counts = numpy.zeros(lsequence)
        nreads = 0

        error_pids, read_lengths = [], []

        while covered < lvsequence:

            read_length = int(
                random.gauss(options.read_length_mean, options.read_length_stddev))
            positive = random.randint(0, 1)

            if positive:
                start = random.randint(0, lsequence)
                end = min(lsequence, start + read_length)
            else:
                end = random.randint(0, lsequence)
                start = max(0, end - read_length)

            read_length = end - start
            if read_length < options.min_read_length:
                continue

            segment = sequence[start:end]

            if not positive:
                segment = Genomics.complement(segment)

            noutput += 1

            if do_error:
                new_segment = getMutatedSequence(segment, options.error_mean)
                pid_calc.loadPair(segment, new_segment)
                pid = pid_calc.mPID
                error_pids.append(pid)
                segment = new_segment
            else:
                pid = 100.0

            options.stdout.write(
                ">%s\n%s\n" % (options.output_format_id % noutput, segment))

            if outfile_map:
                outfile_map.write(
                    "%s\t%s\n" % (id, options.output_format_id % noutput))

            for x in range(start, end):
                counts[x] += 1

            nreads += 1

            covered += read_length
            read_lengths.append(read_length)

        if options.loglevel >= 2:
            options.stdout.write("# transcript %s: len=%i, nreads=%i, len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" % (id,
                                                                                                                                        lsequence,
                                                                                                                                        nreads,
                                                                                                                                        numpy.mean(
                                                                                                                                            read_lengths),
                                                                                                                                        numpy.std(
                                                                                                                                            read_lengths),
                                                                                                                                        numpy.mean(
                                                                                                                                            counts),
                                                                                                                                        numpy.std(counts)))
        if outfile_stats:
            outfile_stats.write("%s\t%i\t%i\t%.2f\t%.2f\t%.2f\t%.2f\n" % (id,
                                                                          lsequence,
                                                                          nreads,
                                                                          numpy.mean(
                                                                              read_lengths),
                                                                          numpy.std(
                                                                              read_lengths),
                                                                          numpy.mean(
                                                                              counts),
                                                                          numpy.std(counts)))

        total_counts += list(counts)
        total_read_lengths += read_lengths
        total_error_pids += error_pids

        if options.test and ninput >= options.test:
            break

        if options.sample_size and ninput >= options.sample_size:
            break