示例#1
0
文件: psl2psl.py 项目: Q-KIM/cgat
def pslSelectQuery(options):
    """Write a single alignment per query, selected by an extreme value.

    ``options.select`` is split on ``-`` into ``<value>-<field>``:
    *value* is ``most`` or ``least`` and *field* is ``nmatches`` or
    ``nmismatches``.  Each group of alignments yielded by
    ``Blat.iterator_per_query`` is sorted by the chosen field and the
    last (``most``) or first (``least``) entry is written to
    ``options.stdout``.

    If ``options.test`` is set, processing stops as soon as that many
    groups have been read.

    Raises:
        ValueError: if ``options.select`` does not describe a supported
            value/field combination.
    """

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    try:
        value, field = options.select.split("-")
    except ValueError:
        raise ValueError(
            "can not parse selection '%s': expected <most|least>-<field>" %
            options.select)

    sort_keys = {
        "nmatches": lambda x: x.mNMatches,
        "nmismatches": lambda x: x.mNMisMatches,
    }

    # Validate before touching the input: an unknown field used to fail
    # with an unbound local inside the loop, and an unknown value silently
    # wrote nothing while still being counted as output.
    if field not in sort_keys:
        raise ValueError("unknown field '%s' in selection '%s'" %
                         (field, options.select))
    if value not in ("most", "least"):
        raise ValueError("unknown value '%s' in selection '%s'" %
                         (value, options.select))

    f = sort_keys[field]
    # after an ascending sort the largest entry is last, the smallest first
    index = -1 if value == "most" else 0

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        data.sort(key=f)
        options.stdout.write("%s\n" % str(data[index]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
示例#2
0
def pslSelectQuery(options):
    """Write a single alignment per query, selected by an extreme value.

    ``options.select`` is split on ``-`` into ``<value>-<field>``:
    *value* is ``most`` or ``least`` and *field* is ``nmatches`` or
    ``nmismatches``.  Each group of alignments yielded by
    ``Blat.iterator_per_query`` is sorted by the chosen field and the
    last (``most``) or first (``least``) entry is written to
    ``options.stdout``.

    If ``options.test`` is set, processing stops as soon as that many
    groups have been read.

    Raises:
        ValueError: if ``options.select`` does not describe a supported
            value/field combination.
    """

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    try:
        value, field = options.select.split("-")
    except ValueError:
        raise ValueError(
            "can not parse selection '%s': expected <most|least>-<field>" %
            options.select)

    sort_keys = {
        "nmatches": lambda x: x.mNMatches,
        "nmismatches": lambda x: x.mNMisMatches,
    }

    # Validate before touching the input: an unknown field used to fail
    # with an unbound local inside the loop, and an unknown value silently
    # wrote nothing while still being counted as output.
    if field not in sort_keys:
        raise ValueError("unknown field '%s' in selection '%s'" %
                         (field, options.select))
    if value not in ("most", "least"):
        raise ValueError("unknown value '%s' in selection '%s'" %
                         (value, options.select))

    f = sort_keys[field]
    # after an ascending sort the largest entry is last, the smallest first
    index = -1 if value == "most" else 0

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        data.sort(key=f)
        options.stdout.write("%s\n" % str(data[index]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
示例#3
0
def pslAddSequence(query_fasta, sbjct_fasta, options):
    """Convert psl entries to pslx entries by attaching sequences.

    For every entry read from ``options.stdin`` the aligned query and
    target ranges are fetched (always on the ``+`` strand) from
    *query_fasta* and *sbjct_fasta* via their ``getSequence`` method, a
    ``Blat.MatchPSLX`` is built from the entry plus both sequences, and
    its string form is written to ``options.stdout``.

    If ``options.test`` is set, processing stops as soon as that many
    entries have been read.
    """

    # Read from the stream configured on options (like the other
    # functions that write to options.stdout) rather than from the
    # process-wide sys.stdin.
    iterator = Blat.BlatIterator(options.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    while 1:

        # The default covers both end-of-input conventions: an iterator
        # that returns None and one that raises StopIteration.
        match = next(iterator, None)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        new.fromPSL(match,
                    query_fasta.getSequence(
                        match.mQueryId, "+", match.mQueryFrom, match.mQueryTo),
                    sbjct_fasta.getSequence(
                        match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo))

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
示例#4
0
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    Interval collections are loaded with ``readIntervals`` from
    ``options.filename_filter_query`` and
    ``options.filename_filter_target`` when those are set.  Psl entries
    are read from ``options.stdin``.

    returns tuples of (match, list(query_intervals), list(target_intervals))

    An element of the tuple is ``None`` when the corresponding interval
    collection was not loaded (or is falsy), and an empty list when the
    lookup raised ``KeyError``.

    If ``options.test`` is set, iteration stops as soon as that many
    entries have been read.
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        # next() with a default works on Python 2 and 3 and prevents a
        # StopIteration from escaping into this generator, where newer
        # interpreters turn it into a RuntimeError.
        match = next(iterator, None)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(
                    intervals_query.get(match.mQueryId, match.mQueryFrom,
                                        match.mQueryTo))
            except KeyError:
                # lookup failed for this query id: report no overlaps
                qx = []

        if intervals_target:
            try:
                tx = list(
                    intervals_target.get(match.mSbjctId, match.mSbjctFrom,
                                         match.mSbjctTo))
            except KeyError:
                # lookup failed for this target id: report no overlaps
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
示例#5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl entries from ``options.stdin`` and writes one chain per
    entry to ``options.stdout``: a ``chain`` header line followed by one
    line per aligned block holding the block size and, for all but the
    last block, the gap to the next block on the target and on the query.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $", usage=globals()["__doc__"]
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    for psl in Blat.iterator(options.stdin):
        ninput += 1
        # for "-" strand entries the header coordinates are mirrored
        # against the query length
        if psl.strand == "-":
            qstart, qend = psl.mQueryLength - psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom
        else:
            qstart, qend = psl.mQueryFrom, psl.mQueryTo

        # the running entry number doubles as the chain id (last field)
        options.stdout.write(
            "chain %i %s %i %s %i %i %s %i %s %i %i %i\n"
            % (
                psl.mNMatches,
                psl.mSbjctId,
                psl.mSbjctLength,
                "+",
                psl.mSbjctFrom,
                psl.mSbjctTo,
                psl.mQueryId,
                psl.mQueryLength,
                psl.strand,
                qstart,
                qend,
                ninput,
            )
        )

        # tend/qend hold the end of the previous block; None marks the
        # first block, which has no preceding gap to report
        tend, qend = None, None
        for qstart, tstart, size in psl.getBlocks():
            if tend is not None:
                options.stdout.write("\t%i\t%i\n" % (tstart - tend, qstart - qend))
            qend, tend = qstart + size, tstart + size
            options.stdout.write("%i" % (size,))
        options.stdout.write("\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
示例#6
0
文件: maf2psl.py 项目: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The query and target sequence identifiers are taken from the
    ``--query``/``--target`` options or, if either is missing, from
    exactly two positional arguments.

    Raises:
        ValueError: if query and target are not both supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    psl = Blat.Match()
    for cc in threaditer(reader, (options.query, options.target)):

        ninput += 1
        query, target = cc

        # treat identifiers like Hsap.GL000223.1: the part before the
        # first dot is split off, the remainder is the contig name
        # ("except ... as" is valid on Python 2.6+ and Python 3; the
        # comma form is a syntax error on Python 3)
        try:
            data = query.src.split(".")
            qs, qcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))

        try:
            data = target.src.split(".")
            ts, tcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))
示例#7
0
文件: psl2psl.py 项目: zpeng1989/cgat
def pslComplement(query_fasta, target_fasta, options):
    """complement psl entries.

    Reads psl entries from ``options.stdin``.  Entries with at most one
    block are counted as skipped.  For the remaining entries every block
    is shrunk by ``options.complement_border`` on both sides; blocks
    shorter than ``options.complement_min_length`` on query or target
    after shrinking are ignored, and the sequences of the others are
    fetched from *query_fasta* and *target_fasta*.

    NOTE(review): the function looks unfinished.  The fetched sequences
    and ``pairs`` are never used and the entry that is written (``new``)
    is never built, so any entry with more than one block still fails
    with a NameError at the write below.  The intended output cannot be
    derived from this code and needs to be supplied by the owner.
    """

    # read from the stream configured on options rather than sys.stdin
    iterator = Blat.BlatIterator(options.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        # works on Python 2 and 3 and tolerates iterators that raise
        # StopIteration as well as ones that return None at end of input
        match = next(iterator, None)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        pairs = []
        for qstart, tstart, size in match.getBlocks():

            qend = qstart + size - border
            qstart += border

            if qend - qstart < min_length:
                continue

            tend = tstart + size - border
            tstart += border

            if tend - tstart < min_length:
                continue

            query_sequence = query_fasta.getSequence(match.mQueryId,
                                                     match.strand, qstart,
                                                     qend)
            # was ``sbjct_fasta``, a name that does not exist in this
            # function; the target sequences come in as *target_fasta*
            sbjct_sequence = target_fasta.getSequence(match.mSbjctId, "+",
                                                      tstart, tend)

        ndiscarded += 1

        # NOTE(review): ``new`` is undefined here - see docstring.
        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
示例#8
0
文件: psl2chain.py 项目: gsc0107/cgat
def main(argv=None):
    """Entry point: convert psl entries on stdin to chain records.

    Command line options are parsed from *argv*, falling back to
    ``sys.argv`` when *argv* is empty or not given.  Every psl entry
    becomes a ``chain`` header line followed by its blocks: each block
    size, and between consecutive blocks the target gap and query gap.
    """

    argv = argv or sys.argv

    # command line parser with the common options (-h/--help, ...)
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $",
        usage=globals()["__doc__"])
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0
    emit = options.stdout.write

    for psl in Blat.iterator(options.stdin):
        ninput += 1

        # header coordinates of reverse-strand entries are mirrored
        # against the query length
        if psl.strand != "-":
            header_qstart = psl.mQueryFrom
            header_qend = psl.mQueryTo
        else:
            header_qstart = psl.mQueryLength - psl.mQueryTo
            header_qend = psl.mQueryLength - psl.mQueryFrom

        header_fields = (
            psl.mNMatches,
            psl.mSbjctId,
            psl.mSbjctLength,
            "+",
            psl.mSbjctFrom,
            psl.mSbjctTo,
            psl.mQueryId,
            psl.mQueryLength,
            psl.strand,
            header_qstart,
            header_qend,
            ninput,
        )
        emit("chain %i %s %i %s %i %i %s %i %s %i %i %i\n" % header_fields)

        # ends of the previous block; None until the first block is seen
        last_tend = None
        last_qend = None
        for block_qstart, block_tstart, block_size in psl.getBlocks():
            if last_tend is not None:
                gaps = (block_tstart - last_tend, block_qstart - last_qend)
                emit("\t%i\t%i\n" % gaps)
            last_qend = block_qstart + block_size
            last_tend = block_tstart + block_size
            emit("%i" % (block_size,))
        emit("\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # footer and benchmark information
    E.Stop()
示例#9
0
def pslComplementQuery(options):
    """complement psl entries.

    Fill the regions from a second psl file.

    Reads psl entries from ``options.stdin``.  Entries with at most one
    block are counted as skipped.  For the remaining entries every block
    is shrunk by ``options.complement_border`` on both sides and blocks
    shorter than ``options.complement_min_length`` afterwards are
    ignored.

    NOTE(review): the function looks unfinished.  Nothing is collected
    from the blocks (``pairs`` stays empty) and the entry that is written
    (``new``) is never built, so any entry with more than one block still
    fails with a NameError at the write below.  The intended output
    cannot be derived from this code and needs to be supplied by the
    owner.
    """

    # The iterator was bound to ``Iterator`` but read as ``iterator``,
    # which raised a NameError on the first call to next().  It now also
    # reads from the stream configured on options rather than sys.stdin.
    iterator = Blat.BlatIterator(options.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        # tolerate iterators that raise StopIteration as well as ones
        # that return None at end of input
        match = next(iterator, None)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        pairs = []
        for qstart, tstart, size in match.getBlocks():

            qend = qstart + size - border
            qstart += border

            if qend - qstart < min_length:
                continue

            tend = tstart + size - border
            tstart += border

            if tend - tstart < min_length:
                continue

        ndiscarded += 1

        # NOTE(review): ``new`` is undefined here - see docstring.
        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
示例#10
0
def iterator_filter_overlapping_target(psls, options):
    """Yield only alignments that overlap no other alignment on the target.

    *psls* is grouped by ``Blat.iterator_target_overlap`` using
    ``options.threshold_merge_distance``.  A group with a single member
    is passed through; every member of a larger group is dropped.
    Counts are logged once the input is exhausted.
    """

    total = kept = dropped = 0

    groups = Blat.iterator_target_overlap(
        psls, options.threshold_merge_distance)

    for group in groups:
        members = len(group)
        total += members

        if members > 1:
            # overlapping alignments: discard the whole group
            dropped += members
            continue

        yield group[0]
        kept += 1

    E.info("iterator_filter_overlapping_target: ninput=%i, noutput=%i, ndiscarded=%i" % (total, kept, dropped))
示例#11
0
文件: psl2psl.py 项目: zpeng1989/cgat
def iterator_filter_overlapping_target(psls, options):
    """Remove alignments that overlap another alignment on the target.

    Groups of alignments are obtained from
    ``Blat.iterator_target_overlap(psls, options.threshold_merge_distance)``.
    Groups holding exactly one alignment are yielded, all alignments in
    bigger groups are discarded.  A summary is logged at the end.
    """

    counts = {"ninput": 0, "noutput": 0, "ndiscarded": 0}
    merge_distance = options.threshold_merge_distance

    for cluster in Blat.iterator_target_overlap(psls, merge_distance):
        cluster_size = len(cluster)
        counts["ninput"] += cluster_size

        if not cluster_size > 1:
            yield cluster[0]
            counts["noutput"] += 1
        else:
            counts["ndiscarded"] += cluster_size

    summary = (counts["ninput"], counts["noutput"], counts["ndiscarded"])
    E.info("iterator_filter_overlapping_target: ninput=%i, noutput=%i, "
           "ndiscarded=%i" % summary)
示例#12
0
def chunk_iterator_psl_overlap(infile, args, prefix, use_header=False):
    """iterate over overlapping entries in a psl file.

    Psl entries are read from *infile*, which must be sorted by target
    contig and start.  Consecutive entries on the same target whose start
    lies within ``args[0]`` (the merge distance) of the end of the
    current group are written to the same chunk file; any other entry
    starts a new chunk.

    Chunk files are created as ``<prefix>/<10-digit counter>.in``.  The
    name of each chunk is yielded after its file has been closed.

    NOTE(review): the original never opened an output file or set a file
    name (it failed on the first entry) and read ``sys.stdin`` instead of
    *infile*.  The file naming used here is an assumption - confirm it
    against the sibling chunk iterators.  *use_header* is accepted for
    interface compatibility but not used.

    Raises:
        ValueError: if the input is not sorted by (contig, start).
    """

    iterator = Blat.BlatIterator(infile)

    processed_contigs = set()

    merge_distance = args[0]
    last_sbjct_id = None
    sbjct_start, sbjct_end = 0, 0
    outfile = None
    filename = None
    nchunks = 0

    try:
        while 1:

            # A default keeps StopIteration from escaping into this
            # generator, where newer interpreters turn it into a
            # RuntimeError.
            match = next(iterator, None)

            if match is None:
                break

            if match.mSbjctId != last_sbjct_id or \
               match.mSbjctFrom >= (sbjct_end + merge_distance):
                # hand the finished chunk over before starting a new one
                if outfile is not None:
                    outfile.close()
                    yield filename

                if last_sbjct_id != match.mSbjctId and \
                   match.mSbjctId in processed_contigs:
                    raise ValueError("input not sorted correctly (contig,start): "
                                     "already encountered %s\n%s" %
                                     (match.mSbjctId, str(match)))

                last_sbjct_id = match.mSbjctId
                processed_contigs.add(last_sbjct_id)

                sbjct_start = match.mSbjctFrom
                sbjct_end = match.mSbjctTo

                nchunks += 1
                filename = "%s/%010i.in" % (prefix, nchunks)
                outfile = open(filename, "w")

            if match.mSbjctFrom < sbjct_start:
                raise ValueError("input not sorted correctly (contig,start): "
                                 "%i < %i\n%s" %
                                 (match.mSbjctFrom, sbjct_start, str(match)))

            sbjct_end = max(match.mSbjctTo, sbjct_end)
            outfile.write(str(match) + "\n")
    finally:
        # closes the last chunk and avoids leaking the handle on errors;
        # closing an already closed file is harmless
        if outfile is not None:
            outfile.close()

    if outfile is not None:
        yield filename
示例#13
0
def iterator_filter_overlapping_query(psls, options):
    '''remove alignments that overlap on query.

    Alignments are grouped by ``Blat.iterator_query_overlap`` using
    ``options.threshold_merge_distance``.  An alignment that is alone in
    its group is yielded; if several alignments overlap, ALL of them are
    discarded.

    Keeping the alignment with the highest number of matching
    nucleotides per connected component (base-level filtering via
    ``Blat.getComponents``, sorting by ``-mNMatches``) used to be
    sketched here in commented-out code.  It is disabled because it is
    very slow.
    '''

    # note: only takes the full ranges, but does not check for
    # individual overlap of blocks use connected components and
    # hasAlignmentOverlap
    ninput, noutput, ndiscarded = 0, 0, 0

    for block in Blat.iterator_query_overlap(
            psls,
            options.threshold_merge_distance):

        l = len(block)
        ninput += l
        if l > 1:
            # overlapping alignments: drop the whole group
            ndiscarded += l
        else:
            yield block[0]
            noutput += 1

    E.info("iterator_filter_overlapping_query: ninput=%i, "
           "noutput=%i, ndiscarded=%i" %
           (ninput, noutput, ndiscarded))
示例#14
0
文件: psl2psl.py 项目: Q-KIM/cgat
def iterator_filter_overlapping_query(psls, options):
    '''Yield only alignments that overlap no other alignment on the query.

    Groups come from ``Blat.iterator_query_overlap(psls,
    options.threshold_merge_distance)``.  Single-member groups are passed
    through, larger groups are discarded as a whole.  Only full alignment
    ranges are considered, not the overlap of individual blocks.  A
    summary is logged once the input is exhausted.
    '''

    total = kept = dropped = 0

    groups = Blat.iterator_query_overlap(
        psls, options.threshold_merge_distance)

    for group in groups:
        members = len(group)
        total += members

        if members > 1:
            # Base-level filtering (one representative per connected
            # component, chosen by number of matches) is disabled
            # because it is very slow; the whole group is dropped.
            dropped += members
            continue

        yield group[0]
        kept += 1

    E.info("iterator_filter_overlapping_query: ninput=%i, "
           "noutput=%i, ndiscarded=%i" %
           (total, kept, dropped))
示例#15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl entries from ``options.stdin`` and records, per query and
    per target sequence, which positions are covered by aligned blocks.
    Two tables (``# query`` and ``# target``) with the columns contig,
    covered, size and pcovered are written to ``options.stdout``, each
    followed by a ``total`` row when the summed size is positive.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # forward argv so that an explicitly supplied argument list is
    # honoured instead of being silently ignored
    (options, args) = E.Start(parser, argv=argv)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitset, contig, size, iterator):
        """mark all (start, end) ranges from *iterator* in the bitset of
        *contig*, creating a bitset of *size* on first use."""

        if contig not in bitset:
            bitset[contig] = bx.bitset.BinnedBitSet(size)
        b = bitset[contig]

        for start, end in iterator:
            # set_range takes a start and a length
            b.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):

        addRange(query_bitsets, psl.mQueryId, psl.mQueryLength,
                 psl.iterator_query_exons())

        addRange(target_bitsets, psl.mSbjctId, psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """write the coverage table for *bitsets* to *outfile*."""

        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):

            l = bitsets[chrom].size
            s = bitsets[chrom].count_range(0, l)
            # zero-sized contigs get no row (avoids division by zero)
            if l > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, s, l, 100.0 * s / l))
            total += s
            total_len += l

        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
示例#16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl (or pslx, if any sequence-based method is selected)
    entries from ``options.stdin`` and writes one tab-separated row per
    entry to ``options.stdout``: the entry itself (or only its query id
    with ``--without-match``) followed by the output of every selected
    method.  Entries whose query and target sequences differ in length
    are skipped with a warning and produce no output row.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase",
        dest="mask_lowercase",
        action="store_true",
        help=
        "mask lowercase characters before computing properties [default=%default]"
    )

    parser.add_option("--with-match",
                      dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match",
        dest="with_match",
        action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # forward argv so that an explicitly supplied argument list is
    # honoured instead of being silently ignored
    (options, args) = E.Start(parser, argv=argv)

    # counters_plain work on the match itself, counters on the pair of
    # aligned sequences (and therefore need pslx input)
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([
            header,
        ] + ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the row and write it in one go at the end.  Writing the
        # first column right away left a partial line without a newline
        # on stdout whenever the entry was skipped further down.
        # str(match) is taken here, before the sequences are masked.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n"
                        % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.extend([str(counter) for counter in counters])

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            fields.extend([str(counter) for counter in counters_plain])

        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
示例#17
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t",
        "--tablename",
        dest="tablename",
        type="string",
        help=
        "tablename to get variants from (in samtools pileup format) [default=%default]."
    )
    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help=
        "filename with transcript model information (gtf formatted file)  [default=%default]."
    )
    parser.add_option(
        "-r",
        "--filename-reference",
        dest="filename_reference",
        type="string",
        help=
        "filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default]."
    )
    parser.add_option(
        "--vcf-file",
        dest="filename_vcf",
        type="string",
        help=
        "filename with variants in VCF format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--pileup-file",
        dest="filename_pileup",
        type="string",
        help=
        "filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help=
        "sample id for species of interest in vcf formatted file [default=%default]."
    )
    parser.add_option(
        "-s",
        "--seleno-tsv-file",
        dest="filename_seleno",
        type="string",
        help=
        "filename of a list of transcript ids that are selenoproteins [default=%default]."
    )
    parser.add_option("-m",
                      "--module",
                      dest="modules",
                      type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k",
        "--with-knockouts",
        dest="with_knockouts",
        action="store_true",
        help=
        "add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_referenec=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = {}

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separatar
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:], allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
示例#18
0
def main(argv=None):
    """script main.

    Builds the command line parser, optionally opens the indexed query and
    target fasta files, writes an optional PSL header and then applies the
    requested methods in the order given on the command line.

    Methods fall into two groups:

    * stand-alone methods (``map``, ``filter-keep``, ``filter-remove``,
      ``merge``, ``add-sequence``, ``complement``, ``select-query``) are
      called with ``options`` and end the method loop;
    * iterator methods (``test``, ``rename-query``, ``sanitize``,
      ``filter-fasta``, ``remove-overlapping-query``,
      ``remove-overlapping-target``) wrap the current PSL iterator and can
      be chained.

    Whatever the final iterator yields is written to ``options.stdout``.

    NOTE(review): *argv* is normalised to ``sys.argv`` but is not forwarded
    to ``E.start`` in this function - confirm whether that is intended.

    Raises:
        ValueError: if ``add-sequence`` is requested without both the query
            and the target fasta file.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    # the fasta files are optional; only some methods need them
    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
       (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # Write the optional header. The previous guard
    # ("is not None or != 'none'") was always true, so only the two
    # values that actually produce output are tested here; None and
    # "none" write nothing.
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        # stand-alone methods: called with the options, then the
        # method loop ends
        if "map" == method:
            pslMap(options)
            break
        elif "filter-keep" == method:
            pslFilter(options, keep=True)
            break
        elif "filter-remove" == method:
            pslFilter(options, keep=False)
            break
        elif "merge" == method:
            pslMerge(options)
            break
        elif "add-sequence" == method:
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif "complement" == method:
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif "select-query" == method:
            pslSelectQuery(options)
            break
        # iterator methods: each one wraps the current iterator, so
        # several of them can be chained
        elif "test" == method:
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif "rename-query" == method:
            iterator = iterator_rename_query(iterator, options)
        elif "sanitize" == method:
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif "filter-fasta" == method:
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif "remove-overlapping-query" == method:
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif "remove-overlapping-target" == method:
            iterator = iterator_filter_overlapping_target(iterator, options)

    # output whatever the (possibly wrapped) iterator still yields
    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.stop()
示例#19
0
def main(argv=None):
    """script main.

    Reads PSL matches from ``options.stdin``, records the ranges yielded by
    ``iterator_query_exons()`` / ``iterator_sbjct_exons()`` in one bitset per
    query id and per target id, and writes two tab-separated tables
    (``# query`` and ``# target``) with the columns ``contig``, ``covered``,
    ``size`` and ``pcovered`` to ``options.stdout``.

    NOTE(review): *argv* is normalised to ``sys.argv`` but is not forwarded
    to ``E.Start`` in this function - confirm whether that is intended.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.set_defaults(
    )

    (options, args) = E.Start(parser)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitsets, key, size, iterator):
        """Set all (start, end) ranges from *iterator* in the bitset for *key*.

        A bitset of *size* is created the first time *key* is seen.
        """
        if key not in bitsets:
            bitsets[key] = bx.bitset.BinnedBitSet(size)
        b = bitsets[key]

        for start, end in iterator:
            # set_range is called with a start and a length
            b.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):

        addRange(query_bitsets,
                 psl.mQueryId,
                 psl.mQueryLength,
                 psl.iterator_query_exons())

        addRange(target_bitsets,
                 psl.mSbjctId,
                 psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """Write one coverage row per bitset plus a ``total`` row."""

        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):

            size = bitsets[chrom].size
            covered = bitsets[chrom].count_range(0, size)
            # zero-sized entries are not printed (the percentage would
            # divide by zero) but are still added to the totals
            if size > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, covered, size, 100.0 * covered / size))
            total += covered
            total_len += size

        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
示例#20
0
def main(argv=None):
    """script main.

    Reads PSL matches from ``options.stdin`` and, for every match, writes
    the aligned query and target sequences as two fasta records to
    ``options.stdout``.

    NOTE(review): *argv* is normalised to ``sys.argv`` but is not forwarded
    to ``E.Start`` in this function - confirm whether that is intended.

    Raises:
        ValueError: if the query or target fasta file is not supplied, or if
            a ``--method`` other than ``full`` is requested (only ``full``
            is wired up in this function).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file",
                      dest="filename_query",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file",
                      dest="filename_target",
                      type="string",
                      help="fasta filename with target.")

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("full", "pileup-query", "pileup-target", "gapless"),
        help="method to use for constructing the alignment [%default].")

    parser.add_option(
        "--forward-query",
        dest="forward_query",
        action="store_true",
        help=
        "reverse-complement sequences such that query is always on forward strand [%default]"
    )

    parser.add_option("--target-prefix",
                      dest="target_prefix",
                      type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix",
                      dest="query_prefix",
                      type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id",
                      dest="id",
                      type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    # Both sequence sources are needed for every match. Fail early with a
    # clear message instead of an UnboundLocalError inside the loop.
    if not options.filename_query or not options.filename_target:
        raise ValueError(
            "please supply both --query-psl-file and --target-psl-file.")

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)

    # only the "full" method has an implementation bound here
    if options.method == "full":
        getAlignment = getAlignmentFull
    else:
        raise ValueError(
            "method '%s' is not implemented" % options.method)

    # running number used to build the record identifiers
    nmatch = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        # shift the map by the smallest block start on each side, matching
        # the sequence segments fetched below
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))
        q = query.getSequence(match.mQueryId, match.strand, match.mQueryFrom,
                              match.mQueryTo)
        t = target.getSequence(match.mSbjctId, "+", match.mSbjctFrom,
                               match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(
            ">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
            (options.query_prefix, options.output_format_id % nmatch,
             match.mQueryId, match.mQueryFrom, match.mQueryTo, query_ali,
             options.target_prefix, options.output_format_id % nmatch,
             match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo,
             sbjct_ali))
        nmatch += 1

    E.Stop()
示例#21
0
文件: psl2fasta.py 项目: Q-KIM/cgat
def main(argv=None):
    """script main.

    Reads PSL matches from ``options.stdin`` and, for every match, writes
    the aligned query and target sequences as two fasta records to
    ``options.stdout``.

    NOTE(review): *argv* is normalised to ``sys.argv`` but is not forwarded
    to ``E.Start`` in this function - confirm whether that is intended.

    Raises:
        ValueError: if the query or target fasta file is not supplied, or if
            a ``--method`` other than ``full`` is requested (only ``full``
            is wired up in this function).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query", type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target", type="string",
                      help="fasta filename with target.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "full", "pileup-query", "pileup-target", "gapless"),
                      help="method to use for constructing the alignment [%default].")

    parser.add_option("--forward-query", dest="forward_query", action="store_true",
                      help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix", type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix", type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    # Both sequence sources are needed for every match. Fail early with a
    # clear message instead of an UnboundLocalError inside the loop.
    if not options.filename_query or not options.filename_target:
        raise ValueError(
            "please supply both --query-psl-file and --target-psl-file.")

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)

    # only the "full" method has an implementation bound here
    if options.method == "full":
        getAlignment = getAlignmentFull
    else:
        raise ValueError(
            "method '%s' is not implemented" % options.method)

    # running number used to build the record identifiers
    nmatch = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        # shift the map by the smallest block start on each side, matching
        # the sequence segments fetched below
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))
        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
                             (options.query_prefix,
                              options.output_format_id % nmatch,
                              match.mQueryId, match.mQueryFrom, match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              options.output_format_id % nmatch,
                              match.mSbjctId, match.strand,
                              match.mSbjctFrom, match.mSbjctTo,
                              sbjct_ali))
        nmatch += 1

    E.Stop()
示例#22
0
文件: maf2psl.py 项目: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads MAF blocks from ``options.stdin``, takes the (query, target)
    component pairs produced by ``threaditer`` and writes one PSL line per
    pair to ``options.stdout``.

    The query and target identifiers are taken from ``--query`` /
    ``--target``; if either is missing, exactly two positional arguments
    are required and used instead.

    Raises:
        ValueError: if query/target are not given as options and the
            positional arguments are not exactly two identifiers.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # counters for the summary line; nskipped is reported but never
    # incremented in this function
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    # a single Match object is reused for every pair
    psl = Blat.Match()
    for query, target in threaditer(reader, (options.query, options.target)):

        ninput += 1

        # treat identifiers like Hsap.GL000223.1: everything before the
        # first "." is the species, the remainder is the contig.
        # str.partition is equivalent to the former split/join and cannot
        # raise ValueError, so the old (unreachable) handlers are gone.
        qs, _, qcontig = query.src.partition(".")
        ts, _, tcontig = target.src.partition(".")

        assert qs == options.query
        assert ts == options.target
        psl.mQueryId = qcontig
        psl.mSbjctId = tcontig

        psl.fromPair(query.start, query.src_size, query.strand, query.text.upper(),
                     target.start, target.src_size, target.strand, target.text.upper())

        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (qs, qcontig, query.start, query.src_size, query.strand, query.text))
        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (ts, tcontig, target.start, target.src_size, target.strand, target.text))
        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
示例#23
0
文件: psl2map.py 项目: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl formatted matches from the file given as the single
    positional argument (gzip-compressed if ``--from-zipped`` is set or
    the name ends in ``.gz``) or from stdin. Consecutive matches with the
    same query id are collected into a chunk; each chunk is filtered by
    the configured thresholds and forbidden regions, the surviving
    matches are passed to ``selectMatches`` and the selection is written
    to ``options.stdout`` in ``map`` or ``psl`` format. Summary
    histograms, lists of matched queries and counters are written at the
    end.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries", type="string",
                      help="fasta filename with queries - required for polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--output-filename-empty", dest="output_filename_empty", type="string",
                      help="OUTPUT filename with queries for which all matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid", type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches", dest="threshold_min_matches", type="int",
                      help="minimum threshold for number of matching residues [%default].")

    parser.add_option("--threshold-max-error-rate", dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part [%default].")

    parser.add_option("--threshold-good-query-coverage", dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage", dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars", dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters  in query[%default].")

    parser.add_option("--threshold-max-query-gaps", dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps  in query[%default].")

    parser.add_option("--threshold-max-sbjct-gapchars", dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters  in sbjct[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches", action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best", action="store_true",
                      help="when sorting matches, keep all matches within the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct", action="store_true",
                      help="keep only the best entry per sbjct (for transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps", dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps  in sbjct[%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing[%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode", type="choice",
                      choices=("best-coverage", "best-query-coverage", "best-sbjct-coverage",
                               "best-pid", "best-covpid", "best-query-covpid", "best-sbjct-covpid",
                               "best-min-covpid", "best-query-min-covpid", "best-sbjct-min-covpid",
                               "unique", "all"),
                      help="determines how to selecte the best match [%default].")

    parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct", type="string",
                      help="gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default].")

    parser.add_option("--keep-forbidden", dest="keep_forbidden", action="store_true",
                      help="if set, keep only matches that overlap the regions supplied with --subjctfilter-tsv-file [%default].")

    parser.add_option("--query-forward-coordinates", dest="query_forward_coordinates", action="store_true",
                      help="use forward coordinates for query, strand will refer to sbjct [%default].")

    parser.add_option("--ignore-all-random", dest="ignore_all_random", action="store_true",
                      help="if there are multiple best matches, ignore all those to chrUn and _random [%default].")

    parser.add_option("--collection-threshold", dest="collection_threshold", type="float",
                      help="threshold for collecting matches, percent of best score [%default].")

    parser.add_option("--collection-distance", dest="collection_distance", type="float",
                      help="threshold for collecting matches, difference to best score [%default].")

    parser.set_defaults(input_filename_domains=None,
                        input_filename_queries=None,
                        threshold_good_query_coverage=90.0,
                        threshold_min_pid=30.0,
                        threshold_min_matches=0,
                        threshold_max_error_rate=None,
                        output_filename_pattern="%s",
                        keep_unique_matches=False,
                        output_format="map",
                        print_matched=["full", "partial", "good"],
                        from_zipped=False,
                        combine_overlaps=True,
                        min_length_domain=30,
                        threshold_min_query_coverage=50,
                        min_length_singletons=30,
                        new_family_id=10000000,
                        add_singletons=False,
                        matching_mode="best-coverage",
                        best_per_sbjct=False,
                        threshold_max_query_gapchars=None,
                        threshold_max_query_gaps=None,
                        threshold_max_sbjct_gapchars=None,
                        threshold_max_sbjct_gaps=None,
                        filename_filter_sbjct=None,
                        keep_forbidden=False,
                        keep_all_best=False,
                        test=None,
                        query_forward_coordinates=False,
                        output_filename_empty=None,
                        collection_threshold=1.0,
                        collection_distance=0,
                        polyA=False,
                        # max residues missing from non polyA end
                        polyA_max_unaligned=3,
                        # min residues in tail
                        polyA_min_unaligned=10,
                        # min percent residues that are A/T in tail
                        polyA_min_percent=70.0,
                        # ignore duplicate matches if they are on Un or
                        # _random
                        ignore_all_random=False,
                        )

    # pass argv on so that the documented "unless *argv* is given"
    # contract actually holds
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            # text mode: the psl parser receives str lines, as it does
            # for plain files and stdin
            infile = gzip.open(args[0], "rt")
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        # honour a redirected stdin set up by E.Start
        infile = options.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.intersection
        except ImportError:
            raise ValueError("filtering for intervals requires the bx tools")

        intervals = GTF.readGFFFromFileAsIntervals(
            IOTools.openFile(options.filename_filter_sbjct, "r"))

        # one interval index per contig
        intersectors = {}

        for contig, values in intervals.items():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write("# read %i intervals for %i contigs.\n" %
                                 (sum(len(x) for x in intervals.values()),
                                  len(intersectors)))
    else:
        intersectors = None

    ################################################
    ################################################
    ################################################
    # processing of a chunk (matches of same query)
    ################################################
    ninput, noutput, nskipped = 0, 0, 0

    # number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    # NOTE(review): nothing in this function appends to mapped_coverages,
    # so the "mapped" histogram below is always computed from an empty
    # list.
    mapped_coverages = []
    # query ids which are fully/good/partially matched, i.e., after
    # combining all aligned regions
    fully_matched = []
    well_matched = []
    partially_matched = []

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id.

        Applies the per-match thresholds and the forbidden-region filter,
        records coverage statistics, selects matches via
        ``selectMatches`` and writes them to ``options.stdout``. Queries
        that end up without matches are reported to ``outfile_empty`` if
        it is open.
        """

        # The counters live in the enclosing main(); ``nonlocal`` is
        # required to update them (``global`` would look for module-level
        # names that do not exist).
        nonlocal ninput, noutput, nskipped
        nonlocal nfull_matches, npartial_matches, ngood_matches
        nonlocal nremoved_pid, nremoved_query_coverage, nempty
        nonlocal nremoved_gaps, nremoved_nmatches
        nonlocal nremoved_regions, nqueries_removed_region
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        # per-query removal counts, reported in outfile_empty
        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentory matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                x_nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                x_nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * \
                    options.threshold_max_error_rate ** (
                        match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                # The thresholds below are maxima: a match is discarded
                # when it has MORE gaps / gap characters than permitted.
                if options.threshold_max_query_gaps and match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches matches in a
        # forbidden region.
        # NOTE(review): with --keep-forbidden this filter is simply
        # switched off; the option's help text suggests it should keep
        # only overlapping matches instead - confirm intended behaviour.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if found and not options.keep_forbidden:
                    nremoved_regions += 1
                    keep = False

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        # ``match`` is the last match of the loop above; the query length
        # is taken from it (matches is non-empty at this point).
        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" %
                                         "\t".join(map(str, (
                                             match.mQueryId, match.mSbjctId,
                                             match.strand,
                                             "%5.2f" % match.mQueryCoverage,
                                             "%5.2f" % match.mSbjctCoverage,
                                             "%5.2f" % match.mPid,
                                             match.mQueryLength,
                                             match.mSbjctLength,
                                             match.mQueryFrom, match.mQueryTo,
                                             match.mSbjctFrom, match.mSbjctTo,
                                             ",".join(
                                                 map(str, match.mBlockSizes)),
                                             ",".join(
                                                 map(str, match.mQueryBlockStarts)),
                                             ",".join(
                                                 map(str, match.mSbjctBlockStarts)),
                                         ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(("query_id", "sbjct_id", "sstrand", "qcoverage", "scoverage",
                                        "pid", "qlen", "slen", "qfrom", "qto", "sfrom", "sto", "blocks", "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    ################################################
    ################################################
    # main loop
    ################################################
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0

    iterator = Blat.BlatIterator(infile)

    while True:

        try:
            match = next(iterator)
        except Blat.ParsingError:
            # restart parsing on the same stream after a malformed entry
            iterator = Blat.BlatIterator(infile)
            continue
        except StopIteration:
            # tolerate iterators that signal the end the standard way
            break

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        # a change of query id closes the current chunk
        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    # flush the last chunk; nothing to do if the input was empty
    if matches:
        processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)

    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" % (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (ninput, noutput))
        options.stdlog.write("# individual coverage: full=%i, good=%i, partial=%i\n" % (
            nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write("# aggregate  coverage: full=%i, good=%i, partial=%i\n" % (
            len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write("# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %
                             (nskipped + nqueries_removed_region + nempty,
                              nskipped, nqueries_removed_region, nempty))
        options.stdlog.write("# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" % (
            nremoved_pid, nremoved_query_coverage, nremoved_gaps, nremoved_regions, nremoved_nmatches))

    # release the files opened by this function
    if outfile_empty:
        outfile_empty.close()
    if options.polyA:
        options.outfile_polyA.close()
    if infile is not options.stdin:
        infile.close()

    E.Stop()
示例#24
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl (or pslx, if any sequence based method is selected) matches
    from ``options.stdin`` and writes one tab-separated row per match to
    ``options.stdout``: the match itself (or only its query id with
    ``--without-match``) followed by the columns of every selected
    counter. Matches whose query and sbjct sequences differ in length are
    skipped with a warning.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--mask-lowercase", dest="mask_lowercase", action="store_true",
                      help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option("--with-match", dest="with_match", action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option("--without-match", dest="with_match", action="store_false",
                      help="do not echo the match in output [default=%default]")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=(
                          "counts", "baseml", "match", "query-counts", "sbjct-counts"),
                      help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # pass argv on so that the documented "unless *argv* is given"
    # contract actually holds
    (options, args) = E.Start(parser, argv=argv)

    # counters called with the match object
    counters_plain = []
    # counters called with the (query, sbjct) sequence pair
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # sequence based counters read the sequences from pslx input
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write("\t".join(
        [header, ] +
        ["\t".join(x.getHeaders()) for x in counters] +
        ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the columns of the row and write it in one go at the
        # end: a match that is skipped below must not leave a partial,
        # unterminated line in the output.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.extend(str(counter) for counter in counters)

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            fields.extend(str(counter) for counter in counters_plain)

        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
示例#25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl matches from ``options.stdin``. For each match the wiggle
    values stored for the matched sbjct range are looked up, restricted
    to positions that are aligned to the query, and summarized with
    ``Stats.DistributionalParameters``. One row per match with at least
    one value is written to ``options.stdout``.

    Raises IOError if no file matches the ``--wiggle-files`` glob.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("--with-values", dest="with_values", action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    # pass argv on so that the documented "unless *argv* is given"
    # contract actually holds
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    # read from options.stdin so that a redirected stdin set up by
    # E.Start is honoured
    iterator = Blat.BlatIterator(options.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" % ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while True:

        if options.test and ninput >= options.test:
            break

        # next() works with both the Python 2 and Python 3 iterator
        # protocol, unlike iterator.next()
        try:
            match = next(iterator)
        except StopIteration:
            break

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand

        # NOTE(review): the block starts/sizes are formatted with %s
        # as-is; if they are Python lists this yields "[1, 2]" style
        # text - confirm this is what AlignmentFormatBlat expects.
        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat("%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
            match.mSbjctFrom,
            match.mSbjctTo,
            match.mQueryFrom,
            match.mQueryTo,
            match.mSbjctBlockStarts,
            match.mQueryBlockStarts,
            match.mBlockSizes))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        # keep only values at sbjct positions that map onto the query;
        # x is advanced by one for every value of a run
        values = []
        for x, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(x) >= 0:
                    values.append(v)
                x += 1
        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
示例#26
0
文件: chain2psl.py 项目: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignments in chain format from ``options.stdin`` and writes
    one psl formatted line per chain to ``options.stdout``. Comment
    lines (starting with ``#``) and blank lines are ignored.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    def chain_iterator(infile):
        """yield the lines of each chain in *infile* as a list.

        A chain starts at a line beginning with ``chain`` and extends up
        to the next such line or the end of the input. Nothing is
        yielded for input without any content lines.
        """
        lines = []
        for line in infile:

            if line.startswith("#"):
                continue
            if line.strip() == "":
                continue
            if line.startswith("chain"):
                if lines:
                    yield lines
                lines = []
            lines.append(line)

        # flush the last chain; empty input yields nothing
        if lines:
            yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        # split() discards the line terminator itself, so a final line
        # without a trailing newline is parsed correctly as well
        (_, _, psl.mSbjctId, target_length, target_strand, target_start,
         target_end, psl.mQueryId, query_length, query_strand, query_start,
         query_end, alignment_id) = lines[0].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start,
              query_end,
              query_length,
              target_start,
              target_end,
              target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        # all but the last data line hold: block size, gap in target,
        # gap in query
        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line.split()]
            map_query2target.addDiagonal(qstart, qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        # the last line of a chain holds only the size of the final block
        size = int(lines[-1].strip())

        map_query2target.addDiagonal(qstart, qstart + size, tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = psl.mQueryLength - \
                psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
示例#27
0
文件: psl2psl.py 项目: Q-KIM/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The methods collected from ``--method`` are handled in the order in
    which they were given:

    * ``test``, ``rename-query``, ``sanitize``, ``filter-fasta``,
      ``remove-overlapping-query`` and ``remove-overlapping-target`` wrap
      the psl iterator reading from ``options.stdin`` and can be chained.
    * every other method calls its own processing function and ends the
      method loop; methods listed after it are ignored.

    Once the method loop has finished, whatever the (possibly wrapped)
    iterator still yields is written to ``options.stdout``.

    Raises
    ------
    ValueError
        if ``add-sequence`` is requested without both the query and the
        target sequence file.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # hand argv on so that an explicitly supplied argument list is the one
    # that gets parsed, as promised in the docstring.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
       (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # An unset header and "none" both mean "write no header", so only the
    # two header-producing values need to be tested.
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        # methods that do their own processing: run and stop looking at
        # further methods
        if "map" == method:
            pslMap(options)
            break
        elif "filter-keep" == method:
            pslFilter(options, keep=True)
            break
        elif "filter-remove" == method:
            pslFilter(options, keep=False)
            break
        elif "merge" == method:
            pslMerge(options)
            break
        elif "add-sequence" == method:
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif "complement" == method:
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif "select-query" == method:
            pslSelectQuery(options)
            break
        # stream methods: wrap the iterator and carry on so that they can
        # be chained
        elif "test" == method:
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif "rename-query" == method:
            iterator = iterator_rename_query(iterator, options)
        elif "sanitize" == method:
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif "filter-fasta" == method:
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif "remove-overlapping-query" == method:
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif "remove-overlapping-target" == method:
            iterator = iterator_filter_overlapping_target(iterator, options)

    # write out whatever the iterator chain still yields
    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.Stop()
示例#28
0
def pslMerge(options):
    """merge psl alignments.

    Matches are read from ``sys.stdin``. Consecutive matches that share
    query id, target id and strand are collected into a group; each group
    is reduced to a single match that is written to ``options.stdout``.

    Within a group, a chain of non-overlapping, co-linear matches is
    selected by a shortest-path search (see ``process``); matches that are
    not part of the chain are dropped with a warning.

    ``options`` is read for ``test``, ``loglevel``, ``report_step``,
    ``stdout`` and ``stdlog``.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    # ndiscarded and nskipped are never modified below; they are only
    # reported (as 0) in the final summary.
    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    # key of the group that is currently being collected
    last_query = None
    last_target = None
    last_strand = None

    def process(matches):
        """merge one group of matches and write the result.

        Returns 1, the number of records written, so that the caller can
        add it to its output counter.
        """

        # template for the merged record; taken before sorting, i.e. it is
        # the first match of the group in input order
        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        # one node per match plus two extra nodes used as source and sink
        graph = networkx.DiGraph()
        graph.add_nodes_from(range(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        # f(x, y) is true if y lies strictly beyond x on the target in the
        # direction given by the strand of the group
        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                # d > 0: overlap on the query; d <= 0: -d is the size of
                # the gap on the query between the two matches
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    # compatible pair: the edge costs the query gap
                    # NOTE(review): passing the attributes as a positional
                    # dict looks like the networkx 1.x calling convention -
                    # confirm against the installed networkx version.
                    graph.add_edge(x, y, {'weight': -d})

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            # entering a chain at x costs the query residues before x,
            # leaving it at x costs the query residues after x
            graph.add_edge(source, x, {'weight': xx.mQueryFrom})
            graph.add_edge(
                x, target, {'weight': xx.mQueryLength - xx.mQueryTo})

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        # path with the smallest summed edge weight from source to sink
        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        # strip source and sink, keep the matches along the path
        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(("query=%s, target=%s, strand=%s: "
                    "removed overlapping/out-of-order segments: "
                    "before=%i, after=%i") %
                   (matches[0].mQueryId,
                    matches[0].mSbjctId,
                    matches[0].strand,
                    len(matches),
                    len(new_matches)))

        matches = new_matches

        # combine the alignments of all retained matches into one map
        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1

    while 1:

        match = next(iterator)
        # a false value from the iterator is treated as end of input
        if not match:
            break

        ninput += 1
        # debugging aid: stop once the requested number of matches has
        # been read; the match that triggers the stop is not added to the
        # current group
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 10:
            options.stdlog.write("# input: %s\n" % (str(match)))

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        # a change in query, strand or target starts a new group; the
        # group collected so far (if any) is merged and written first
        if match.mQueryId != last_query or\
           match.strand != last_strand or\
           match.mSbjctId != last_target:
            if last_query:
                noutput += process(matches)
            matches = []
            last_query, last_target, last_strand = (
                match.mQueryId, match.mSbjctId, match.strand)

        matches.append(match)

    # flush the group that was being collected when the input ended
    if last_query:
        noutput += process(matches)

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
示例#29
0
文件: psl2fasta.py 项目: siping/cgat
        forward_query = False,
        )
    
    (options, args) = E.Start( parser )

    if options.filename_query:
        query = IndexedFasta.IndexedFasta( options.filename_query )

    if options.filename_target:
        target = IndexedFasta.IndexedFasta( options.filename_target )

    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator( options.stdin ):        
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment( -min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts) )
        q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo )
        t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo )
        query_ali, sbjct_ali = getAlignment( m, q, t, options )

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement( query_ali )
            sbjct_ali = Genomics.complement( sbjct_ali )

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \
                                 (options.query_prefix, 
示例#30
0
文件: maq2psl.py 项目: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Maq matches are read from ``options.stdin`` and written as psl records
    to ``options.stdout``.

    Without ``--filename-coordinates`` every match is converted as it is.
    With a coordinates file, matches are paired with the segments read
    from that file and each match is written as a two-block alignment: the
    first block ends at ``mLeftStart + segment_length`` and the second
    block starts at ``mRightStart`` of the segment.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: maq2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-c", "--filename-coordinates", dest="filename_coordinates", type="string",
                      help="filename with coordinates.")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename pattern for additional data [%default].")

    parser.set_defaults(
        genome_file="genome",
        filename_coordinates=None,
        segment_length=32,
    )

    # hand argv on so that an explicitly supplied argument list is the one
    # that gets parsed, as promised in the docstring.
    (options, args) = E.Start(parser, argv=argv)

    if options.genome_file:
        genome = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome = None

    ninput, noutput = 0, 0

    if options.filename_coordinates:
        segment_length = options.segment_length

        # keep the coordinates file open for exactly as long as the
        # pairing below consumes it
        with open(options.filename_coordinates, "r") as infile_coordinates:
            grouped = matchby_sequence(
                iterator_segments(infile_coordinates, options.segment_length),
                Maq.iterator(options.stdin),
                lambda x: (x.mSegment),
                lambda x: (x.contig))

            for segments, maqs in grouped:
                pairs = match_smaller(segments, maqs,
                                      lambda x: x.start,
                                      lambda x: x.start)

                for segment, maq in pairs:
                    ninput += 1

                    assert maq.start >= segment.start, \
                        "maq start < segment start: %i < %i" % (
                            maq.start, segment.start)
                    assert maq.start + maq.mLength <= segment.start + 2 * segment_length, \
                        "maq end > segment end: %i > %i" % (
                            maq.start + maq.mLength,
                            segment.start + 2 * segment_length)

                    psl = Blat.Match()
                    psl.fromMaq(maq)

                    match_start = maq.start
                    segment_start = segment.start
                    contig, left_start, right_start = (
                        segment.contig, segment.mLeftStart, segment.mRightStart)

                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# mapping: name=%s, match_start=%i, segment=%s\n" %
                            (maq.contig, match_start, str(segment)))

                    # build positions of the two blocks: the part of the
                    # match before offset segment_length goes to the left
                    # block, the remainder to the right block
                    left_size = segment_length - (match_start - segment_start)
                    right_size = segment_length - left_size
                    mapped1_start = left_start + match_start - segment_start
                    mapped1_end = left_start + segment_length
                    mapped2_start = right_start
                    mapped2_end = right_start + right_size

                    if options.loglevel >= 3:
                        options.stdlog.write(
                            "# mapped: match_start=%i, segment_start=%i, left_size=%i, right_size=%i, mapped1=(%i-%i), mapped2=(%i-%i)\n" %
                            (match_start, segment_start, left_size, right_size,
                             mapped1_start, mapped1_end,
                             mapped2_start, mapped2_end))

                    psl.mSbjctId = contig
                    if genome:
                        psl.mSbjctLength = genome.getLength(contig)
                    psl.mSbjctFrom = mapped1_start
                    psl.mSbjctTo = mapped2_end
                    psl.mNBlocks = 2
                    psl.mBlockSizes = [left_size, right_size]
                    psl.mQueryBlockStarts = [0, left_size]
                    psl.mSbjctBlockStarts = [mapped1_start, mapped2_start]
                    # a single gap on the target between the two blocks
                    psl.mSbjctNGapsCounts = 1
                    psl.mSbjctNGapsBases = mapped2_start - mapped1_end

                    options.stdout.write(str(psl) + "\n")
                    noutput += 1

    else:

        for maq in Maq.iterator(options.stdin):
            ninput += 1

            psl = Blat.Match()
            psl.fromMaq(maq)

            options.stdout.write(str(psl) + "\n")
            noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()
示例#31
0
def main():
    """Report low-quality regions of multiple-alignment rows.

    A psl file (``options.filename_map``) mapping genes onto the genome is
    read first; every query id may appear only once. Quality scores are
    taken from the indexed file ``options.quality_file``.

    ``options.stdin`` supplies tab-separated lines with the three columns
    cluster_id, gene_id and alignment row; lines starting with
    ``cluster_id`` are skipped. For every non-gap alignment column whose
    genomic position has a quality score below
    ``options.quality_threshold``, the column is marked. If
    ``options.frame`` is larger than 1, the whole frame-sized window that
    contains the column is marked instead. Marked positions are grouped
    into regions and written as ``cluster_id``, ``start``, ``end`` to
    ``options.stdout``.

    With ``--random`` the quality scores of the aligned positions are
    shuffled before the threshold is applied.

    NOTE(review): ``filename_map`` defaults to None and no command line
    option sets it in this function, so ``open`` fails unless the default
    is changed - confirm how the map file is meant to be supplied.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion", type="float",
                      help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    map_genes2genome = {}
    # the context manager closes the file even if parsing or the
    # duplicate check below raises
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, \
                "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)

        # combine gene->mali and gene->genome into mali->genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(
            map_mali2genome, map_gene2mali, map_gene2genome,
            alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp is the position counted from the end of the row; it is the
        # position that gets reported for matches on the negative strand,
        # where the row was reversed above
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            genome_pos = map_mali2genome.mapRowToCol(fp)
            # offset into the quality scores fetched for the match
            y = genome_pos - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug("low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                        (cluster_id, p, c, match.mSbjctId, match.strand,
                         genome_pos, quality_scores[y]))
                if options.frame > 1:
                    # mask the complete frame-sized window around p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
示例#32
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Bed intervals are read from ``options.stdin`` and written as psl
    records to ``options.stdout``. An entry with block information
    (``blockSizes`` / ``blockStarts``) becomes a multi-block match named
    after the bed ``name`` field; any other entry becomes a single block
    on the forward strand, named after its running input number. Query
    coordinates always start at 0 and the blocks follow each other
    without gaps on the query.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q",
                      "--query",
                      dest="query",
                      type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t",
                      "--target",
                      dest="target",
                      type="string",
                      help="sequence to use for target [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(
        genome_file=None,
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    # a single record is re-used; all fields written below are reset for
    # every bed entry
    psl = Blat.Match()

    for bed in Bed.iterator(options.stdin):

        ninput += 1

        start, end = bed.start, bed.end

        if "blockSizes" in bed:
            psl.mQueryId = bed["name"]
            # Skip empty items instead of blindly dropping the last one:
            # this handles block lists both with and without the trailing
            # comma.
            blocksizes = [
                int(x) for x in bed["blockSizes"].split(",") if x.strip()
            ]
            # block starts in bed are relative to the interval start
            sbjctblockstarts = [
                int(x) + start
                for x in bed["blockStarts"].split(",") if x.strip()
            ]
            strand = bed["strand"]
        else:
            psl.mQueryId = "%i" % ninput
            blocksizes = [end - start]
            sbjctblockstarts = [start]
            strand = "+"

        psl.mSbjctId = bed.contig
        psl.mSbjctFrom, psl.mSbjctTo = start, end
        psl.mQueryFrom, psl.mQueryTo = 0, end - start

        psl.mBlockSizes = blocksizes
        psl.mNBlocks = len(blocksizes)
        psl.strand = strand

        # query block starts: running sum of the preceding block sizes
        query_starts, offset = [], 0
        for size in blocksizes:
            query_starts.append(offset)
            offset += size

        psl.mQueryBlockStarts = query_starts
        psl.mSbjctBlockStarts = sbjctblockstarts
        psl.mQueryLength = sum(psl.mBlockSizes)
        if fasta:
            psl.mSbjctLength = fasta.getLength(bed.contig)

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
示例#33
0
def main():
    """Convert psl alignments on stdin into gff/gtf exon records.

    Every aligned block of a match is written to ``options.stdout`` as one
    ``exon`` feature with source ``psl`` on the target contig. The
    identifier is the query id; if a query id occurs more than once, later
    occurrences get a ``:<n>`` suffix. The strand is looked up in the
    optional ``--filename-strand`` map first and falls back to the strand
    of the match.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    # optparse only expands the lower-case placeholder %default
    parser.add_option(
        "-s",
        "--filename-strand",
        dest="filename_strand",
        type="string",
        help="set strand information according to file [default=%default].")

    parser.set_defaults(as_gtf=False, filename_strand=None, test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # the same entry type serves both output flavours; only the way the
    # identifiers are attached differs (see below)
    gff = GTF.Entry()
    gff.source = "psl"
    gff.feature = "exon"

    # number of times each query id has been seen so far
    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        # builtin next() works with both the Python 2 and the Python 3
        # iterator protocol
        match = next(iterator)

        if match is None:
            break

        ninput += 1

        # make identifiers unique: first occurrence keeps the plain query
        # id, later ones get a running suffix
        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            match_id = match.mQueryId
        else:
            match_id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        gff.contig = match.mSbjctId
        if options.as_gtf:
            gff.gene_id = match_id
            gff.transcript_id = match_id
        else:
            gff.clearAttributes()
            gff.addAttribute("gene_id", match_id)

        if match_id in map_id2strand:
            gff.strand = map_id2strand[match_id]
        else:
            gff.strand = match.strand

        # one exon record per aligned block, in target coordinates
        for qstart, sstart, size in match.getBlocks():

            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
示例#34
0
def main(argv=sys.argv):
    """Convert psl alignments on stdin into per-base coverage tracks.

    For every contig of the genome a counter array is allocated; each
    aligned block of each match increments the counters of the target
    positions it covers. Runs of identical, non-zero counts are then
    written in wiggle (``wiggle``, ``bigwig``) or bedgraph (``bedgraph``,
    ``bigbed``) layout. For the two binary formats the text output goes to
    a temporary file that is converted with ``wigToBigWig`` or
    ``bedToBigBed`` into ``options.output_filename``.

    Returns
    -------
    None on success. If the external converter cannot be started, 1; if it
    is killed by a signal, the signal number; if it exits with a non-zero
    status, that status.

    Raises
    ------
    ValueError
        if no genome file is given, or no output filename is given for
        bigwig/bigbed output.
    OSError
        if the converter executable is not found in the path.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern", dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode

    # The counters are sized from the contig lengths, so nothing can be
    # counted without a genome. Fail early with a clear message instead of
    # a NameError on the first match.
    if not options.genome_file:
        raise ValueError("please supply a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    counts = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * typecode().itemsize))
    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        counts[contig] = numpy.zeros(size, typecode)

    E.info("allocated memory for %i contigs" % len(fasta))

    to_binary = options.output_format in ("bigwig", "bigbed")

    if to_binary:

        if not options.output_filename:
            raise ValueError(
                "please output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            executable_name = "bedToBigBed"

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        with open(tmpfile_sizes, "w") as outfile_size:
            for contig, size in contig_sizes.items():
                outfile_size.write("%s\t%s\n" % (contig, size))

        outfile = open(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        # builtin next() works with both the Python 2 and the Python 3
        # iterator protocol
        match = next(iterator)

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts, match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    # "wiggle" is the value offered on the command line (and the default)
    if options.output_format in ("wiggle", "bigwig"):
        E.info("starting wig output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            # one record per run of identical consecutive counts
            for val, group in itertools.groupby(enumerate(vals),
                                                lambda x: x[1]):
                run = list(group)
                start, end = run[0][0], run[-1][0]
                if val > 0:
                    # NOTE(review): start is the 0-based array index while
                    # wiggle variableStep positions are documented as
                    # 1-based - confirm the intended coordinate system.
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1
    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, group in itertools.groupby(enumerate(vals),
                                                lambda x: x[1]):
                run = list(group)
                start, end = run[0][0], run[-1][0]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if to_binary:
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            # argument list instead of a shell string: file names with
            # blanks or shell meta characters are passed through unchanged
            retcode = subprocess.call([
                executable,
                tmpfile_wig,
                tmpfile_sizes,
                os.path.abspath(options.output_filename)])
        except OSError as msg:
            E.warn("Error while executing bigwig: %s" % msg)
            return 1
        finally:
            # remove the temporary files on every exit path
            shutil.rmtree(tmpdir)

        if retcode < 0:
            E.warn("%s terminated with signal: %i" %
                   (executable_name, -retcode))
            return -retcode
        if retcode > 0:
            E.warn("%s failed with exit code: %i" %
                   (executable_name, retcode))
            return retcode

        E.info("finished bigwig conversion")
示例#35
0
文件: psl2table.py 项目: siping/cgat
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append( SequencePairProperties.SequencePairPropertiesCountsNa() )
        elif method == "query-counts":
            counters.append( QueriesCounter() )
        elif method == "sbjct-counts":
            counters.append( SbjctsCounter() )
        elif method == "baseml":
            counters.append( SequencePairProperties.SequencePairPropertiesBaseML( options ) )
        elif method == "match":
            counters_plain.append( CounterMatch( options ) )
            
    if counters:
        iterator = Blat.iterator_pslx( options.stdin )
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator( options.stdin )
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write( "\t".join( 
            [header,] + 
            [ "\t".join(x.getHeaders()) for x in counters] +
            [ "\t".join(x.getHeaders()) for x in counters_plain] ) + "\n" )

    ninput, noutput, nskipped = 0, 0, 0
示例#36
0
文件: gff2psl.py 项目: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GFF/GTF records from ``options.stdin``, groups them (per
    transcript when ``--is-gtf`` is set, otherwise via
    ``GTF.joined_iterator``), merges the intervals of each group and
    writes one ``Blat.Match`` line per group to ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # Hand the caller-supplied argument vector to the option parser so that
    # main(argv=[...]) behaves as the docstring promises.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    # Read from options.stdin (not sys.stdin) so that input redirection set
    # up through the pipe options is honoured, mirroring options.stdout.
    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(options.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        # --test style limit: stop once the requested number of groups
        # has been read.
        if options.test and ninput >= options.test:
            break

        ninput += 1

        gffs = list(gffs)
        if not gffs:
            # nothing to convert for an empty group
            nskipped += 1
            continue

        result = alignlib_lite.py_makeAlignmentBlocks()

        # Lay the merged intervals end-to-end on the query: xstart/xend are
        # query coordinates, start/end are coordinates on the contig.
        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        # The identifiers are taken from the last record of the group.  The
        # original code relied on the list-comprehension variable leaking
        # into the enclosing scope, which only works in Python 2.
        last_gff = gffs[-1]

        entry = Blat.Match()
        entry.mQueryId = last_gff.transcript_id
        entry.mSbjctId = last_gff.contig
        entry.strand = last_gff.strand

        # NOTE(review): the two length look-ups below are not symmetric.
        # The sbjct length falls back to the alignment extent only when a
        # genome file is given but lacks the contig; the query length falls
        # back only when no query file is given at all.  Kept as is -
        # confirm which of the two is intended.
        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
示例#37
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a gene-to-genome map (PSL format, ``options.filename_map``) and
    indexed quality scores (``options.quality_file``).  Each line on
    ``options.stdin`` holds ``cluster_id``, ``gene_id`` and an alignment
    row, separated by tabs.  For every aligned character whose quality
    score is below ``options.quality_threshold`` the alignment column (or
    the whole frame of ``options.frame`` columns containing it) is marked,
    and the marked columns are written to ``options.stdout`` as
    ``cluster_id``/``start``/``end`` regions.
    """
    # local import: only needed for the argv default
    import sys

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion",
        dest="random_proportion",
        type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random",
        dest="random",
        action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser, argv=argv)

    ##################################################
    # read map
    ##################################################
    map_genes2genome = {}
    # the context manager closes the file even if parsing or the
    # duplicate check fails
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        # skip the header line
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        # strip only the newline: a final line without a trailing newline
        # must not lose its last alignment character
        cluster_id, gene_id, alignment = line.rstrip("\n").split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom, match.mSbjctTo)

        # combine gene->mali and gene->genome into mali->genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali,
                                          map_gene2genome, alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                # offset into quality_scores, which starts at mSbjctFrom
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp is the position counted from the end of the alignment; it is
        # reported instead of fp when the alignment was reversed above
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i"
                    % (cluster_id, p, c, match.mSbjctId, match.strand,
                       map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # mask the whole frame-sized window containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
示例#38
0
文件: psl2psl.py 项目: siping/cgat
    (options, args) = E.Start( parser, add_pipe_options = True )

    if options.filename_queries:
        query_fasta =  IndexedFasta.IndexedFasta( options.filename_queries )
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta =  IndexedFasta.IndexedFasta( options.filename_sbjcts )
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and (sbjct_fasta == None or query_fasta == None):
        raise ValueError( "please supply both indexed query and target/genome sequence data." )

    iterator = Blat.iterator( options.stdin )

    if options.header != None or options.header != "none":
        if options.header == "table":
            options.stdout.write( "\t".join( Blat.FIELDS ) + "\n" )
        elif options.header == "full":
            options.stdout.write( Blat.HEADER + "\n" )

    for method in options.methods:
    
        if "map" == method:
            pslMap( options )
            break
        elif "filter-keep" == method:
            pslFilter( options, keep = True )
            break
示例#39
0
def main( argv = None ):
    
    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: align_pairs.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"] )

    parser.add_option("--skip-statistics", dest="skip_stats", action="store_true",
                      help="do not compute alignment statistics [%default]."  )

    parser.add_option("--method", dest="methods", type="choice", action="append",
                      choices=("dialign", "clustal", "blastz", "nw", "sw", "dba", "dialignlgs" ),
                      help="alignment method [%default]."  )

    parser.add_option("--anchor-alignment", dest="anchor_alignment", type="int",
                      help="anchor alignmet with xxx residues [%default]."  )

    parser.add_option("--output-format", dest="output_formats", type="choice", action="append",
                      choices=("fasta", "stats", "psl" ),
                      help="anchor alignment with xxx residues [%default]."  )

    parser.add_option("--input-format", dest="input_format", type="choice", 
                      choices=("fasta", "list" ),
                      help="input format of stdin [%default]."  )

    parser.add_option("--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="output pattern for multiple files [%default]."  )

    parser.add_option("--filename-sequences1", dest="filename_sequences1", type="string",
                      help="first indexed input filename with sequences [%default]."  )

    parser.add_option("--filename-sequences2", dest="filename_sequences2", type="string",
                      help="second indexed input filename with sequences [%default]."  )

    parser.add_option("--options-blastz", dest="options_blastz", type="string",
                      help="command line options for blastz [%default]."  )

    parser.set_defaults( 
        skip_stats = False,
        methods = [],
        output_formats = [],
        input_format = "fasta",
        output_filename_pattern = None,
        filename_sequences1 = None,
        filename_sequences2 = None,
        anchor_alignment = 0,
        options_blastz = "C=2 B=1 T=0 W=6 K=2200" )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(options.methods) == 0:
        print USAGE
        print "please specify an alignment method."
        sys.exit(1)

    if len(options.output_formats) == 0:
        print USAGE
        print "please specify at least one output format."
        sys.exit(1)

    if len(args) == 2:
        iterator = iterate_double_fasta( args[0], args[1] )        
    elif options.filename_sequences1 and options.filename_sequences2:
        if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
            infile = options.stdin
        elif len(args) == 1:
            infile = open( args[0], "r") 
                
        iterator = iterate_list( infile, options.filename_sequences1, options.filename_sequences2 )
    else:
        iterator = iterate_single_fasta( options.stdin )
        

    npairs, ntoken_pairs = 0, 0
    ninput, nskipped, nerrors = 0, 0, 0

    outfile_table = None
    outfile_fasta = None
    outfile_psl = None
    if "table" in options.output_formats:
        outfile_table = getFile( "table ", options )
        outfile_table.write( """# CATEGORY:       category [intron|exon]
# METHOD:         alignment method
# TOKEN:          name
# ID:             segment id
# TOTAL:          number of segments
# LEN:            length of segment
# NALIGNED:       number of aligned positions
# PALIGNED:       percentage of aligned positions
# IDENT:          number of identical positions
# TRANSIT:        number of transitions
# TRANSVERS:      number of transversion
# MATCHES:        number of matching positions
# PIDENT:         percentage of identical positions
# PTRANSIT:       precentage of transitions
# PTRANSVERS:     precentage of transversion
# BLOCKSIZES:     alignment, length of blocks
# GAPS:           gap sizes in sequence 1/2
CATEGORY\tMETHOD\tTOKEN1\tID1\tTOTAL1\tLEN1\tTOKEN2\tID2\tTOTAL2\tLEN2\tNALIGNED\tPALIGNED\tIDENT\tTRANSIT\tTRANSVER\tMATCHES\tPIDENT\tPTRANSVIT\tPTRANVER\tBLOCKSIZES\tGAPSIZES\tGAPSIZES\tTYPE1\tTYPE2\n""")

    if "fasta" in options.output_formats:
        outfile_fasta = getFile( "fasta", options )

    if "psl" in options.output_formats:
        outfile_psl = getFile( "psl", options )

    ## setup alignment objects
    for unaligned_pair in iterator:

        ninput += 1
        
        for method in options.methods:

            pair = AlignedPairs.AlignedPair( unaligned_pair )
            pair.mOptionsBlastZ = options.options_blastz

            try:
                pair.Align( method, anchor = options.anchor_alignment )
            except AlignedPairs.AlignmentError, msg:
                
                if options.loglevel >= 1:
                    options.stdlog.write( "# %s - %s: %s\n" % (msg, unaligned_pair.mToken1, unaligned_pair.mToken2))
                    if options.loglevel >= 2:
                        options.stdlog.write( "# input=%s\n" % (str(unaligned_pair)))

                nskipped += 1
                continue

            if outfile_table:
                outfile_table.write( str(pair) + "\n" )
            
            if outfile_fasta:
                outfile_fasta.write( ">%s\n%s\n>%s\n%s\n" % (pair.mToken1, pair.mAlignedSequence1, pair.mToken2, pair.mAlignedSequence2 ) )

            if outfile_psl:
                entry = Blat.Match()
                entry.mQueryId, entry.mSbjctId = pair.mToken1, pair.mToken2
                entry.strand = pair.strand
                entry.fromMap( pair.mAlignment )
                outfile_psl.write( str(entry) + "\n" )

            npairs += 1