Exemplo n.º 1
0
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # args: command-line tokens; one or more input paths plus options.
    # Returns zip(header, summary), where `header` is defined outside this
    # function and summary is (sum, l50, nn50, min, max, count). The same
    # pairs are echoed to stderr and a log histogram is drawn.
    # Raises ValueError when no contig size could be read from the inputs.
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0",
        default=False,
        action="store_true",
        help="Print size and L50 to stdout",
    )

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess the file format from the first character of the first input.
    # Slicing (instead of indexing) tolerates an empty file, and the context
    # manager releases the probe handle immediately.
    with open(args[0]) as probe_fp:
        probe = probe_fp.readline()[:1]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes.extend(size for _, size in f.itersizes())
    else:
        for row in must_open(args):
            try:
                # The size is the last whitespace-separated field; going
                # through float() also accepts values such as "1500.0".
                ctgsize = int(float(row.split()[-1]))
            except (ValueError, IndexError, OverflowError):
                # Skip non-numeric rows, blank rows and infinities.
                continue
            ctgsizes.append(ctgsize)

    if not ctgsizes:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError("No contig sizes found in {0}".format(", ".join(args)))

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print(", ".join(args), file=sys.stderr)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)),
        file=sys.stderr,
    )
    loghistogram(ctgsizes)

    if opts.print0:
        print("\t".join(str(x) for x in (",".join(args), sumsize, l50)))

    return zip(header, summary)
Exemplo n.º 2
0
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # args: command-line tokens; one or more FASTA paths plus options.
    # Writes "A50.rplot" in the current directory (unless it already exists
    # and --overwrite was not given), prints a stats table to stderr and then
    # calls generate_plot() on that file. Returns None.
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        # `not p.print_help()` yields a non-zero exit status, matching how the
        # sibling commands report a usage error.
        sys.exit(not p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        # The context manager guarantees the data file is flushed and closed
        # even if one of the FASTA files fails to load.
        with open(rplot, "w") as fw:
            print >>fw, header
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for k, length in f.itersizes()])

                a50, l50, n50size = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                cmin, cmax = min(ctgsizes), max(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                csum, counts = np.sum(ctgsizes), len(ctgsizes)
                statsrows.append((fastafile, l50, n50size, cmin, cmax, cmean,
                    csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

                # Series label: file name without its last extension.
                tag = "{0} (L50={1})".format(
                        op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Emit every `stepsize`-th point; values are divided by 1e6.
                for i, s in zip(xrange(0, len(a50), stepsize), a50[::stepsize]):
                    print >> fw, "\t".join((str(i), str(s / 1000000.), tag))

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
Exemplo n.º 3
0
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # args: command-line tokens; one or more FASTA paths plus options.
    # Writes "A50.rplot" in the current directory (unless it already exists
    # and --overwrite was not given), prints a stats table to stderr and then
    # calls generate_plot() on that file. Returns None.
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        # `not p.print_help()` yields a non-zero exit status, matching how the
        # sibling commands report a usage error.
        sys.exit(not p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        # The context manager guarantees the data file is flushed and closed
        # even if one of the FASTA files fails to load.
        with open(rplot, "w") as fw:
            print >>fw, header
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for k, length in f.itersizes()])

                a50, l50, n50size = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                cmin, cmax = min(ctgsizes), max(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                csum, counts = np.sum(ctgsizes), len(ctgsizes)
                statsrows.append((fastafile, l50, n50size, cmin, cmax, cmean,
                    csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

                # Series label: file name without its last extension.
                tag = "{0} (L50={1})".format(
                        op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Emit every `stepsize`-th point; values are divided by 1e6.
                for i, s in zip(xrange(0, len(a50), stepsize), a50[::stepsize]):
                    print >> fw, "\t".join((str(i), str(s / 1000000.), tag))

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
Exemplo n.º 4
0
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner using query against sequences. When query and ref
    is the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulted from mis-assemblies).
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # Only self-scan mode (ref == query) is implemented; any other pair raises
    # NotImplementedError. For each window of the reference, three chained
    # rules are registered with MakeManager: extract the window with
    # `samtools faidx`, align it to itself with `minimap2 -P`, and draw the
    # result with `minidot`. Nothing is executed here except the initial
    # faidx indexing; the rules are emitted by MakeManager.write().
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunk_size = opts.chunks
    workdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # Index the reference up front (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    reference = Fasta(ref)
    mkdir(workdir)
    manager = MakeManager()
    for seqid, seqlen in reference.itersizes():
        # NOTE(review): range() stops before `seqlen`, so the trailing partial
        # window (and any sequence not longer than one chunk) gets no rule.
        # Confirm whether that is intended.
        for window_end in range(chunk_size, seqlen, chunk_size):
            window_start = window_end - chunk_size + 1  # 1-based, inclusive
            fafile = op.join(
                workdir, "{}_{}_{}.fa".format(seqid, window_start, window_end)
            )
            stem = fafile.rsplit(".", 1)[0]
            paffile = stem + ".paf"
            epsfile = stem + ".eps"

            extract_cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, seqid, window_start, window_end, fafile
            )
            manager.add(ref, fafile, extract_cmd)

            align_cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            manager.add(fafile, paffile, align_cmd)

            plot_cmd = "minidot {} > {}".format(paffile, epsfile)
            manager.add(paffile, epsfile, plot_cmd)

    manager.write()
Exemplo n.º 5
0
Arquivo: base.py Projeto: rrane/jcvi
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0", default=False, action="store_true", help="Print size and L50 to stdout [default: %default]"
    )

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format
    probe = open(args[0]).readline()[0]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())

    else:
        for row in must_open(args):
            try:
                ctgsize = int(row.split()[-1])
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print >> sys.stderr, ", ".join(args)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print >> sys.stderr, " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary))
    loghistogram(ctgsizes)

    if opts.print0:
        print "\t".join(str(x) for x in (",".join(args), sumsize, l50))

    return zip(header, summary)
Exemplo n.º 6
0
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # args: command-line tokens; one or more input paths.
    # Returns zip(header, summary), where `header` is defined outside this
    # function and summary is (sum, l50, nn50, min, max, count). The same
    # pairs are echoed to stderr and a log histogram is drawn.
    # Raises ValueError when no contig size could be read from the inputs.
    p = OptionParser(n50.__doc__)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess the file format from the first character of the first input.
    # Slicing (instead of indexing) tolerates an empty file, and the context
    # manager releases the probe handle immediately.
    with open(args[0]) as probe_fp:
        probe = probe_fp.readline()[:1]
    isFasta = (probe == '>')
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes.extend(size for _, size in f.itersizes())
    else:
        for row in must_open(args):
            try:
                # The size is the last whitespace-separated field.
                ctgsize = int(row.split()[-1])
            except (ValueError, IndexError):
                # Skip non-integer rows (e.g. headers) and blank rows.
                continue
            ctgsizes.append(ctgsize)

    if not ctgsizes:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError("No contig sizes found in {0}".format(", ".join(args)))

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print >> sys.stderr, ", ".join(args)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print >> sys.stderr, " ".join("{0}={1}".format(a, b) for a, b in
                        zip(header, summary))
    loghistogram(ctgsizes, summary=False)

    return zip(header, summary)
Exemplo n.º 7
0
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))
Exemplo n.º 8
0
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))
Exemplo n.º 9
0
def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    seen = set()
    for query, hits in b.iter_hits():

        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print "\t".join(str(x) for x in (query, clr_start, clr_end))
        del sizes[query]

    for q, size in sorted(sizes.items()):
        print "\t".join(str(x) for x in (q, 0, size))
Exemplo n.º 10
0
Arquivo: ca.py Projeto: rrane/jcvi
def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    seen = set()
    for query, hits in b.iter_hits():

        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print "\t".join(str(x) for x in (query, clr_start, clr_end))
        del sizes[query]

    for q, size in sorted(sizes.items()):
        print "\t".join(str(x) for x in (q, 0, size))
Exemplo n.º 11
0
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
    """
    Cluster genes into families from BLAST hits and gene order.

    blast_file: path to the BLAST output, parsed row by row with BlastLine.
    cds_file: FASTA file providing the sequence sizes.
    bed_file: BED file providing gene order and seqids.
    N: maximum distance, in bed positions, between genes that may be joined.
       Overridden with 1e5 when `genefam` is set.
    P: percentage; hits shorter than P% of the shorter sequence are dropped.
       In non-self mode it also bounds the length difference of two
       neighbours that are joined.
    is_self: when true, genes are joined directly from the BLAST hits; when
       false, homolog groups are built first and bed neighbours within N
       positions on the same seqid are joined if they are homologous.
    evalue: hits with a larger e-value are ignored.
    strip_name: forwarded to gene_name() for both query and subject.
    ofile: destination handed to must_open(ofile, "w") for the family lines.
    genefam: when true, seqid/neighbourhood constraints are not applied.

    Returns the list of families (each a sorted list of members) that have at
    least two members; summary statistics are printed to stderr.
    """
    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file; the context manager closes it when done
        g = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len)*P/100.:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                qi, q = order[query]
                si, s = order[subject]

                if abs(qi - si) <= N and b.evalue <= evalue:
                    if genefam or q.seqid == s.seqid:
                        g.join(query, subject)

    else:
        homologs = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len)*P/100.:
                    continue
                if b.evalue > evalue:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N+1):
                    j = i - x
                    if j < 0:
                        # Ran off the start of the bed. Stop before indexing:
                        # a negative index would wrap around, or raise
                        # IndexError on a bed shorter than N.
                        break
                    neighbor = bed[j]
                    if neighbor.seqid != atom.seqid:
                        continue
                    if not homologs.joined(neighbor.accn, atom.accn):
                        continue
                    leni = sizes[bed[i].accn]
                    lenx = sizes[neighbor.accn]
                    # skip pairs whose lengths differ too much
                    if abs(leni - lenx) > max(leni, lenx)*(1-P/100.):
                        continue
                    g.join(neighbor.accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            members = sorted(group)
            print >>fw, ",".join(members)
            ngenes += len(group)
            nfamilies += 1
            families.append(members)

    # max() raises on an empty sequence, so report an empty family instead
    # of crashing when nothing was grouped.
    longest_family = max(families, key=len) if families else []

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))

    return families
Exemplo n.º 12
0
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # args: command-line tokens; a BAM file and a FASTA file plus options.
    # Writes <prefix>.ace, plus <prefix>.astat with --astat and
    # <prefix>.reads with --readids, where <prefix> is the BAM path up to its
    # first ".". Returns None.
    # NOTE(review): --splitdir, --unpaired, --minreadno and --minctgsize are
    # parsed but never read in this function, and no .mates file is written
    # here despite the usage text.
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    # Open inside try/finally so every handle that was opened gets closed,
    # even when a later step raises.
    fw = astatfw = readsfw = None
    try:
        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print(
                "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
                file=fw,
            )
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if astat:
                # Use a separate name for the per-contig value: reusing
                # `astat` would overwrite the option flag, and a falsy
                # statistic would then switch the output off for every
                # remaining contig.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            # every base gets the same placeholder quality
            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            rnames = []
            for a in mapped_reads:
                rname = a.qname

                if readids:
                    print(rname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(
                    rname, alen, ninfos, ntags, fill(aseq)
                )
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
    finally:
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()
Exemplo n.º 13
0
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
    """
    Cluster genes into families from BLAST hits and gene order.

    blast_file: path to the BLAST output, parsed row by row with BlastLine.
    cds_file: FASTA file providing the sequence sizes.
    bed_file: BED file providing gene order and seqids.
    N: maximum distance, in bed positions, between genes that may be joined.
       Overridden with 1e5 when `genefam` is set.
    P: percentage; hits shorter than P% of the shorter sequence are dropped.
       In non-self mode it also bounds the length difference of two
       neighbours that are joined.
    is_self: when true, genes are joined directly from the BLAST hits; when
       false, homolog groups are built first and bed neighbours within N
       positions on the same seqid are joined if they are homologous.
    evalue: hits with a larger e-value are ignored.
    strip_name: forwarded to gene_name() for both query and subject.
    ofile: destination handed to must_open(ofile, "w") for the family lines.
    genefam: when true, seqid/neighbourhood constraints are not applied.

    Returns the list of families (each a sorted list of members) that have at
    least two members; summary statistics are printed to stderr.
    """
    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file; the context manager closes it when done
        g = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                qi, q = order[query]
                si, s = order[subject]

                if abs(qi - si) <= N and b.evalue <= evalue:
                    if genefam or q.seqid == s.seqid:
                        g.join(query, subject)

    else:
        homologs = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue
                if b.evalue > evalue:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    j = i - x
                    if j < 0:
                        # Ran off the start of the bed. Stop before indexing:
                        # a negative index would wrap around, or raise
                        # IndexError on a bed shorter than N.
                        break
                    neighbor = bed[j]
                    if neighbor.seqid != atom.seqid:
                        continue
                    if not homologs.joined(neighbor.accn, atom.accn):
                        continue
                    leni = sizes[bed[i].accn]
                    lenx = sizes[neighbor.accn]
                    # skip pairs whose lengths differ too much
                    if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                        continue
                    g.join(neighbor.accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            members = sorted(group)
            print >> fw, ",".join(members)
            ngenes += len(group)
            nfamilies += 1
            families.append(members)

    # max() raises on an empty sequence, so report an empty family instead
    # of crashing when nothing was grouped.
    longest_family = max(families, key=len) if families else []

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))

    return families
Exemplo n.º 14
0
Arquivo: sam.py Projeto: arvin580/jcvi
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes are kept in comments to leave --help unchanged.
    #
    # args: command-line tokens; a BAM file and a FASTA file plus options.
    # Writes <prefix>.ace, plus <prefix>.astat with --astat and
    # <prefix>.reads with --readids, where <prefix> is the BAM path up to its
    # first ".". Returns None.
    # NOTE(review): --splitdir, --unpaired, --minreadno and --minctgsize are
    # parsed but never read in this function, and no .mates file is written
    # here despite the usage text.
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
        genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    # Open inside try/finally so every handle that was opened gets closed,
    # even when a later step raises.
    fw = astatfw = readsfw = None
    try:
        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        print >> fw, "AS {0} {1}".format(ncontigs, totalreads)
        print >> fw

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                    nsegments)
            print >> fw, fill(str(cseq.seq))
            print >> fw

            if astat:
                # Use a separate name for the per-contig value: reusing
                # `astat` would overwrite the option flag, and a falsy
                # statistic would then switch the output off for every
                # remaining contig.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print >> astatfw, "{0}\t{1:.1f}".format(contig, astat_value)

            # every base gets the same placeholder quality
            text = fill([qual] * nbases, delimiter=" ", width=30)
            print >> fw, "BQ\n{0}".format(text)
            print >> fw

            rnames = []
            for a in mapped_reads:
                rname = a.qname

                if readids:
                    print >> readsfw, rname
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print >> fw, af

            print >> fw

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos,
                        ntags, fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                print >> fw, rd
                print >> fw
                print >> fw, qs
                print >> fw
    finally:
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()