예제 #1
0
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    pair and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    from jcvi.apps.bowtie import align

    parser = OptionParser(contamination.__doc__)
    parser.add_option(
        "--mapped",
        action="store_true",
        default=False,
        help="Retain contaminated reads instead",
    )
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    # Exactly two positionals are expected: the read folder and the reference.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)

    # Forward --mapped to the aligner when requested, --unmapped otherwise.
    if opts.mapped:
        tag = "--mapped"
    else:
        tag = "--unmapped"

    for fastqs, _prefix in iter_project(folder):
        bowtie_args = [ecoli] + fastqs + [tag]
        bowtie_args.append("--cutoff={0}".format(opts.cutoff))
        bowtie_args.append("--null")
        if opts.mateorientation:
            orientation_opt = "--mateorientation={0}".format(opts.mateorientation)
            bowtie_args.append(orientation_opt)
        align(bowtie_args)
예제 #2
0
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    pair and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser,
    # which is why it is reproduced unchanged.
    from jcvi.apps.bowtie import align

    parser = OptionParser(contamination.__doc__)
    parser.add_option(
        "--mapped",
        action="store_true",
        default=False,
        help="Retain contaminated reads instead [default: %default]",
    )
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    # Two positionals are mandatory: the read folder and the reference FASTA.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    tag = "--unmapped" if not opts.mapped else "--mapped"

    for fastqs, _prefix in iter_project(folder, 2):
        align_opts = [ecoli] + fastqs
        align_opts += [tag, "--cutoff={0}".format(opts.cutoff), "--null"]
        if opts.mateorientation:
            align_opts.append(
                "--mateorientation={0}".format(opts.mateorientation)
            )
        # The aligner's (samfile, logfile) result is unpacked but not used.
        _samfile, _logfile = align(align_opts)
예제 #3
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # NOTE: the docstring above doubles as the CLI usage text (it is passed to
    # OptionParser), so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing exactly two positionals
    #       must remain (read folder, reference FASTA), otherwise the help is
    #       printed and the process exits.
    # Side effects: creates the `pairs-<aligner>` work directory, runs the
    #       aligner from inside it, and hands a tab-separated
    #       "original file -> suggested link name" table to
    #       write_file("f.meta", ...).
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # CLC ships its own aligner and pair parser; everything else goes
    # through the SAM-based pair parser.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    # Presumably made absolute so the path still resolves after the chdir
    # into the work directory below.
    ref = get_abs_path(ref)
    messages = []
    # The loop variable is deliberately not called `p` so the option parser
    # above is no longer shadowed.
    for fastqs, prefix in iter_project(folder, 2):
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + fastqs + ["-o", samplefq])

        # Run the aligner inside the work directory, and always return to the
        # starting directory, even when alignment or pair parsing fails.
        os.chdir(work)
        try:
            align_args = [ref, op.basename(samplefq)]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round the median up to its first two digits, e.g. 3456 -> 3500.
        # Assumes stats.median is an integer: int(sf) would fail on a
        # decimal point -- TODO confirm.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), i + 1, suffix
            )
            messages.append("\t".join(str(x) for x in (xp, link)))

    write_file("f.meta", "\n".join(messages), tee=True)
예제 #4
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # NOTE: the docstring above doubles as the CLI usage text (it is passed to
    # OptionParser), so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing exactly two positionals
    #       must remain (read folder, reference FASTA), otherwise the help is
    #       printed and the process exits.
    # Side effects: creates the `pairs-<aligner>` work directory, samples the
    #       first reads of each mate file into it, runs the aligner from
    #       inside it, and hands a tab-separated
    #       "original file -> suggested link name" table to
    #       write_file("f.meta", ...).
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    # Presumably made absolute so the path still resolves after the chdir
    # into the work directory below.
    ref = get_abs_path(ref)
    messages = []
    # The loop variable is deliberately not called `p` so the option parser
    # above is no longer shadowed.
    for fastqs, prefix in iter_project(folder):
        # Sample the first N reads of each of the two mate files separately.
        samplefq = []
        for i in range(2):
            fq = op.join(work, prefix + "_{0}.first.fastq".format(i + 1))
            samplefq.append(fq)
            first([str(opts.firstN), fastqs[i], "-o", fq])

        # Run the aligner inside the work directory, and always return to the
        # starting directory, even when alignment or pair parsing fails.
        os.chdir(work)
        try:
            align_args = [ref] + [op.basename(fq) for fq in samplefq]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round the median up to its first two digits, e.g. 3456 -> 3500.
        # Assumes stats.median is an integer: int(sf) would fail on a
        # decimal point -- TODO confirm.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), i + 1, suffix
            )
            messages.append("\t".join(str(x) for x in (xp, link)))

    write_file("f.meta", "\n".join(messages), tee=True)
예제 #5
0
def contamination(args):
    """
    %prog contamination Ecoli.fasta genome.fasta read.fastq

    Check read contamination on a folder of paired reads. Use bowtie2 to compare
    the reads against:
    1. Ecoli.fasta - this will tell us the lower bound of contamination
    2. genome.fasta - this will tell us the upper bound of contamination
    """
    # NOTE: the docstring above is also the CLI usage text (passed to
    # OptionParser); only the "Ecoli.fsata" typo was corrected.
    #
    # args: command-line tokens; exactly three positionals must remain after
    #       option parsing (E. coli reference, genome reference, FASTQ file),
    #       otherwise the help is printed and the process exits.
    # Side effects: runs the aligner twice, writes one tab-separated line
    #       (file, lower bound, midpoint, upper bound) to `<read.fastq>.Ecoli`
    #       and a summary line to stderr.
    from jcvi.apps.bowtie import BowtieLogFile, align

    p = OptionParser(contamination.__doc__)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ecoli, genome, fq = args
    firstN_opt = "--firstN={0}".format(opts.firstN)

    # Lower bound: the rate reported in the log of the E. coli alignment.
    _, logfile = align([ecoli, fq, firstN_opt])
    lowerbound = BowtieLogFile(logfile).rate

    # Upper bound: 100 minus the rate reported for the genome alignment.
    _, logfile = align([genome, fq, firstN_opt])
    upperbound = 100 - BowtieLogFile(logfile).rate

    # Divide by a float so the midpoint is not truncated on Python 2 when
    # both rates happen to be integers.
    median = (lowerbound + upperbound) / 2.0

    lowerbound = "{0:.1f}".format(lowerbound)
    upperbound = "{0:.1f}".format(upperbound)
    median = "{0:.1f}".format(median)

    # Explicit write() calls behave the same on Python 2 and 3, unlike the
    # `print >> stream` statement; `with` closes the report even on error.
    clogfile = fq + ".Ecoli"
    with open(clogfile, "w") as fw:
        fw.write("\t".join((fq, lowerbound, median, upperbound)) + "\n")

    sys.stderr.write(
        "{0}: Ecoli contamination rate {1}-{2}\n".format(
            fq, lowerbound, upperbound
        )
    )
예제 #6
0
def contamination(args):
    """
    %prog contamination Ecoli.fasta genome.fasta read.fastq

    Check read contamination on a folder of paired reads. Use bowtie2 to compare
    the reads against:
    1. Ecoli.fasta - this will tell us the lower bound of contamination
    2. genome.fasta - this will tell us the upper bound of contamination
    """
    # NOTE: the docstring above is also the CLI usage text (passed to
    # OptionParser); only the "Ecoli.fsata" typo was corrected.
    #
    # args: command-line tokens; exactly three positionals must remain after
    #       option parsing (E. coli reference, genome reference, FASTQ file),
    #       otherwise the help is printed and the process exits.
    # Side effects: runs the aligner twice, writes one tab-separated line
    #       (file, lower bound, midpoint, upper bound) to `<read.fastq>.Ecoli`
    #       and a summary line to stderr.
    from jcvi.apps.bowtie import BowtieLogFile, align

    p = OptionParser(contamination.__doc__)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ecoli, genome, fq = args
    firstN_opt = "--firstN={0}".format(opts.firstN)

    # Lower bound: the rate reported in the log of the E. coli alignment.
    _, logfile = align([ecoli, fq, firstN_opt])
    lowerbound = BowtieLogFile(logfile).rate

    # Upper bound: 100 minus the rate reported for the genome alignment.
    _, logfile = align([genome, fq, firstN_opt])
    upperbound = 100 - BowtieLogFile(logfile).rate

    # Divide by a float so the midpoint is not truncated on Python 2 when
    # both rates happen to be integers.
    median = (lowerbound + upperbound) / 2.0

    lowerbound = "{0:.1f}".format(lowerbound)
    upperbound = "{0:.1f}".format(upperbound)
    median = "{0:.1f}".format(median)

    # Explicit write() calls behave the same on Python 2 and 3, unlike the
    # `print >> stream` statement; `with` closes the report even on error.
    clogfile = fq + ".Ecoli"
    with open(clogfile, "w") as fw:
        fw.write("\t".join((fq, lowerbound, median, upperbound)) + "\n")

    sys.stderr.write(
        "{0}: Ecoli contamination rate {1}-{2}\n".format(
            fq, lowerbound, upperbound
        )
    )
예제 #7
0
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # NOTE: the docstring above doubles as the CLI usage text (it is passed to
    # OptionParser), so it is kept verbatim.
    #
    # args: command-line tokens; exactly two positionals must remain after
    #       option parsing (template FASTA, reads FASTQ), otherwise the help
    #       is printed and the process exits.
    # Returns: the name of the annotated FASTA file that was written.
    # Side effects: runs the aligner, `runAssembly` and BLAST, then passes the
    #       intermediate files to FileShredder.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Number of reads needed to cover template plus one read length on each
    # side at the requested depth, rounded up to the next multiple of 1000.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs), \
        "Assembly output `{0}` not found".format(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))

    # Rank of every template sequence in `bes`, so contigs can be emitted in
    # template order. setdefault keeps the first occurrence, matching what
    # list.index() returned, without a linear scan per contig.
    order = {}
    for rank, name in enumerate(Fasta(bes).iterkeys_ordered()):
        order.setdefault(name, rank)

    recs = []
    for _, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((order[subject], rid, rec))

    # Sort on (template rank, record id) only, so the record objects
    # themselves are never compared.
    recs.sort(key=lambda item: item[:2])
    recs = [item[-1] for item in recs]

    # Open the output only once the records are ready; `with` guarantees the
    # handle is closed even if writing fails.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug(
        "Annotated seqs (n={0}) written to `{1}`.".format(
            len(recs), annotatedfasta
        )
    )

    return annotatedfasta
예제 #8
0
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # NOTE: the docstring above doubles as the CLI usage text (it is passed to
    # OptionParser), so it is kept verbatim.
    #
    # args: command-line tokens; exactly two positionals must remain after
    #       option parsing (template FASTA, reads FASTQ), otherwise the help
    #       is printed and the process exits.
    # Returns: the name of the annotated FASTA file that was written.
    # Side effects: runs the aligner, `runAssembly` and BLAST, then passes the
    #       intermediate files to FileShredder.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Number of reads needed to cover template plus one read length on each
    # side at the requested depth, rounded up to the next multiple of 1000.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs), \
        "Assembly output `{0}` not found".format(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))

    # Rank of every template sequence in `bes`, so contigs can be emitted in
    # template order. setdefault keeps the first occurrence, matching what
    # list.index() returned, without a linear scan per contig.
    order = {}
    for rank, name in enumerate(Fasta(bes).iterkeys_ordered()):
        order.setdefault(name, rank)

    recs = []
    for _, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((order[subject], rid, rec))

    # Sort on (template rank, record id) only, so the record objects
    # themselves are never compared.
    recs.sort(key=lambda item: item[:2])
    recs = [item[-1] for item in recs]

    # Open the output only once the records are ready; `with` guarantees the
    # handle is closed even if writing fails.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug(
        "Annotated seqs (n={0}) written to `{1}`.".format(
            len(recs), annotatedfasta
        )
    )

    return annotatedfasta