Пример #1
0
Файл: ca.py Проект: zjwang6/jcvi
def merger(args):
    """
    %prog merger layout gkpStore contigs.fasta

    Merge reads into one contig.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    p = OptionParser(merger.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    layout, gkpstore, contigs = args
    pf = "0"  # fixed prefix reused for every intermediate file and directory
    iidfile = pf + ".iids"

    # Context managers make sure the layout file and the per-row iid file are
    # closed even when one of the external commands below fails.
    with open(layout) as fp:
        for i, row in enumerate(fp):
            logging.debug("Read unitig {0}".format(i))

            # Each layout row is a '|'-separated list; write one field per
            # line. The row keeps its trailing newline, exactly as before.
            iids = row.split("|")
            with open(iidfile, "w") as fw:
                print("\n".join(iids), file=fw)

            # Dump the reads listed in the iid file out of the gatekeeper store.
            cmd = "gatekeeper -iid {0}.iids -dumpfasta {0} {1}".format(
                pf, gkpstore)
            sh(cmd)

            # Rename the dumped records sequentially, then hand the renamed
            # FASTA to fasta().
            fastafile = "{0}.fasta".format(pf)
            newfastafile = "{0}.new.fasta".format(pf)
            format([
                fastafile,
                newfastafile,
                "--sequential=replace",
                "--sequentialoffset=1",
                "--nodesc",
            ])
            fasta([newfastafile])

            # Remove the output directory of the previous iteration before
            # running the assembler again under the same prefix.
            sh("rm -rf {0}".format(pf))
            cmd = "runCA {0}.frg -p {0} -d {0} consensus=pbutgcns".format(pf)
            cmd += " unitigger=bogart doFragmentCorrection=0 doUnitigSplitting=0"
            sh(cmd)
            outdir = "{0}/9-terminator".format(pf)

            # Append this row's contigs, degenerates and singletons to the
            # combined output file.
            cmd = "cat {0}/{1}.ctg.fasta {0}/{1}.deg.fasta {0}/{1}.singleton.fasta".format(
                outdir, pf)
            sh(cmd, outfile=contigs, append=True)
Пример #2
0
def build(args):
    """
    %prog build current.fasta Bacteria_Virus.fasta prefix

    Build assembly files after a set of clean-ups:
    1. Use cdhit (100%) to remove duplicate scaffolds
    2. Screen against the bacteria and virus database (remove scaffolds 95% id, 50% cov)
    3. Mask matches to UniVec_Core
    4. Sort by decreasing scaffold sizes
    5. Rename the scaffolds sequentially
    6. Build the contigs by splitting scaffolds at gaps
    7. Rename the contigs sequentially
    """
    from jcvi.apps.cdhit import deduplicate
    from jcvi.apps.vecscreen import mask
    from jcvi.formats.fasta import sort

    # The docstring above is passed to OptionParser as the usage text.
    parser = OptionParser(build.__doc__)
    parser.add_option(
        "--nodedup",
        action="store_true",
        default=False,
        help="Do not deduplicate [default: deduplicate]",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    fastafile, bacteria, pf = args

    # Step 1: deduplicate unless --nodedup was given, in which case the input
    # FASTA is used as-is.
    if opts.nodedup:
        deduped = fastafile
    else:
        deduped = deduplicate([fastafile, "--pctid=100"])

    # Steps 2-4: every helper takes an argv-style list; its return value is
    # fed to the next stage.
    screened = screen([deduped, bacteria])
    masked = mask([screened])
    by_size = sort([masked, "--sizes"])

    # Step 5: rename scaffolds sequentially.
    scaffolds = pf + ".assembly.fasta"
    format([by_size, scaffolds, "--prefix=scaffold_", "--sequential"])

    # Step 6: split scaffolds at gaps with the external gapSplit tool.
    gapsplit = pf + ".gapSplit.fasta"
    sh("gapSplit -minGap=10 {0} {1}".format(scaffolds, gapsplit))

    # Step 7: rename contigs sequentially.
    contigs = pf + ".contigs.fasta"
    format([gapsplit, contigs, "--prefix=contig_", "--sequential"])
Пример #3
0
def merger(args):
    """
    %prog merger layout gkpStore contigs.fasta

    Merge reads into one contig.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    p = OptionParser(merger.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    layout, gkpstore, contigs = args
    pf = "0"  # fixed prefix reused for every intermediate file and directory
    iidfile = pf + ".iids"

    # Context managers make sure the layout file and the per-row iid file are
    # closed even when one of the external commands below fails.
    with open(layout) as fp:
        for i, row in enumerate(fp):
            logging.debug("Read unitig {0}".format(i))

            # Each layout row is a '|'-separated list; write one field per
            # line. fw.write() replaces the Python 2 only `print >> fw`
            # statement (a TypeError at runtime on Python 3) and emits the
            # same bytes: the joined fields followed by a single newline.
            iids = row.split("|")
            with open(iidfile, "w") as fw:
                fw.write("\n".join(iids) + "\n")

            # Dump the reads listed in the iid file out of the gatekeeper store.
            cmd = "gatekeeper -iid {0}.iids -dumpfasta {0} {1}".format(pf, gkpstore)
            sh(cmd)

            # Rename the dumped records sequentially, then hand the renamed
            # FASTA to fasta().
            fastafile = "{0}.fasta".format(pf)
            newfastafile = "{0}.new.fasta".format(pf)
            format([fastafile, newfastafile, "--sequential=replace",
                    "--sequentialoffset=1", "--nodesc"])
            fasta([newfastafile])

            # Remove the output directory of the previous iteration before
            # running the assembler again under the same prefix.
            sh("rm -rf {0}".format(pf))
            cmd = "runCA {0}.frg -p {0} -d {0} consensus=pbutgcns".format(pf)
            cmd += " unitigger=bogart doFragmentCorrection=0 doUnitigSplitting=0"
            sh(cmd)
            outdir = "{0}/9-terminator".format(pf)

            # Append this row's contigs, degenerates and singletons to the
            # combined output file.
            cmd = "cat {0}/{1}.ctg.fasta {0}/{1}.deg.fasta {0}/{1}.singleton.fasta"\
                    .format(outdir, pf)
            sh(cmd, outfile=contigs, append=True)
Пример #4
0
def build(args):
    """
    %prog build current.fasta Bacteria_Virus.fasta prefix

    Build assembly files after a set of clean-ups:
    1. Use cdhit (100%) to remove duplicate scaffolds
    2. Screen against the bacteria and virus database (remove scaffolds 95% id, 50% cov)
    3. Mask matches to UniVec_Core
    4. Sort by decreasing scaffold sizes
    5. Rename the scaffolds sequentially
    6. Build the contigs by splitting scaffolds at gaps
    7. Rename the contigs sequentially
    """
    from jcvi.apps.cdhit import deduplicate
    from jcvi.apps.vecscreen import mask
    from jcvi.formats.fasta import sort

    # The docstring above doubles as the usage text shown by the parser.
    optparser = OptionParser(build.__doc__)
    optparser.add_option("--nodedup", action="store_true", default=False,
                         help="Do not deduplicate [default: deduplicate]")
    opts, args = optparser.parse_args(args)

    if len(args) != 3:
        sys.exit(not optparser.print_help())

    fastafile, bacteria, pf = args

    # Names of the three files produced under the requested prefix.
    scaffoldfasta = pf + ".assembly.fasta"
    gapsplitfasta = pf + ".gapSplit.fasta"
    contigsfasta = pf + ".contigs.fasta"

    # Clean-up chain: (optional) dedup -> screen -> mask -> sort by size.
    # deduplicate() is only invoked when --nodedup is absent.
    source = fastafile if opts.nodedup \
        else deduplicate([fastafile, "--pctid=100"])
    sortedfasta = sort([mask([screen([source, bacteria])]), "--sizes"])

    # Sequentially renamed scaffolds.
    format([sortedfasta, scaffoldfasta, "--prefix=scaffold_", "--sequential"])

    # Contigs are obtained by splitting the scaffolds at gaps, then renamed.
    sh("gapSplit -minGap=10 {0} {1}".format(scaffoldfasta, gapsplitfasta))
    format([gapsplitfasta, contigsfasta, "--prefix=contig_", "--sequential"])
Пример #5
0
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # NOTE: the docstring above is the usage text given to OptionParser and is
    # therefore kept verbatim.
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"

    # The single record id becomes the prefix of the final sequence names.
    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    # From here on `ctgfasta` refers to whatever run_gapsplit() returns.
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)

        # Unique subject ids (second BLAST column) select the pool sequences.
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile,
                                                 idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile,
                                       idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 is run from inside the closure directory; always restore
        # the working directory, even if the command helper raises.
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        cwd = os.getcwd()
        os.chdir(closuredir)
        try:
            sh(cmd)
        finally:
            os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    excludecontigs = set()
    for rec in ContigFile(minimuscontig).iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    # The trailing underscore marks a temporary file; the renamed final copy
    # is produced by format() further down.
    finalfasta = prefix + ".improved.fasta_"
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    singletonfile = op.join(closuredir, prefix + ".singletons")

    # `with` guarantees both handles are closed (the singleton file used to be
    # left open) and that the output is flushed before format() reads it.
    with open(finalfasta, "w") as fw:
        for name, rec in Fasta(minimusfasta).iteritems_ordered():
            if name in excludecontigs:
                continue
            SeqIO.write([rec], fw, "fasta")

        with open(singletonfile) as fs:
            singletons = set(x.strip() for x in fs)
        leftovers = singletons & originalIDs

        logging.debug("Pull leftover singletons: {0}".\
                format(", ".join(sorted(leftovers))))

        for name, rec in Fasta(ctgfasta).iteritems_ordered():
            if name not in leftovers:
                continue
            SeqIO.write([rec], fw, "fasta")

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    # Remove intermediate files that are no longer needed.
    errlog = "error.log"
    for tmpfile in (fastafile, blastfile, errlog):
        if op.exists(tmpfile):
            os.remove(tmpfile)
Пример #6
0
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    # NOTE: the docstring above is the usage text given to OptionParser and is
    # therefore kept verbatim.
    p = OptionParser(novo.__doc__)
    p.add_option("--technology", choices=("illumina", "454", "iontorrent"),
                 default="iontorrent", help="Sequencing platform")
    p.add_option("--dedup", choices=("uclust", "cdhit"),
                 default="cdhit", help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit", default="/usr/local/bin/")
    p.set_home("fiona", default="/usr/local/bin/")
    p.set_home("jellyfish", default="/usr/local/bin/")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    # Split "<prefix>.<suffix>"; the prefix names every downstream file.
    pf, sf = fastqfile.rsplit(".", 1)

    # Every stage below is guarded by need_update(input, output), presumably
    # so a stage is skipped when its output is already current (helper is
    # defined outside this block).

    # Stage 1: digital normalization, then link the result to a stable name.
    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # Stage 2: k-mer histogram, used to obtain the genome size.
    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([diginormfile, "--prefix={0}".format(pf),
                    "--cpus={0}".format(cpus),
                    "--jellyfish_home={0}".format(opts.jellyfish_home)])

    genomesize = histogram([jf, pf, "23"])

    # Stage 3: run fiona; stdout and stderr both go to the same log file.
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
                    format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    # Stage 4: cluster reads into consensus sequences with the chosen tool.
    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([fiona, "--consensus", "--reads",
                         "--pctid={0}".format(pctid),
                         "--cdhit_home={0}".format(opts.cdhit_home)])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    # Stage 5: filter clusters by size, then filter the FASTA.
    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        # Floor division keeps the value integral on Python 3 as well; plain
        # `/` would render e.g. "--minsize=10.0" there. For integer depths the
        # result is identical to the Python 2 behaviour.
        cdhit_filter([cons, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth // 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    # Stage 6: rename the surviving sequences sequentially under the prefix.
    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([filteredfile, finalfile, "--sequential=replace",
                    "--prefix={0}_".format(pf)])
Пример #7
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    # NOTE: the docstring above is the usage text given to OptionParser and is
    # therefore kept verbatim.
    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Build the `source` line unconditionally: it is also the first line of
    # run.sh below. It used to be assigned only inside the `if`, which raised
    # NameError whenever fakeQuals.py was already on PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        # Attribute access is part of the check; the value itself is unused.
        networkx.version
    except Exception:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    # Reads whose extension is not one of the listed ones go through
    # fake_quals() and are copied together with the returned companion file.
    if px not in ("fq", "fastq", "txt"):
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    # The protocol refers to the copies under data/ by absolute path and uses
    # the current directory as output directory.
    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    protocol = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    protocol.write_xml()

    # Build the pipeline: one shell line per Jelly.py stage.
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    # The assembly stage's generated chunk script is run through `parallel`.
    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
Пример #8
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    # The docstring above is passed to OptionParser as the usage text.
    parser = OptionParser(patch.__doc__)
    parser.add_option(
        "--cleanfasta", action="store_true", default=False,
        help="Clean FASTA to remove description [default: %default]")
    parser.add_option(
        "--highqual", action="store_true", default=False,
        help="Reads are of high quality [default: %default]")
    parser.set_home("pbjelly")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    ref, reads = args

    # The `source <pbjelly_home>/setup.sh` line is run right away when
    # fakeQuals.py is not on PATH, and is always the first line of run.sh.
    setup = "source {0}".format(op.join(opts.pbjelly_home, "setup.sh"))
    if not which("fakeQuals.py"):
        sh(setup)

    ref_stem = ref.rsplit(".", 1)[0]
    reads_stem, reads_ext = reads.rsplit(".", 1)

    # Remove description line
    if opts.cleanfasta:
        clean_ref = ref_stem + ".f.fasta"
        clean_reads = reads_stem + ".f.fasta"
        format([ref, clean_ref])
        format([reads, clean_reads])
        ref, reads = clean_ref, clean_reads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    if reads_ext in ("fq", "fastq", "txt"):
        # Listed extensions are copied as a single file.
        readsfiles = reads
    else:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))

    # Make directory structure; each half is skipped when its target file
    # already exists.
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    # The current directory doubles as the protocol's output directory.
    protocol = Protocol(cwd, reference, reads, highqual=opts.highqual)
    protocol.write_xml()

    # Build the pipeline: one shell line per Jelly.py stage.
    runsh = [setup]
    runsh.extend(
        "Jelly.py {0} Protocol.xml".format(action)
        for action in "setup|mapping|support|extraction".split("|")
    )
    runsh.append(
        'Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(opts.cpus))
    runsh.append("Jelly.py output Protocol.xml")

    write_file("run.sh", "\n".join(runsh))
Пример #9
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    # The docstring above doubles as the usage text shown by the parser.
    optparser = OptionParser(patch.__doc__)
    optparser.add_option(
        "--cleanfasta",
        action="store_true",
        default=False,
        help="Clean FASTA to remove description [default: %default]",
    )
    optparser.add_option(
        "--highqual",
        action="store_true",
        default=False,
        help="Reads are of high quality [default: %default]",
    )
    optparser.set_home("pbjelly")
    optparser.set_cpus()
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    ref, reads = args
    cpus = opts.cpus

    # `setup` is executed immediately only when fakeQuals.py cannot be found
    # on PATH; it is written as the first line of run.sh regardless.
    setup_script = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(setup_script)
    if not which("fakeQuals.py"):
        sh(setup)

    refprefix = ref.rsplit(".", 1)[0]
    readsprefix, readsext = reads.rsplit(".", 1)

    # Remove description line
    if opts.cleanfasta:
        cleaned = (refprefix + ".f.fasta", readsprefix + ".f.fasta")
        format([ref, cleaned[0]])
        format([reads, cleaned[1]])
        ref, reads = cleaned

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    needs_quals = readsext not in ("fq", "fastq", "txt")
    if needs_quals:
        reads, readsq = fake_quals(reads)
    readsfiles = " ".join((reads, readsq)) if needs_quals else reads

    # Make directory structure; copying is skipped for a target that is
    # already present.
    dref = "data/reference"
    dreads = "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))

    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))

    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    # Output goes to the current working directory.
    outputDir = cwd
    protocol = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    protocol.write_xml()

    # Build the pipeline: setup line, the four preparatory stages, assembly
    # (with the requested number of processes) and output.
    stages = "setup|mapping|support|extraction".split("|")
    runsh = [setup] + ["Jelly.py {0} Protocol.xml".format(x) for x in stages]
    runsh += [
        'Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus),
        "Jelly.py output Protocol.xml",
    ]

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh))
Пример #10
0
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # NOTE: the docstring above is the usage text given to OptionParser and is
    # therefore kept verbatim.
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"

    # The single record id becomes the prefix of the final sequence names.
    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    # From here on `ctgfasta` refers to whatever run_gapsplit() returns.
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)

        # Unique subject ids (second BLAST column) select the pool sequences.
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 is run from inside the closure directory; always restore
        # the working directory, even if the command helper raises.
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        cwd = os.getcwd()
        os.chdir(closuredir)
        try:
            sh(cmd)
        finally:
            os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    excludecontigs = set()
    for rec in ContigFile(minimuscontig).iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    # The trailing underscore marks a temporary file; the renamed final copy
    # is produced by format() further down.
    finalfasta = prefix + ".improved.fasta_"
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    singletonfile = op.join(closuredir, prefix + ".singletons")

    # `with` guarantees both handles are closed (the singleton file used to be
    # left open) and that the output is flushed before format() reads it.
    with open(finalfasta, "w") as fw:
        for name, rec in Fasta(minimusfasta).iteritems_ordered():
            if name in excludecontigs:
                continue
            SeqIO.write([rec], fw, "fasta")

        with open(singletonfile) as fs:
            singletons = set(x.strip() for x in fs)
        leftovers = singletons & originalIDs

        logging.debug("Pull leftover singletons: {0}".\
                format(", ".join(sorted(leftovers))))

        for name, rec in Fasta(ctgfasta).iteritems_ordered():
            if name not in leftovers:
                continue
            SeqIO.write([rec], fw, "fasta")

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    # Remove intermediate files that are no longer needed.
    errlog = "error.log"
    for tmpfile in (fastafile, blastfile, errlog):
        if op.exists(tmpfile):
            os.remove(tmpfile)
Пример #11
0
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    # NOTE: the docstring above is the usage text given to OptionParser and is
    # therefore kept verbatim.
    p = OptionParser(novo.__doc__)
    p.add_option("--technology",
                 choices=("illumina", "454", "iontorrent"),
                 default="iontorrent",
                 help="Sequencing platform")
    p.add_option("--dedup",
                 choices=("uclust", "cdhit"),
                 default="cdhit",
                 help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit")
    p.set_home("fiona")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    # Split "<prefix>.<suffix>"; the prefix names every downstream file.
    pf, sf = fastqfile.rsplit(".", 1)

    # Every stage below is guarded by need_update(input, output), presumably
    # so a stage is skipped when its output is already current (helper is
    # defined outside this block).

    # Stage 1: digital normalization, then link the result to a stable name.
    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # Stage 2: k-mer histogram, used to obtain the genome size.
    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([
            diginormfile, "--prefix={0}".format(pf), "--cpus={0}".format(cpus)
        ])

    genomesize = histogram([jf, pf, "23"])

    # Stage 3: run fiona; stdout and stderr both go to the same log file.
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "bin/fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
                    format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    # Stage 4: cluster reads into consensus sequences with the chosen tool.
    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([
                fiona, "--consensus", "--reads", "--pctid={0}".format(pctid),
                "--cdhit_home={0}".format(opts.cdhit_home)
            ])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    # Stage 5: filter clusters by size, then filter the FASTA.
    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        # Floor division keeps the value integral on Python 3 as well; plain
        # `/` would render e.g. "--minsize=10.0" there. For integer depths the
        # result is identical to the Python 2 behaviour.
        cdhit_filter([
            cons, "--outfile={0}".format(covfile),
            "--minsize={0}".format(depth // 5)
        ])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    # Stage 6: rename the surviving sequences sequentially under the prefix.
    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([
            filteredfile, finalfile, "--sequential=replace",
            "--prefix={0}_".format(pf)
        ])
Пример #12
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    # NOTE: the docstring above is the usage text given to OptionParser and is
    # therefore kept verbatim.
    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Build the `source` line unconditionally: it is also the first line of
    # run.sh below. It used to be assigned only inside the `if`, which raised
    # NameError whenever fakeQuals.py was already on PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        # Attribute access is part of the check; the value itself is unused.
        networkx.version
    except Exception:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    # Reads whose extension is not one of the listed ones go through
    # fake_quals() and are copied together with the returned companion file.
    if px not in ("fq", "fastq", "txt"):
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    # The protocol refers to the copies under data/ by absolute path and uses
    # the current directory as output directory.
    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    protocol = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    protocol.write_xml()

    # Build the pipeline: one shell line per Jelly.py stage.
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    # The assembly stage's generated chunk script is run through `parallel`.
    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")