示例#1
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    parser = OptionParser(alignextend.__doc__)
    parser.add_option("--nosuffix", default=False, action="store_true",
                      help="Do not add /1/2 suffix to the read [default: %default]")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Exactly three positionals are required: reference plus two read files.
    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    # Prefix is the first read file's basename up to its first dot.
    prefix = op.basename(r1).split(".")[0]

    # Collect the command-line tokens and join them once at the end.
    pieces = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        pieces.append("-suffix")
    # Pass -noindex when `<prefix>.ref.fa.sa` is already up to date with ref.
    if not need_update(ref, "{0}.ref.fa.sa".format(prefix)):
        pieces.append("-noindex")
    pieces.append("-threads {0}".format(opts.cpus))
    # A guessed quality offset of 64 adds the -I switch.
    if guessoffset([r1]) == 64:
        pieces.append("-I")
    pieces.extend((prefix, ref, r1, r2))
    sh(" ".join(pieces))
示例#2
0
文件: gmap.py 项目: fw1121/jcvi
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    parser = OptionParser(align.__doc__)
    parser.add_option("--join", default=False, action="store_true",
                      help="Join sequences with padded 50Ns")
    parser.add_option("--rnaseq", default=False, action="store_true",
                      help="Input is RNA-seq reads, turn splicing on")
    parser.add_option("--snp", default=False, action="store_true",
                      help="Call SNPs after GSNAP")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Two positionals -> single-end, three -> paired-end, anything else -> usage.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    if nargs == 2:
        logging.debug("Single-end alignment")
    else:
        logging.debug("Paired-end alignment")

    dbfile, readfile = args[0:2]
    if opts.join:
        # join() returns the name of the file that is used as database below.
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = prefix + ".log"
    gsnapfile = prefix + ".gsnap"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        pieces = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            "-B 5 -m 0.1 -i 2 -n 3",  # memory, mismatch, indel penalty, nhits
        ]
        if opts.rnaseq:
            pieces.append("-N 1")
        pieces.append("-t {0}".format(opts.cpus))
        pieces.append("--gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            pieces.append("--gunzip")
        try:
            # Offset 33 maps to "sanger", everything else to "illumina".
            protocol = "illumina"
            if guessoffset([readfile]) == 33:
                protocol = "sanger"
            pieces.append("--quality-protocol {0}".format(protocol))
        except AssertionError:
            # Best effort: when the offset guess asserts, omit the flag.
            pass
        # All read files (one or two) go last on the command line.
        pieces.extend(args[1:])
        sh(" ".join(pieces), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
示例#3
0
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring above (plus FastqNamings) is the parser's usage text.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    parser = OptionParser(correct.__doc__ + FastqNamings)
    parser.add_option(
        "--nofragsdedup", default=False, action="store_true",
        help="Don't deduplicate the fragment reads [default: %default]")
    parser.add_option(
        "--cpus", default=32, type="int",
        help="Number of threads to run [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fastq = args
    frag_tag = "frag_reads"
    jump_tag = "jump_reads"

    # Generate the ALLPATHS input descriptions without writing the run script.
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    # Quality offset is guessed from the first input file only.
    phred64 = guessoffset([args[0]]) == 64

    frag_fastb = "{0}/{1}_orig.fastb".format(datadir, frag_tag)
    if need_update(fastq, frag_fastb):
        cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".format(
            fullpath, opts.cpus)
        if phred64:
            cmd += " PHRED_64=True"
        sh(cmd)

    # Fragment reads are corrected only when their fastb file is present.
    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=not opts.nofragsdedup)

    # Jump reads likewise.
    jump_fastb = "{0}/{1}_orig.fastb".format(datadir, jump_tag)
    if op.exists(jump_fastb):
        correct_jump(datadir, jump_tag, jump_fastb, nthreads)
示例#4
0
文件: ca.py 项目: zjwang6/jcvi
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads?",
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() yields None, so `not ...` makes the usage error exit
        # with a non-zero status, like the other commands in this code base.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug(
            "[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size means the library is treated as mated.
    mated = size != 0
    # Library name: first file's basename up to the first dot, with the
    # "_1_sequence" marker removed.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    # Unmated default: one `-reads` switch per input file.
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (
            1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # An explicit --phred wins; otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    # The command's stdout is captured as the .frg file.
    sh(cmd, outfile=frgfile)
示例#5
0
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(tophat.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.add_option("--single", default=False, action="store_true",
                      help="Single end mapping")
    parser.add_option("--intron", default=15000, type="int",
                      help="Max intron size [default: %default]")
    parser.add_option("--dist", default=-50, type="int",
                      help="Mate inner distance [default: %default]")
    parser.add_option("--stdev", default=50, type="int",
                      help="Mate standard deviation [default: %default]")
    parser.set_phred()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    paired = not opts.single
    folder, reference = args
    reference = check_index(reference)

    # iter_project yields (read files, prefix) for each sample in the folder.
    for reads, prefix in iter_project(folder, n=2 if paired else 1):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        pieces = ["tophat -p {0}".format(opts.cpus)]
        if opts.gtf:
            pieces.append("-G {0}".format(opts.gtf))
        pieces.append("-o {0}".format(outdir))

        if paired:
            # Unpacking also enforces that exactly two files were yielded.
            first, _ = reads
            pieces.append("--max-intron-length {0}".format(opts.intron))
            pieces.append("--mate-inner-dist {0}".format(opts.dist))
            pieces.append("--mate-std-dev {0}".format(opts.stdev))
        else:
            first, = reads

        # An explicit --phred wins; otherwise guess from the first read file.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            pieces.append("--phred64-quals")
        pieces.append("{0} {1}".format(reference, " ".join(reads)))

        sh(" ".join(pieces))
示例#6
0
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring above (plus FastqNamings) is the parser's usage text.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    optparser = OptionParser(correct.__doc__ + FastqNamings)
    optparser.add_option("--nofragsdedup", default=False, action="store_true",
                         help="Don't deduplicate the fragment reads [default: %default]")
    optparser.add_option("--cpus", default=32, type="int",
                         help="Number of threads to run [default: %default]")
    opts, args = optparser.parse_args(args)

    if len(args) < 1:
        sys.exit(not optparser.print_help())

    fastq = args

    # Write the ALLPATHS input descriptions; --norun skips the run script.
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    # Only the first input file is inspected for the quality offset.
    phred64 = guessoffset([args[0]]) == 64

    def orig_fastb(tag):
        # Location of the `<tag>_orig.fastb` file inside the data directory.
        return datadir + "/{0}_orig".format(tag) + ".fastb"

    tag, tagj = "frag_reads", "jump_reads"

    origfastb = orig_fastb(tag)
    if need_update(fastq, origfastb):
        prep_cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".format(
            fullpath, opts.cpus)
        if phred64:
            prep_cmd += " PHRED_64=True"
        sh(prep_cmd)

    # Each correction step runs only when its fastb input exists.
    if op.exists(origfastb):
        correct_frag(datadir, tag, origfastb, nthreads,
                     dedup=not opts.nofragsdedup)

    origjfastb = orig_fastb(tagj)
    if op.exists(origjfastb):
        correct_jump(datadir, tagj, origjfastb, nthreads)
示例#7
0
文件: ca.py 项目: arvin580/jcvi
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie", dest="outtie", default=False, action="store_true", help="Are these outie reads? [default: %default]"
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() yields None, so `not ...` turns the usage error into a
        # non-zero exit status, matching the other commands in this code base.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size means the library is treated as mated.
    mated = size != 0
    # Library name: first file's basename up to the first dot, with the
    # "_1_sequence" marker removed.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    # Unmated default: one `-reads` switch per input file.
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # An explicit --phred wins; otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    # The command's stdout is captured as the .frg file.
    sh(cmd, outfile=frgfile)
示例#8
0
def check_aln(dbfile, readfile, cpus=32):
    """Run `bwa aln` for `readfile` against `dbfile` unless the .sai is current.

    Returns the .sai file name (read file name with its last extension
    replaced by `.sai`), whether or not the alignment was run.
    """
    from jcvi.formats.fastq import guessoffset

    stem = readfile.rsplit(".", 1)[0]
    saifile = stem + ".sai"

    # Guard clause: nothing to do when the output is already up to date.
    if not need_update((dbfile, readfile), saifile):
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))
        return saifile

    # A guessed quality offset of 64 adds the -I switch.
    illumina_flag = " -I" if guessoffset([readfile]) == 64 else ""
    cmd = "bwa aln {0} {1} -t {2}{3}".format(dbfile, readfile, cpus,
                                              illumina_flag)
    sh(cmd, outfile=saifile)
    return saifile
示例#9
0
文件: bwa.py 项目: rrane/jcvi
def check_aln(dbfile, readfile, cpus=32):
    """Produce the `bwa aln` .sai file for `readfile`, skipping if up to date.

    The .sai name is `readfile` with its final extension swapped for `.sai`;
    that name is returned in every case.
    """
    from jcvi.formats.fastq import guessoffset

    saifile = "{0}.sai".format(readfile.rsplit(".", 1)[0])
    stale = need_update((dbfile, readfile), saifile)

    if stale:
        tokens = ["bwa aln", dbfile, readfile, "-t {0}".format(cpus)]
        # Offset 64 reads get the -I switch.
        if guessoffset([readfile]) == 64:
            tokens.append("-I")
        sh(" ".join(tokens), outfile=saifile)
    else:
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))

    return saifile
示例#10
0
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    phdchoices = ("33", "64")
    p.add_option("--outtie", dest="outtie", default=False, action="store_true",
            help="Are these outie reads? [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
            help="Phred score offset {0} [default: guess]".format(phdchoices))
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() yields None, so `not ...` turns the usage error into a
        # non-zero exit status, matching the other commands in this code base.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]

    # A non-zero insert size means the library is treated as mated.
    mated = (opts.size != 0)
    outtie = opts.outtie
    # Library name: first file's basename up to the first dot, with the
    # "_1_sequence" marker removed.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = CAPATH("fastqToCA")
    cmd += " -libraryname {0} ".format(libname)
    # Unmated default: one `-reads` switch per input file.
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # An explicit --phred wins; otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = (offset == 64)
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    # The command's stdout is captured as the .frg file.
    sh(cmd, outfile=frgfile)
示例#11
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    stages = "prepare,align,filter,rmdup,genreads".split(",")
    parser = OptionParser(alignextend.__doc__)
    parser.add_option("--nosuffix", default=False, action="store_true",
                      help="Do not add /1/2 suffix to the read [default: %default]")
    parser.add_option("--rc", default=False, action="store_true",
                      help="Reverse complement the reads before alignment")
    parser.add_option("--len", default=100, type="int",
                      help="Extend to this length")
    parser.add_option("--stage", default="prepare", choices=stages,
                      help="Start from certain stage")
    parser.add_option("--dup", default=10, type="int",
                      help="Filter duplicates with coordinates within this distance")
    parser.add_option("--maxdiff", default=1, type="int",
                      help="Maximum number of differences")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Exactly three positionals are required: reference plus two read files.
    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    # Prefix is the first read file's basename up to its first dot.
    prefix = op.basename(r1).split(".")[0]

    # Collect the command-line tokens and join them once at the end.
    pieces = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        pieces.append("-suffix")
    # Pass -noindex when `<prefix>.ref.fa.sa` is already up to date with ref.
    if not need_update(ref, "{0}.ref.fa.sa".format(prefix)):
        pieces.append("-noindex")
    pieces.append("-threads {0}".format(opts.cpus))
    # A guessed quality offset of 64 adds the -I switch.
    if guessoffset([r1]) == 64:
        pieces.append("-I")
    if opts.rc:
        pieces.append("-rc")
    pieces.append("-allow -len {0} -dup {1}".format(opts.len, opts.dup))
    # The -min/-max bounds are 2x and 20x the extension length.
    pieces.append("-min {0} -max {1}".format(2 * opts.len, 20 * opts.len))
    pieces.append("-maxdiff {0}".format(opts.maxdiff))
    pieces.append("-stage {0}".format(opts.stage))
    pieces.extend((prefix, ref, r1, r2))
    sh(" ".join(pieces))
示例#12
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    stage_names = "prepare,align,filter,rmdup,genreads".split(",")
    op_parser = OptionParser(alignextend.__doc__)
    op_parser.add_option(
        "--nosuffix", default=False, action="store_true",
        help="Do not add /1/2 suffix to the read [default: %default]")
    op_parser.add_option(
        "--rc", default=False, action="store_true",
        help="Reverse complement the reads before alignment")
    op_parser.add_option(
        "--len", default=100, type="int",
        help="Extend to this length")
    op_parser.add_option(
        "--stage", default="prepare", choices=stage_names,
        help="Start from certain stage")
    op_parser.add_option(
        "--dup", default=10, type="int",
        help="Filter duplicates with coordinates within this distance")
    op_parser.add_option(
        "--maxdiff", default=1, type="int",
        help="Maximum number of differences")
    op_parser.set_home("amos")
    op_parser.set_cpus()
    opts, args = op_parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not op_parser.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]
    script = op.join(opts.amos_home, "src/Experimental/alignextend.pl")

    # Boolean switches first, in the order the wrapper has always emitted them.
    switches = []
    if not opts.nosuffix:
        switches.append("-suffix")
    # -noindex is passed when `<pf>.ref.fa.sa` is already current.
    if not need_update(ref, "{0}.ref.fa.sa".format(pf)):
        switches.append("-noindex")
    switches.append("-threads {0}".format(opts.cpus))
    # Offset-64 reads get -I.
    if guessoffset([r1]) == 64:
        switches.append("-I")
    if opts.rc:
        switches.append("-rc")

    # Valued options; -min/-max are 2x and 20x the extension length.
    valued = [
        "-allow -len {0} -dup {1}".format(opts.len, opts.dup),
        "-min {0} -max {1}".format(2 * opts.len, 20 * opts.len),
        "-maxdiff {0}".format(opts.maxdiff),
        "-stage {0}".format(opts.stage),
    ]

    sh(" ".join([script] + switches + valued + [pf, ref, r1, r2]))
示例#13
0
文件: bwa.py 项目: linlifeng/jcvi
def check_aln(dbfile, readfile, grid=False, cpus=32):
    """Run `bwa aln` for `readfile` against `dbfile` unless the .sai exists.

    `grid` is forwarded to sh(). Returns the .sai file name (read file name
    with its last extension replaced by `.sai`) in every case.
    """
    from jcvi.formats.fastq import guessoffset

    saifile = readfile.rsplit(".", 1)[0] + ".sai"

    # Guard clause: an existing output file means the step is skipped.
    if op.exists(saifile):
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))
        return saifile

    tokens = ["bwa aln -t {0}".format(cpus)]
    # A guessed quality offset of 64 adds the -I switch.
    if guessoffset([readfile]) == 64:
        tokens.append("-I")
    tokens.append("{0} {1}".format(dbfile, readfile))

    sh(" ".join(tokens), grid=grid, outfile=saifile)
    return saifile
示例#14
0
def check_aln(dbfile, readfile, grid=False, cpus=32):
    """Create the `bwa aln` .sai file for `readfile` if it does not exist yet.

    The .sai name is `readfile` with its final extension swapped for `.sai`;
    it is returned whether or not the alignment ran. `grid` goes to sh().
    """
    from jcvi.formats.fastq import guessoffset

    saifile = "{0}.sai".format(readfile.rsplit(".", 1)[0])

    if not op.exists(saifile):
        # Offset-64 reads get the -I switch between the thread count and files.
        quality_flag = " -I" if guessoffset([readfile]) == 64 else ""
        cmd = "bwa aln -t {0}{1} {2} {3}".format(
            cpus, quality_flag, dbfile, readfile)
        sh(cmd, grid=grid, outfile=saifile)
    else:
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))

    return saifile
示例#15
0
def clean(args):
    """
    %prog clean 1.fastq 2.fastq [insertsize]

    Clean and dedup paired FASTQ files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    parser = OptionParser(clean.__doc__)
    parser.add_option("-a", default=0, type="int",
                      help="Trim length at 5' end [default: %default]")
    parser.add_option("-b", default=50, type="int",
                      help="Trim length at 3' end [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())

    p1, p2 = args[:2]
    # Insert size: explicit third argument, else derived from the first file.
    size = int(args[2]) if nargs == 3 else get_size(p1)

    pf = p1.split(".")[0]
    cpus = opts.cpus
    offset = guessoffset([p1])
    trim5, trim3 = opts.a, opts.b

    p1_clean = p1 + ".clean"
    p2_clean = p2 + ".clean"
    targets = [p1_clean + ".gz", p2_clean + ".gz"]

    # Nothing to do when both gzipped outputs are up to date.
    if not need_update([p1, p2], targets):
        return

    pieces = [
        "SOAPfilter_v2.0 -t {0} -m 2000000 -p -y -z -g".format(cpus),
        "-q {0} -w 10 -B 50 -f 0".format(offset),
        # The same trim lengths are applied to both reads (-a/-b and -c/-d).
        "-l {0} -a {1} -b {2} -c {1} -d {2}".format(size, trim5, trim3),
        "{0} {1} {2}.clean.stat {3} {4}".format(p1, p2, pf, p1_clean, p2_clean),
    ]
    sh(" ".join(pieces))
示例#16
0
文件: soap.py 项目: tanghaibao/jcvi
def clean(args):
    """
    %prog clean 1.fastq 2.fastq [insertsize]

    Clean and dedup paired FASTQ files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    optparser = OptionParser(clean.__doc__)
    optparser.add_option(
        "-a", default=0, type="int",
        help="Trim length at 5' end [default: %default]")
    optparser.add_option(
        "-b", default=50, type="int",
        help="Trim length at 3' end [default: %default]")
    optparser.set_cpus()
    opts, args = optparser.parse_args(args)

    # Two files (size derived from the first) or two files plus a size.
    if len(args) == 3:
        p1, p2, raw_size = args
        size = int(raw_size)
    elif len(args) == 2:
        p1, p2 = args
        size = get_size(p1)
    else:
        sys.exit(not optparser.print_help())

    stat_prefix = p1.split(".")[0]
    offset = guessoffset([p1])

    cleaned = [fq + ".clean" for fq in (p1, p2)]
    gzipped = [name + ".gz" for name in cleaned]

    if need_update([p1, p2], gzipped):
        cmd = "SOAPfilter_v2.0 -t {0} -m 2000000 -p -y -z -g".format(opts.cpus)
        cmd += " -q {0} -w 10 -B 50 -f 0".format(offset)
        # -a/-b and -c/-d carry the same 5'/3' trim lengths for both reads.
        cmd += " -l {0} -a {1} -b {2} -c {1} -d {2}".format(
            size, opts.a, opts.b)
        cmd += " {0} {1} {2}.clean.stat {3} {4}".format(
            p1, p2, stat_prefix, cleaned[0], cleaned[1])
        sh(cmd)
示例#17
0
文件: soap.py 项目: rrane/jcvi
def correct(args):
    """
    %prog correct *.fastq

    Correct reads using ErrorCorrection. Only PE will be used to build the K-mer
    table.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    p = OptionParser(correct.__doc__)
    # NOTE(review): confirm this OptionParser version exposes add_cpus().
    p.add_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    lstfile = "reads2cor.lst"
    # First pass: list only the files whose names start with "PE"; these feed
    # the k-mer frequency table. `with` closes the handle even on error, and
    # write() behaves the same on Python 2 and 3 (one trailing newline).
    with open(lstfile, "w") as fw:
        fw.write("\n".join(x for x in args if x[:2] == "PE") + "\n")

    p1 = args[0]
    # Quality offset is guessed from the first input file only.
    offset = guessoffset([p1])
    cpus = opts.cpus

    freq = "output.freq.cz"
    freqlen = freq + ".len"
    if need_update(args, (freq, freqlen)):
        cmd = "KmerFreq_AR_v2.0 -k 17 -c -1 -q {0}".format(offset)
        cmd += " -m 1 -t {0}".format(cpus)
        cmd += " -p output {0}".format(lstfile)
        sh(cmd)

    # Second pass: the list is rewritten with every input file, because the
    # corrector runs on all of them.
    with open(lstfile, "w") as fw:
        fw.write("\n".join(args) + "\n")

    cmd = "Corrector_AR_v2.0 -k 17 -l 3 -m 5 -c 5 -a 0 -e 1 -w 0 -r 45"
    cmd += " -Q {0} -q 30 -x 8 -t {1} -o 1 ".format(offset, cpus)
    cmd += " {0} {1} {2}".format(freq, freqlen, lstfile)
    sh(cmd)
示例#18
0
def correct(args):
    """
    %prog correct *.fastq

    Correct reads using ErrorCorrection. Only PE will be used to build the K-mer
    table.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    p = OptionParser(correct.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    lstfile = "reads2cor.lst"
    # First pass: list only the files whose names start with "PE"; these feed
    # the k-mer frequency table. `with` closes the handle even on error.
    with open(lstfile, "w") as fw:
        print("\n".join(x for x in args if x[:2] == "PE"), file=fw)

    p1 = args[0]
    # Quality offset is guessed from the first input file only.
    offset = guessoffset([p1])
    cpus = opts.cpus

    freq = "output.freq.cz"
    freqlen = freq + ".len"
    if need_update(args, (freq, freqlen)):
        cmd = "KmerFreq_AR_v2.0 -k 17 -c -1 -q {0}".format(offset)
        cmd += " -m 1 -t {0}".format(cpus)
        cmd += " -p output {0}".format(lstfile)
        sh(cmd)

    # Second pass: the list is rewritten with every input file, because the
    # corrector runs on all of them.
    with open(lstfile, "w") as fw:
        print("\n".join(args), file=fw)

    cmd = "Corrector_AR_v2.0 -k 17 -l 3 -m 5 -c 5 -a 0 -e 1 -w 0 -r 45"
    cmd += " -Q {0} -q 30 -x 8 -t {1} -o 1 ".format(offset, cpus)
    cmd += " {0} {1} {2}".format(freq, freqlen, lstfile)
    sh(cmd)
示例#19
0
文件: automaton.py 项目: zjwang6/jcvi
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    optparser = OptionParser(tophat.__doc__)
    optparser.add_option("--gtf", help="Reference annotation [default: %default]")
    optparser.add_option("--single", default=False, action="store_true",
                         help="Single end mapping")
    optparser.add_option("--intron", default=15000, type="int",
                         help="Max intron size [default: %default]")
    optparser.add_option("--dist", default=-50, type="int",
                         help="Mate inner distance [default: %default]")
    optparser.add_option("--stdev", default=50, type="int",
                         help="Mate standard deviation [default: %default]")
    optparser.set_phred()
    optparser.set_cpus()
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    # Number of read files expected per sample.
    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)

    for readfiles, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        bamfile = op.join(outdir, "accepted_hits.bam")
        # Samples that already have a BAM are skipped.
        if op.exists(bamfile):
            logging.debug("File `{0}` found. Skipping.".format(bamfile))
            continue

        gtf_opt = " -G {0}".format(opts.gtf) if opts.gtf else ""
        cmd = "tophat -p {0}{1} -o {2}".format(opts.cpus, gtf_opt, outdir)

        if num == 1:  # Single-end
            lead, = readfiles
        else:  # Paired-end
            lead, _mate = readfiles
            cmd += " --max-intron-length {0}".format(opts.intron)
            cmd += " --mate-inner-dist {0}".format(opts.dist)
            cmd += " --mate-std-dev {0}".format(opts.stdev)

        # An explicit --phred wins; otherwise guess from the first read file.
        phred = opts.phred or str(guessoffset([lead]))
        if phred == "64":
            cmd += " --phred64-quals"
        cmd += " {0} {1}".format(reference, " ".join(readfiles))

        sh(cmd)
示例#20
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    version = "0.20"
    jarname = "trimmomatic-{0}.jar".format(version)
    phdchoices = ("33", "64")

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", jarname),
                      help="Path to trimmomatic [default: %default]")
    parser.add_option("--phred", default=None, choices=phdchoices,
                      help="Phred score offset {0} [default: guess]".format(phdchoices))
    parser.add_option("--nofrags", default=False, action="store_true",
                      help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=10, type="int",
                      help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=30, type="int",
                      help="Minimum length after trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
                      help="Do not write to gzipped files [default: %default]")
    set_grid(parser)

    opts, args = parser.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not parser.print_help())

    path = op.expanduser(opts.path)
    url = ("http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/"
           "Trimmomatic-{0}.zip").format(version)

    # When the jar is not at the configured path, download the release
    # archive, unpack it (unless already unpacked) and use the jar inside.
    if not op.exists(path):
        path = download(url)
        unzipped = "Trimmomatic-" + version
        if not op.exists(unzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(unzipped, jarname)

    assert op.exists(path)

    # Write the bundled adapter sequences if no local file is present.
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # An explicit --phred wins; otherwise guess from the first input file.
    offset = guessoffset([args[0]]) if opts.phred is None else int(opts.phred)
    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)

    # Output suffixes; gzipped unless --nogz was given.
    gz = "" if opts.nogz else ".gz"
    frags = ".frags.fastq" + gz
    pairs = ".pairs.fastq" + gz

    def stem(filename):
        # Basename with every ".gz" removed and the last extension dropped.
        return op.basename(filename).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        # Single-end mode: one input, one output.
        fastqfile, = args
        mode = ".TrimmomaticSE"
        io_files = (fastqfile, stem(fastqfile) + frags)
    else:
        # Paired-end mode: two inputs, pairs/frags outputs for each.
        fastqfile1, fastqfile2 = args
        stem1, stem2 = stem(fastqfile1), stem(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1, frags2 = stem1 + frags, stem2 + frags
        mode = ".TrimmomaticPE"
        io_files = (fastqfile1, fastqfile2,
                    stem1 + pairs, frags1, stem2 + pairs, frags2)

    cmd += mode + phredflag + " " + " ".join(io_files)

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    # Any offset other than 33 adds the TOPHRED33 step.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
示例#21
0
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring above (plus FastqNamings) is the parser's usage text.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name: upper-cased initials of the organism name's words.
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, pick up every *.fastq* in the cwd.
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    offset = guessoffset([fnames[0]])
    phred64 = offset == 64

    # All inputs must share the quality offset of the first file.
    assert all(guessoffset([x]) == offset for x in fnames[1:]), \
        "Input files do not all share quality offset {0}.".format(offset)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        # Library name: first two dash-separated fields of the group name.
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # The mate is covered by the wildcard entry of its ".1." partner.
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        # `libtype` rather than `type`, to avoid shadowing the builtin.
        libtype = L.type
        paired = L.paired
        read_orientation = L.read_orientation

        # Size/stddev go to the frag_* columns for fragment libraries and to
        # the insert_* columns for everything else.
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            libtype, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # Message now names the file that was actually written (in_groups.csv).
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"

    extra = ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
示例#22
0
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing there must be a
    #       database file plus one (single-end) or two (paired-end) read files,
    #       otherwise the help is printed and the process exits.
    # Returns the tuple (gsnapfile, logfile), both located under --outdir.
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(align.__doc__)
    parser.add_option("--rnaseq", default=False, action="store_true",
                      help="Input is RNA-seq reads, turn splicing on")
    parser.add_option("--native", default=False, action="store_true",
                      help="Convert GSNAP output to NATIVE format")
    parser.set_home("eddyyeh")
    parser.set_outdir()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    logging.debug("Single-end alignment" if nargs == 2
                  else "Paired-end alignment")

    dbfile, readfile = args[0], args[1]
    outdir = opts.outdir
    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = op.join(outdir, prefix + ".log")
    gsnapfile = op.join(outdir, prefix + ".gsnap")
    # Swap the trailing extension of the gsnap output for `.unique.native`.
    nativefile = gsnapfile.rsplit(".", 1)[0] + ".unique.native"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        pieces = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            # memory, mismatch, indel penalty, nhits
            "-B 5 -m 0.1 -i 2 -n 3",
        ]
        if opts.rnaseq:
            pieces.append("-N 1")
        pieces.append("-t {0}".format(opts.cpus))
        pieces.append("--gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            pieces.append("--gunzip")
        try:
            protocol = "sanger" if guessoffset([readfile]) == 33 \
                else "illumina"
        except AssertionError:
            # Best effort: when the offset cannot be guessed, the quality
            # protocol flag is simply left off.
            pass
        else:
            pieces.append("--quality-protocol {0}".format(protocol))
        # All read files (one or two) go last on the command line.
        pieces.extend(args[1:])
        sh(" ".join(pieces), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.native:
        converter = op.join(opts.eddyyeh_home, "convert2native.pl")
        if need_update(gsnapfile, nativefile):
            sh(" ".join((
                converter,
                "--gsnap {0} -o {1}".format(gsnapfile, nativefile),
                "-proc {0}".format(opts.cpus),
            )))

    return gsnapfile, logfile
示例#23
0
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring above (plus FastqNamings) is the usage text given to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; at least one fastq file is required,
    #       otherwise the help is printed and the process exits.
    # Returns nothing; the work is done by external commands and helpers.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    parser = OptionParser(correct.__doc__ + FastqNamings)
    parser.add_option("--dir", default="data",
                      help="Working directory [default: %default]")
    # NOTE(review): the help text says "Don't deduplicate", yet the flag is
    # forwarded below as `dedup=opts.fragsdedup` -- confirm against
    # correct_frag() which polarity is intended.
    parser.add_option("--fragsdedup", default=False, action="store_true",
                      help="Don't deduplicate the fragment reads [default: %default]")
    parser.add_option("--ploidy", default="2", choices=("1", "2"),
                      help="Ploidy [default: %default]")
    parser.add_option("--haploidify", default=False, action="store_true",
                      help="Set HAPLOIDIFY=True [default: %default]")
    parser.add_option("--suffix", default=False, action="store_true",
                      help="Add suffix /1, /2 to read names")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    fastq = args
    ploidy, haploidify, suffix = opts.ploidy, opts.haploidify, opts.suffix
    # --haploidify is only accepted together with ploidy 2.
    assert (not haploidify) or ploidy == '2'

    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = opts.dir
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    # Only the first fastq file is inspected to guess the quality offset.
    phred64 = guessoffset([fastq[0]]) == 64

    frag_tag = "frag_reads"
    frag_fastb = datadir + "/" + frag_tag + "_orig.fastb"
    if need_update(fastq, frag_fastb):
        cmd = ("PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}' "
               "PLOIDY={2}").format(fullpath, opts.cpus, ploidy)
        if phred64:
            cmd += " PHRED_64=True"
        sh(cmd)

    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=opts.fragsdedup, haploidify=haploidify,
                     suffix=suffix)

    # Jump and long-jump libraries are handled the same way, in this order,
    # and only when their `_orig.fastb` file is present.
    for jump_tag in ("jump_reads", "long_jump_reads"):
        jump_fastb = datadir + "/" + jump_tag + "_orig.fastb"
        if op.exists(jump_fastb):
            correct_jump(datadir, jump_tag, jump_fastb, nthreads,
                         suffix=suffix)
示例#24
0
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing there must be a
    #       database file plus one (single-end) or two (paired-end) read files,
    #       otherwise the help is printed and the process exits.
    # Returns the tuple (samfile, logfile). With --null, samfile is
    # "/dev/null".
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full",
                 default=False,
                 action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder",
                 default=False,
                 action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null",
                 default=False,
                 action="store_true",
                 help="Do not write to SAM/BAM output")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Query reads are FASTA")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)

    # Collect the user's extra parameters and the mate-orientation flag as
    # separate tokens.  Plain string concatenation used to glue them together
    # (e.g. "-k 5--rf") whenever --extra was non-empty.
    extra_flags = [opts.extra] if opts.extra else []
    mo = opts.mateorientation
    if mo == '-+':
        extra_flags.append("--rf")
    elif mo != '+-':
        # '+-' adds nothing (bowtie2's own default, --fr); anything else
        # falls through to --ff.
        extra_flags.append("--ff")
    extra = " ".join(extra_flags)

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    fasta = opts.fasta
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile,
                                            dbfile,
                                            bowtie=True,
                                            mapped=mapped,
                                            unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    # FASTA input carries no qualities, so there is no offset to guess.
    if not fasta:
        offset = guessoffset([readfile])

    if not need_update(dbfile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    if fasta:
        cmd += " -f"
    else:
        cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the bowtie2 log to stderr; the context manager makes sure the file
    # handle is closed afterwards.
    with open(logfile) as fp:
        print(fp.read(), file=sys.stderr)

    return samfile, logfile
示例#25
0
文件: bowtie.py 项目: rrane/jcvi
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing there must be a
    #       database file plus one (single-end) or two (paired-end) read files,
    #       otherwise the help is printed and the process exits.
    # Returns the tuple (samfile, logfile).
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true", help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true", help="Keep the input read order [default: %default]")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)

    # Collect the user's extra parameters and the mate-orientation flag as
    # separate tokens.  Plain string concatenation used to glue them together
    # (e.g. "-k 5--rf") whenever --extra was non-empty.
    extra_flags = [opts.extra] if opts.extra else []
    mo = opts.mateorientation
    if mo == "-+":
        extra_flags.append("--rf")
    elif mo != "+-":
        # "+-" adds nothing (bowtie2's own default, --fr); anything else
        # falls through to --ff.
        extra_flags.append("--ff")
    extra = " ".join(extra_flags)

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(
        readfile, dbfile, bowtie=True, mapped=mapped, unmapped=unmapped, bam=opts.bam
    )
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    # The freshness check is made against the value check_index() returned.
    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the bowtie2 log to stderr.  `print >>sys.stderr, ...` is a Python 2
    # statement that raises TypeError on Python 3; an explicit write works on
    # both, and the context manager closes the log file.
    with open(logfile) as fp:
        sys.stderr.write(fp.read() + "\n")

    return samfile, logfile
示例#26
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; exactly one (SE) or two (PE) fastq files are
    #       required, otherwise the help is printed and the process exits.
    # Returns nothing; the assembled java command is run through sh().
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", TrimJar),
                      help="Path to trimmomatic jar file [default: %default]")
    parser.add_option("--phred", default=None, choices=("33", "64"),
                      help="Phred score offset [default: guess]")
    parser.add_option("--nofrags", default=False, action="store_true",
                      help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=15, type="int",
                      help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=36, type="int",
                      help="Minimum length after trimming [default: %default]")
    parser.add_option("--adapteronly", default=False, action="store_true",
                      help="Only trim adapters with no qv trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
                      help="Do not write to gzipped files [default: %default]")
    parser.add_option("--log", default=None, dest="trimlog",
                      help="Specify a `trimlog` file [default: %default]")
    parser.set_cpus(cpus=4)
    opts, args = parser.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not parser.print_help())

    jar = op.expanduser(opts.path)
    if not op.exists(jar):
        # Jar not found at --path: fetch the release zip, unpack it unless an
        # unpacked directory is already present, then use the jar inside it.
        archive = download(
            "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"
            .format(tv))
        unzipped = "Trimmomatic-" + tv
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, TrimJar)

    assert op.exists(jar), \
        "Couldn't find Trimmomatic jar file at `{0}`".format(jar)

    # Copy the adapter sequences from `datadir` into the working directory.
    adaptersfile = "adapters.fasta"
    adapter_text = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, adapter_text, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(
            adaptersfile)

    # An explicit --phred wins; otherwise guess from the first fastq file.
    offset = guessoffset([args[0]]) if opts.phred is None \
        else int(opts.phred)

    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags, pairs = frags + ".gz", pairs + ".gz"

    def stem(fastqfile):
        # Base name with every ".gz" removed and the last extension dropped.
        return op.basename(fastqfile).replace(".gz", "").rsplit(".", 1)[0]

    single_end = len(args) == 1
    tokens = [
        "java -Xmx4g -jar {0}".format(jar),
        "SE" if single_end else "PE",
        "-phred{0}".format(offset),
        "-threads {0}".format(opts.cpus),
    ]
    if opts.trimlog:
        tokens.append("-trimlog {0}".format(opts.trimlog))

    if single_end:
        fastqfile, = args
        tokens += [fastqfile, stem(fastqfile) + frags]
    else:
        fastqfile1, fastqfile2 = args
        stem1, stem2 = stem(fastqfile1), stem(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1, frags2 = stem1 + frags, stem2 + frags
        tokens += [fastqfile1, fastqfile2,
                   stem1 + pairs, frags1, stem2 + pairs, frags2]

    tokens.append("ILLUMINACLIP:{0}:2:30:10".format(adaptersfile))
    if not opts.adapteronly:
        tokens.append("LEADING:3 TRAILING:3")
        tokens.append("SLIDINGWINDOW:4:{0}".format(opts.minqv))
    tokens.append("MINLEN:{0}".format(opts.minlen))
    if offset != 33:
        tokens.append("TOPHRED33")

    sh(" ".join(tokens))
示例#27
0
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring above (plus FastqNamings) is the usage text given to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; at least one fastq file is required,
    #       otherwise the help is printed and the process exits.
    # Returns nothing; the work is done by external commands and helpers.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    parser = OptionParser(correct.__doc__ + FastqNamings)
    parser.add_option("--dir", default="data",
                      help="Working directory [default: %default]")
    # NOTE(review): the help text says "Don't deduplicate", yet the flag is
    # forwarded below as `dedup=opts.fragsdedup` -- confirm against
    # correct_frag() which polarity is intended.
    parser.add_option("--fragsdedup", default=False, action="store_true",
                      help="Don't deduplicate the fragment reads [default: %default]")
    parser.add_option("--ploidy", default="2", choices=("1", "2"),
                      help="Ploidy [default: %default]")
    parser.add_option("--haploidify", default=False, action="store_true",
                      help="Set HAPLOIDIFY=True [default: %default]")
    parser.add_option("--suffix", default=False, action="store_true",
                      help="Add suffix /1, /2 to read names")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    fastq = args
    ploidy, haploidify, suffix = opts.ploidy, opts.haploidify, opts.suffix
    # --haploidify is only accepted together with ploidy 2.
    assert (not haploidify) or ploidy == '2'

    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = opts.dir
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    # Only the first fastq file is inspected to guess the quality offset.
    phred64 = guessoffset([fastq[0]]) == 64

    frag_tag = "frag_reads"
    frag_fastb = datadir + "/" + frag_tag + "_orig.fastb"
    if need_update(fastq, frag_fastb):
        cmd = ("PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}' "
               "PLOIDY={2}").format(fullpath, opts.cpus, ploidy)
        if phred64:
            cmd += " PHRED_64=True"
        sh(cmd)

    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=opts.fragsdedup, haploidify=haploidify,
                     suffix=suffix)

    # Jump and long-jump libraries are handled the same way, in this order,
    # and only when their `_orig.fastb` file is present.
    for jump_tag in ("jump_reads", "long_jump_reads"):
        jump_fastb = datadir + "/" + jump_tag + "_orig.fastb"
        if op.exists(jump_fastb):
            correct_jump(datadir, jump_tag, jump_fastb, nthreads,
                         suffix=suffix)
示例#28
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; exactly one (SE) or two (PE) fastq files are
    #       required, otherwise the help is printed and the process exits.
    # Returns nothing; the assembled java command is run through sh().
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", TrimJar),
                      help="Path to trimmomatic jar file [default: %default]")
    parser.add_option("--phred", default=None, choices=("33", "64"),
                      help="Phred score offset [default: guess]")
    parser.add_option("--nofrags", default=False, action="store_true",
                      help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=15, type="int",
                      help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=36, type="int",
                      help="Minimum length after trimming [default: %default]")
    parser.add_option("--adapteronly", default=False, action="store_true",
                      help="Only trim adapters with no qv trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
                      help="Do not write to gzipped files [default: %default]")
    parser.add_option("--log", default=None, dest="trimlog",
                      help="Specify a `trimlog` file [default: %default]")
    parser.set_cpus(cpus=4)
    opts, args = parser.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not parser.print_help())

    jar = op.expanduser(opts.path)
    if not op.exists(jar):
        # Jar not found at --path: fetch the release zip, unpack it unless an
        # unpacked directory is already present, then use the jar inside it.
        archive = download(
            "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"
            .format(tv))
        unzipped = "Trimmomatic-" + tv
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, TrimJar)

    assert op.exists(jar), \
        "Couldn't find Trimmomatic jar file at `{0}`".format(jar)

    # Copy the adapter sequences from `datadir` into the working directory.
    adaptersfile = "adapters.fasta"
    adapter_text = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, adapter_text, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(
            adaptersfile)

    # An explicit --phred wins; otherwise guess from the first fastq file.
    offset = guessoffset([args[0]]) if opts.phred is None \
        else int(opts.phred)

    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags, pairs = frags + ".gz", pairs + ".gz"

    def stem(fastqfile):
        # Base name with every ".gz" removed and the last extension dropped.
        return op.basename(fastqfile).replace(".gz", "").rsplit(".", 1)[0]

    single_end = len(args) == 1
    tokens = [
        "java -Xmx4g -jar {0}".format(jar),
        "SE" if single_end else "PE",
        "-phred{0}".format(offset),
        "-threads {0}".format(opts.cpus),
    ]
    if opts.trimlog:
        tokens.append("-trimlog {0}".format(opts.trimlog))

    if single_end:
        fastqfile, = args
        tokens += [fastqfile, stem(fastqfile) + frags]
    else:
        fastqfile1, fastqfile2 = args
        stem1, stem2 = stem(fastqfile1), stem(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1, frags2 = stem1 + frags, stem2 + frags
        tokens += [fastqfile1, fastqfile2,
                   stem1 + pairs, frags1, stem2 + pairs, frags2]

    tokens.append("ILLUMINACLIP:{0}:2:30:10".format(adaptersfile))
    if not opts.adapteronly:
        tokens.append("LEADING:3 TRAILING:3")
        tokens.append("SLIDINGWINDOW:4:{0}".format(opts.minqv))
    tokens.append("MINLEN:{0}".format(opts.minlen))
    if offset != 33:
        tokens.append("TOPHRED33")

    sh(" ".join(tokens))
示例#29
0
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing there must be a
    #       database file plus one (single-end) or two (paired-end) read files,
    #       otherwise the help is printed and the process exits.
    # Returns the tuple (gsnapfile, logfile).
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    parser = OptionParser(align.__doc__)
    for flag, helptext in (
            ("--join", "Join sequences with padded 50Ns"),
            ("--rnaseq", "Input is RNA-seq reads, turn splicing on"),
            ("--snp", "Call SNPs after GSNAP"),
    ):
        parser.add_option(flag, default=False, action="store_true",
                          help=helptext)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    logging.debug("Single-end alignment" if nargs == 2
                  else "Paired-end alignment")

    dbfile, readfile = args[0], args[1]
    if opts.join:
        # With --join, the database is replaced by whatever the fasta `join`
        # action returns for it.
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile, gsnapfile = prefix + ".log", prefix + ".gsnap"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        pieces = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            # memory, mismatch, indel penalty, nhits
            "-B 5 -m 0.1 -i 2 -n 3",
        ]
        if opts.rnaseq:
            pieces.append("-N 1")
        pieces.append("-t {0}".format(opts.cpus))
        pieces.append("--gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            pieces.append("--gunzip")
        try:
            protocol = "sanger" if guessoffset([readfile]) == 33 \
                else "illumina"
        except AssertionError:
            # Best effort: when the offset cannot be guessed, the quality
            # protocol flag is simply left off.
            pass
        else:
            pieces.append("--quality-protocol {0}".format(protocol))
        # All read files (one or two) go last on the command line.
        pieces.extend(args[1:])
        sh(" ".join(pieces), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
示例#30
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; exactly one (SE) or two (PE) fastq files are
    #       required, otherwise the help is printed and the process exits.
    # Returns nothing; the assembled command is run through sh() with the
    # grid option taken from the parsed options.
    tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", TrimJar),
                      help="Path to trimmomatic [default: %default]")
    parser.add_option("--phred", default=None, choices=phdchoices,
                      help="Phred score offset {0} [default: guess]".format(
                          phdchoices))
    parser.add_option("--nofrags", default=False, action="store_true",
                      help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=10, type="int",
                      help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=30, type="int",
                      help="Minimum length after trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
                      help="Do not write to gzipped files [default: %default]")
    set_grid(parser)

    opts, args = parser.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not parser.print_help())

    jar = op.expanduser(opts.path)
    if not op.exists(jar):
        # Jar not found at --path: fetch the release zip, unpack it unless an
        # unpacked directory is already present, then use the jar inside it.
        archive = download(
            "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"
            .format(tv))
        unzipped = "Trimmomatic-" + tv
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, TrimJar)

    assert op.exists(jar)

    # Write the bundled adapter sequences only when no local file exists yet.
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(
            adaptersfile)

    # An explicit --phred wins; otherwise guess from the first fastq file.
    offset = guessoffset([args[0]]) if opts.phred is None \
        else int(opts.phred)

    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags, pairs = frags + ".gz", pairs + ".gz"

    def stem(fastqfile):
        # Base name with every ".gz" removed and the last extension dropped.
        return op.basename(fastqfile).replace(".gz", "").rsplit(".", 1)[0]

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(jar)

    # The SE/PE class name is appended directly to the package path above.
    if len(args) == 1:
        cmd += ".TrimmomaticSE"
        fastqfile, = args
        files = [fastqfile, stem(fastqfile) + frags]
    else:
        cmd += ".TrimmomaticPE"
        fastqfile1, fastqfile2 = args
        stem1, stem2 = stem(fastqfile1), stem(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1, frags2 = stem1 + frags, stem2 + frags
        files = [fastqfile1, fastqfile2,
                 stem1 + pairs, frags1, stem2 + pairs, frags2]

    steps = [
        "ILLUMINACLIP:{0}:2:40:12".format(adaptersfile),
        "LEADING:3 TRAILING:3",
        "SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen),
    ]
    if offset != 33:
        steps.append("TOPHRED33")

    cmd += " " + " ".join(["-phred{0}".format(offset)] + files + steps)
    sh(cmd, grid=opts.grid)
示例#31
0
文件: gmap.py 项目: tanghaibao/jcvi
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above is also handed to OptionParser as the usage
    # text, so it is kept verbatim.
    #
    # args: command-line tokens; after option parsing there must be a
    #       database file plus one (single-end) or two (paired-end) read files,
    #       otherwise the help is printed and the process exits.
    # Returns the tuple (gsnapfile, logfile), both located under --outdir.
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(align.__doc__)
    parser.add_option(
        "--rnaseq", default=False, action="store_true",
        help="Input is RNA-seq reads, turn splicing on",
    )
    parser.add_option(
        "--native", default=False, action="store_true",
        help="Convert GSNAP output to NATIVE format",
    )
    parser.set_home("eddyyeh")
    parser.set_outdir()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    logging.debug("Single-end alignment" if nargs == 2
                  else "Paired-end alignment")

    dbfile, readfile = args[0], args[1]
    outdir = opts.outdir
    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = op.join(outdir, prefix + ".log")
    gsnapfile = op.join(outdir, prefix + ".gsnap")
    # Swap the trailing extension of the gsnap output for `.unique.native`.
    nativefile = gsnapfile.rsplit(".", 1)[0] + ".unique.native"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        pieces = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            # memory, mismatch, indel penalty, nhits
            "-B 5 -m 0.1 -i 2 -n 3",
        ]
        if opts.rnaseq:
            pieces.append("-N 1")
        pieces.append("-t {0}".format(opts.cpus))
        pieces.append("--gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            pieces.append("--gunzip")
        try:
            protocol = "sanger" if guessoffset([readfile]) == 33 \
                else "illumina"
        except AssertionError:
            # Best effort: when the offset cannot be guessed, the quality
            # protocol flag is simply left off.
            pass
        else:
            pieces.append("--quality-protocol {0}".format(protocol))
        # All read files (one or two) go last on the command line.
        pieces.extend(args[1:])
        sh(" ".join(pieces), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.native:
        converter = op.join(opts.eddyyeh_home, "convert2native.pl")
        if need_update(gsnapfile, nativefile):
            sh(" ".join((
                converter,
                "--gsnap {0} -o {1}".format(gsnapfile, nativefile),
                "-proc {0}".format(opts.cpus),
            )))

    return gsnapfile, logfile