Пример #1
0
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept short and in `%prog` form.
    #
    # For every group of read files yielded by iter_project(folder), this
    # writes a `<prefix>.ms<K>.infiles` list file and registers a `kmc`
    # command with the MakeManager; nothing is executed here besides writing
    # those list files and the final mm.write().
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.add_option("-c",
                 default=2,
                 type="int",
                 help="Maximal value of a counter")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    mm = MakeManager()
    # Use dedicated loop names so the option parser `p` is not shadowed.
    for reads, prefix in iter_project(folder):
        # Keep only the part of the prefix before the first underscore.
        pf = prefix.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # One read path per line; kmc reads this list through `@file`.
        # A plain write() works on both Python 2 and 3 (the old
        # `print >> fw` statement fails at runtime on Python 3), and the
        # context manager closes the handle even if the write raises.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{} -cs{}".format(K, opts.cpus, opts.c)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
Пример #2
0
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept short and in `%prog` form.
    #
    # For every group of read files yielded by iter_project(), this writes a
    # `<prefix>.ms<K>.infiles` list file and registers a `kmc` command with
    # the MakeManager.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci",
                 default=2,
                 type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs",
                 default=2,
                 type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx",
                 default=None,
                 type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of files grouped per sample: 1 for single-end, 2 for pairs.
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    # Use dedicated loop names so the option parser `p` is not shadowed.
    for reads, prefix in iter_project(folder, pattern=pattern, n=n,
                                      commonprefix=False):
        # Keep only the part of the prefix before the first underscore.
        pf = prefix.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # One read path per line; the context manager guarantees the handle
        # is closed even if the write raises.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        # Upper count cutoff is only emitted when one was supplied.
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
Пример #3
0
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # The docstring above is also the usage text given to OptionParser.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(tophat.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Single end mapping")
    p.add_option("--intron",
                 default=15000,
                 type="int",
                 help="Max intron size [default: %default]")
    p.add_option("--dist",
                 default=-50,
                 type="int",
                 help="Mate inner distance [default: %default]")
    p.add_option("--stdev",
                 default=50,
                 type="int",
                 help="Mate standard deviation [default: %default]")
    p.set_phred()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # 1 file per sample for single-end, 2 for paired-end.
    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)
    for reads, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        # Samples whose BAM already exists are not re-run.
        if op.exists(outfile):
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        # Collect command-line tokens and join them with single spaces.
        pieces = ["tophat -p {0}".format(opts.cpus)]
        if opts.gtf:
            pieces.append("-G {0}".format(opts.gtf))
        pieces.append("-o {0}".format(outdir))

        if num == 1:  # Single-end
            first, = reads
        else:  # Paired-end
            first, _second = reads
            # Intron/mate settings are only added for paired-end runs.
            pieces.append("--max-intron-length {0}".format(opts.intron))
            pieces.append("--mate-inner-dist {0}".format(opts.dist))
            pieces.append("--mate-std-dev {0}".format(opts.stdev))

        # Fall back to guessing the quality offset from the first read file.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            pieces.append("--phred64-quals")
        pieces.append(reference)
        pieces.append(" ".join(reads))

        sh(" ".join(pieces))
Пример #4
0
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(star.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    # 1 file per sample for single-end, 2 for paired-end.
    num = 2
    if opts.single:
        num = 1
    gd = "GenomeDir"
    mkdir(gd)
    # Common prefix shared by the index-building and alignment commands.
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        index_cmd = " ".join((
            STAR,
            "--runMode genomeGenerate",
            "--genomeFastaFiles {0}".format(reference),
        ))
        mm.add(reference, genomeidx, index_cmd)

    # Step 1: align
    for reads, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        tokens = [STAR, "--readFilesIn {0}".format(" ".join(reads))]
        # Gzipped input is decompressed on the fly.
        if reads[0].endswith(".gz"):
            tokens.append("--readFilesCommand zcat")
        tokens.append("--outSAMtype BAM SortedByCoordinate")
        tokens.append("--outFileNamePrefix {0}".format(pf))
        tokens.append("--twopassMode Basic")
        # Compatibility for cufflinks
        tokens.append("--outSAMstrandField intronMotif")
        tokens.append("--outFilterIntronMotifs RemoveNoncanonical")
        mm.add(reads, bamfile, " ".join(tokens))

    mm.write()
Пример #5
0
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept short and in `%prog` form.
    #
    # For every group of read files yielded by iter_project(), this writes a
    # `<prefix>.ms<K>.infiles` list file and registers a `kmc` command with
    # the MakeManager.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx", default=None, type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of files grouped per sample: 1 for single-end, 2 for pairs.
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    # Use dedicated loop names so the option parser `p` is not shadowed.
    for reads, prefix in iter_project(folder, pattern=pattern,
                                      n=n, commonprefix=False):
        # Keep only the part of the prefix before the first underscore.
        pf = prefix.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # One read path per line. write() replaces the Python-2-only
        # `print >> fw` statement (a runtime TypeError on Python 3), and the
        # context manager closes the handle even if the write raises.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        # Upper count cutoff is only emitted when one was supplied.
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
Пример #6
0
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(star.__doc__)
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    # Files per sample: one for single-end data, otherwise a pair.
    num = 1 if opts.single else 2
    gd = "GenomeDir"
    mkdir(gd)
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        build = "{0} --runMode genomeGenerate --genomeFastaFiles {1}".format(
            STAR, reference)
        mm.add(reference, genomeidx, build)

    # Flags appended to every alignment command, in this order.
    # The last two are for cufflinks compatibility.
    trailing_flags = (
        " --twopassMode Basic"
        " --outSAMstrandField intronMotif"
        " --outFilterIntronMotifs RemoveNoncanonical"
    )

    # Step 1: align
    for read_files, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        # Gzipped reads need an external decompression command.
        if read_files[0].endswith(".gz"):
            reader = " --readFilesCommand zcat"
        else:
            reader = ""
        align = "".join([
            STAR,
            " --readFilesIn {0}".format(" ".join(read_files)),
            reader,
            " --outSAMtype BAM SortedByCoordinate",
            " --outFileNamePrefix {0}".format(pf),
            trailing_flags,
        ])
        mm.add(read_files, bamfile, align)

    mm.write()
Пример #7
0
def batch(args):
    """
    %prog batch database.fasta project_dir output_dir

    Run bwa in batch mode.
    """
    # NOTE: the docstring above is the usage text handed to OptionParser;
    # it must use `%prog` (not `%proj`) for the program name to be expanded.
    #
    # For each group of reads in project_dir, registers the alignment
    # command, a `mv` of the BAM into output_dir and, for an s3:// output
    # location, an `aws s3 cp` upload.
    p = OptionParser(batch.__doc__)
    set_align_options(p)
    p.set_sam_options()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref_fasta, proj_dir, outdir = args
    outdir = outdir.rstrip("/")
    s3dir = None
    if outdir.startswith("s3://"):
        # Stage results in a local folder named after the last component
        # of the S3 URL, then upload from there.
        s3dir = outdir
        outdir = op.basename(outdir)
        mkdir(outdir)

    mm = MakeManager()
    # Use a dedicated loop name so the option parser `p` is not shadowed.
    for reads, _prefix in iter_project(proj_dir):
        targs = [ref_fasta] + reads
        cmd1, bamfile = mem(targs, opts)
        # mem() may hand back a falsy command; only wrap real commands.
        # NOTE(review): presumably falsy means the BAM is already up to
        # date - confirm against mem().
        if cmd1:
            cmd1 = output_bam(cmd1, bamfile)
        nbamfile = op.join(outdir, bamfile)
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
        cmds = [cmd1, cmd2]

        if s3dir:
            cmd = "aws s3 cp {} {} --sse".format(nbamfile,
                                                 op.join(s3dir, bamfile))
            cmds.append(cmd)

        mm.add(reads, nbamfile, cmds)

    mm.write()
Пример #8
0
def batch(args):
    """
    %prog batch database.fasta project_dir output_dir

    Run bwa in batch mode.
    """
    # NOTE: the docstring above is the usage text handed to OptionParser;
    # `%prog` (previously misspelled `%proj`) is what gets replaced by the
    # program name in the help output.
    #
    # Each group of reads found in project_dir yields one make target: the
    # BAM file inside output_dir.
    p = OptionParser(batch.__doc__)
    set_align_options(p)
    p.set_sam_options()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref_fasta, proj_dir, outdir = args
    outdir = outdir.rstrip("/")
    s3dir = None
    if outdir.startswith("s3://"):
        # Remote destination: keep the URL for the upload step and work
        # locally in a directory named after its last path component.
        s3dir = outdir
        outdir = op.basename(outdir)
        mkdir(outdir)

    mm = MakeManager()
    # Distinct loop names keep the option parser `p` intact.
    for read_files, _pf in iter_project(proj_dir):
        targs = [ref_fasta] + read_files
        cmd1, bamfile = mem(targs, opts)
        # Only a non-empty alignment command is wrapped by output_bam().
        if cmd1:
            cmd1 = output_bam(cmd1, bamfile)
        nbamfile = op.join(outdir, bamfile)
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
        cmds = [cmd1, cmd2]

        if s3dir:
            # Upload the staged BAM to the S3 destination.
            cmd = "aws s3 cp {} {} --sse".format(nbamfile,
                                                 op.join(s3dir, bamfile))
            cmds.append(cmd)

        mm.add(read_files, nbamfile, cmds)

    mm.write()
Пример #9
0
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept short and in `%prog` form.
    #
    # For every group of read files yielded by iter_project(), this writes a
    # `<prefix>.ms<K>.infiles` list file and registers a `kmc` command with
    # the MakeManager.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Minimum value of a counter")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of files grouped per sample: 1 for single-end, 2 for pairs.
    n = 1 if opts.single else 2
    mm = MakeManager()
    # Use dedicated loop names so the option parser `p` is not shadowed.
    for reads, prefix in iter_project(folder, n=n, commonprefix=False):
        # Keep only the part of the prefix before the first underscore.
        pf = prefix.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # One read path per line. write() replaces the Python-2-only
        # `print >> fw` statement (a runtime TypeError on Python 3), and the
        # context manager closes the handle even if the write raises.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
Пример #10
0
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    cpus = opts.cpus
    mm = MakeManager()
    for reads, pf in iter_project(folder):
        # One intermediate database per read file, numbered from 1.
        mss = ["{}{}.ms{}".format(pf, idx, K)
               for idx, _ in enumerate(reads, 1)]
        cmds = [
            "meryl -B -C -m {} -threads {}".format(K, cpus)
            + " -s {} -o {}".format(read_file, ms)
            for read_file, ms in zip(reads, mss)
        ]
        # Exactly two intermediate databases are expected here.
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        # Merge both databases, then delete the intermediates.
        cmds.append(
            "meryl -M add -s {} -s {} -o {}".format(ams, bms, pms))
        cmds.append(
            "rm -f {}.mcdat {}.mcidx {}.mcdat {}.mcidx".format(
                ams, ams, bms, bms))
        mm.add(reads, pms + ".mcdat", cmds)

    mm.write()
Пример #11
0
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    K, cpus = opts.k, opts.cpus
    mm = MakeManager()
    # Shared leading part of every per-file counting command.
    count_head = "meryl -B -C -m {} -threads {}".format(K, cpus)
    for read_files, pf in iter_project(folder):
        cmds, mss = [], []
        for number, read_file in enumerate(read_files, start=1):
            ms = "{}{}.ms{}".format(pf, number, K)
            mss.append(ms)
            cmds.append(count_head + " -s {} -o {}".format(read_file, ms))
        # Unpacking fails unless exactly two read files were counted.
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        # Combine the two per-file databases into one ...
        merge_cmd = "meryl -M add -s {} -s {} -o {}".format(ams, bms, pms)
        # ... and clean up the per-file databases afterwards.
        cleanup_cmd = "rm -f {}.mcdat {}.mcidx {}.mcdat {}.mcidx".format(
            ams, ams, bms, bms)
        cmds += [merge_cmd, cleanup_cmd]
        mm.add(read_files, pms + ".mcdat", cmds)

    mm.write()
Пример #12
0
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # The docstring above is also the usage text given to OptionParser.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(tophat.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.add_option("--intron", default=15000, type="int",
                 help="Max intron size [default: %default]")
    p.add_option("--dist", default=-50, type="int",
                 help="Mate inner distance [default: %default]")
    p.add_option("--stdev", default=50, type="int",
                 help="Mate standard deviation [default: %default]")
    p.set_phred()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    paired = not opts.single
    # Files per sample handed to iter_project: 2 for pairs, else 1.
    num = 2 if paired else 1
    folder, reference = args
    reference = check_index(reference)
    for read_files, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            # Output already present: leave this sample alone.
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        gtf_flag = " -G {0}".format(opts.gtf) if opts.gtf else ""
        cmd = "tophat -p {0}".format(opts.cpus) + gtf_flag
        cmd += " -o {0}".format(outdir)

        if paired:
            # Unpacking enforces that exactly two files were grouped.
            lead, _mate = read_files
            # Intron/mate settings are only added for paired-end runs.
            cmd += "".join([
                " --max-intron-length {0}".format(opts.intron),
                " --mate-inner-dist {0}".format(opts.dist),
                " --mate-std-dev {0}".format(opts.stdev),
            ])
        else:
            lead, = read_files

        # Use the user-supplied offset if given, else guess it from the
        # first read file.
        phred = opts.phred or str(guessoffset([lead]))
        if phred == "64":
            cmd += " --phred64-quals"
        cmd += " {0} {1}".format(reference, " ".join(read_files))

        sh(cmd)