Example #1
def pasa(args):
    """
    %prog pasa pasa_db fastafile

    Run EVM in TIGR-only mode.
    """
    p = OptionParser(pasa.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pasa_db, fastafile = args

    termexons = "pasa.terminal_exons.gff3"
    if need_update(fastafile, termexons):
        cmd = "$ANNOT_DEVEL/PASA2/scripts/pasa_asmbls_to_training_set.dbi"
        cmd += ' -M "{0}:mysql.tigr.org" -p "access:access"'.format(pasa_db)
        cmd += ' -g {0}'.format(fastafile)
        sh(cmd)

        cmd = "$EVM/PasaUtils/retrieve_terminal_CDS_exons.pl"
        cmd += " trainingSetCandidates.fasta trainingSetCandidates.gff"
        sh(cmd, outfile=termexons)

    return termexons
Example #2
def query(args):
    """
    %prog query frgscf.sorted scfID:start-end

    Query a certain region to get frg placement, using random access. Build the
    index if not present.
    """
    p = OptionParser(query.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    frgscffile, region = args
    gzfile = frgscffile + ".gz"
    tbifile = gzfile + ".tbi"
    outfile = region + ".posmap"

    if not (op.exists(gzfile) and op.exists(tbifile)):
        index(frgscffile)

    assert op.exists(gzfile) and op.exists(tbifile)

    cmd = "tabix {0} {1}".format(gzfile, region)
    sh(cmd, outfile=outfile)
Example #3
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, readfile = args
    safile = check_index(dbfile, grid=grid)
    saifile = check_aln(dbfile, readfile, grid=grid)

    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
Example #4
def test(args):
    """
    %prog test unitig{partID}.{unitigID}

    For example, `%prog test unitig5.530` will test the modified `unitig530`
    """
    p = OptionParser(test.__doc__)
    p.add_option("--verbose", default=False, action="store_true",
                 help="Turn on verbose debugging [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    prefix = get_prefix()
    s, = args
    partID, unitigID = get_ID(s)

    cmd = CAPATH("utgcns")
    cmd += " -g ../{0}.gkpStore -t ../{0}.tigStore 1".format(prefix)
    cmd += " {0} -T {1}".format(partID, s)
    if opts.verbose:
        cmd += " -V -V"
    cmd += " -V -v 2> {0}.log".format(s)

    sh(cmd)

    # Show log
    cmd = "tail {0}.log".format(s)
    sh(cmd)
Example #5
def run_megablast(infile=None, outfile=None, db=None, wordsize=None, \
        pctid=98, hitlen=100, best=None, evalue=0.01, task="megablast", cpus=16):

    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    nin00 = db + ".00.nin"
    nin = nin00 if op.exists(nin00) else (db + ".nin")
    run_formatdb(infile=db, outfile=nin)

    cmd = "blastn"
    cmd += " -query {0} -db {1} -out {2}".format(infile, db, outfile)
    cmd += " -evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus)
    cmd += " -task {0}".format(task)
    if wordsize:
        cmd += " -word_size {0}".format(wordsize)
    if pctid:
        cmd += " -perc_identity {0}".format(pctid)
    if best:
        cmd += " -max_target_seqs {0}".format(best)
    sh(cmd)

    if pctid and hitlen:
        blastfile = outfile
        filtered_blastfile = outfile + ".P{0}L{1}".format(pctid, hitlen)
        run_blast_filter(infile=blastfile, outfile=filtered_blastfile,
                pctid=pctid, hitlen=hitlen)
        shutil.move(filtered_blastfile, blastfile)
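A minimal invocation sketch for the wrapper above (hypothetical file names; assumes `blastn` and `makeblastdb` are on the PATH):

# BLAST query.fasta against ref.fasta, then keep only hits with
# >= 95% identity and >= 50 bp, replacing query.blast with the filtered file.
run_megablast(infile="query.fasta", outfile="query.blast", db="ref.fasta",
              pctid=95, hitlen=50)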
Example #6
def index(args):
    """
    %prog index frgscf.sorted

    Compress frgscf.sorted and index it using `tabix`.
    """
    p = OptionParser(index.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    frgscffile, = args
    gzfile = frgscffile + ".gz"
    cmd = "bgzip -c {0}".format(frgscffile)

    if not op.exists(gzfile):
        sh(cmd, outfile=gzfile)

    tbifile = gzfile + ".tbi"
    # Sequence, begin, end in 2, 3, 4-th column, respectively
    cmd = "tabix -s 2 -b 3 -e 4 {0}".format(gzfile)

    if not op.exists(tbifile):
        sh(cmd)
Example #7
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
Example #8
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    if need_update(gsnapfile, nativefile):
        cmd = op.join(EYHOME, "convert2native.pl")
        cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        cmd += " -proc {0}".format(opts.cpus)
        sh(cmd)

    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        cmd += " --native {0} -o {1}".format(nativefile, snpfile)
        cmd += " -a 2 -ac 0.3 -c 0.8"
        sh(cmd)
Example #9
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)

    oldtwobit, newtwobit = twobitfiles
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
Example #10
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
Example #11
def sort(args):
    """
    %prog sort bedfile

    Sort bed file to have ascending order of seqid, then start. It uses the
    `sort` command.
    """
    p = OptionParser(sort.__doc__)
    p.add_option("-i", "--inplace", dest="inplace",
            default=False, action="store_true",
            help="Sort bed file in place [default: %default]")
    p.add_option("--accn", default=False, action="store_true",
            help="Sort based on the accessions [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    inplace = opts.inplace

    sortedbed = op.basename(bedfile).rsplit(".", 1)[0] + ".sorted.bed"
    if inplace:
        sortedbed = bedfile

    sortopt = "-k1,1 -k2,2n -k4,4" if not opts.accn else \
              "-k4,4 -k1,1 -k2,2n"
    cmd = "sort {0} {1}".format(sortopt, bedfile)
    cmd += " -o {0}".format(sortedbed)
    sh(cmd)

    return sortedbed
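A usage sketch (hypothetical file name) showing the three modes of the wrapper above:

sort(["genes.bed"])               # writes genes.sorted.bed
sort(["genes.bed", "--inplace"])  # overwrites genes.bed itself
sort(["genes.bed", "--accn"])     # orders by accession (column 4) first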
Example #12
def plot_some_queries(refs, qsizes, rsizes, deltafile, refcov, prefix="out", color="similarity", layout=True):

    Qfile, Rfile = "Qfile", "Rfile"
    coords = Coords(deltafile)
    queries = set()
    for c in coords:
        if c.refcov < refcov:
            continue
        if c.ref not in refs:
            continue
        queries.add(c.query)

    if not queries or not refs:
        logging.debug("Empty - {0} vs. {1}".format(queries, refs))
        return None

    writeXfile(queries, qsizes, Qfile)
    writeXfile(refs, rsizes, Rfile)

    cmd = "mummerplot {0}".format(deltafile)
    cmd += " -Rfile {0} -Qfile {1}".format(Rfile, Qfile)
    cmd += " --postscript -p {0}".format(prefix)
    if layout:
        cmd += " --layout"
    if color == "similarity":
        cmd += " --color"
    elif color == "none":
        cmd += " --nocolor"
    sh(cmd)

    cmd = "ps2pdf {0}.ps {0}.pdf".format(prefix)
    sh(cmd)

    return prefix + ".pdf"
Example #13
def txt(args):
    """
    %prog txt casfile

    Convert binary CAS file to tabular output using CLC `assembly_table`.
    """
    p = OptionParser(txt.__doc__)
    p.add_option("-m", dest="multi", default=False, action="store_true",
        help="report multi-matches [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    grid = opts.grid

    casfile, = args
    txtfile = casfile.replace(".cas", ".txt")
    assert op.exists(casfile)

    cmd = "assembly_table -n -s -p "
    if opts.multi:
        cmd += "-m "
    cmd += casfile
    sh(cmd, grid=grid, outfile=txtfile)

    return txtfile
Example #14
def split(args):
    """
    %prog split casfile 1 10

    Split the binary casfile using the CLCbio `sub_assembly` program. The two
    numbers are the starting and ending indices for the `reference`; useful to
    split one big assembly per contig.
    """
    p = OptionParser(split.__doc__)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(p.print_help())

    casfile, start, end = args
    start = int(start)
    end = int(end)

    split_cmd = "sub_assembly -a {casfile} -o sa.{i}.cas -s {i} " + \
        "-e sa.{i}.pairs.fasta -f sa.{i}.fragments.fasta -g sa.{i}.ref.fasta"

    for i in range(start, end + 1):
        cmd = split_cmd.format(casfile=casfile, i=i)
        sh(cmd, grid=opts.grid)
Example #15
def run_FastbAndQualb2Fastq(infile=None, outfile=None, rc=False):
    corr = op.basename(infile).rsplit(".", 1)[0]
    cmd = "FastbQualbToFastq HEAD_IN={0} HEAD_OUT={0}".format(corr)
    cmd += " PAIRED=False PHRED_OFFSET=33"
    if rc:
        cmd += " FLIP=True"
    sh(cmd)
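For reference, a sketch of the command this wrapper emits (hypothetical input file):

run_FastbAndQualb2Fastq(infile="frag_reads.corr.fastb", rc=True)
# runs: FastbQualbToFastq HEAD_IN=frag_reads.corr HEAD_OUT=frag_reads.corr
#       PAIRED=False PHRED_OFFSET=33 FLIP=True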
Example #16
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    os.chdir(folder)
    bams = glob("*tophat/accepted_hits.bam")
    for bam in bams:
        pf, ab = op.split(bam)
        outdir = op.join(pf, "cufflinks")
        if op.exists(outdir):
            logging.debug("Directory {0} found. Skipping.".format(outdir))
            continue
        cmd = "cufflinks"
        cmd += " -o {0}".format(outdir)
        cmd += " -p {0}".format(opts.cpus)
        if opts.gtf:
            cmd += " -g {0}".format(opts.gtf)
        cmd += " --frag-bias-correct {0}".format(reference)
        cmd += " --multi-read-correct"
        cmd += " {0}".format(bam)
        sh(cmd)
Example #17
def fake_quals(fa):
    faq = fa.rsplit(".", 1)[0] + ".qual"
    if op.exists(faq):
        logging.debug("Qual file `{0}` found.".format(faq))
    else:
        sh("fakeQuals.py {0} {1}".format(fa, faq))
    return fa, faq
Example #18
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print >> fw, "\t".join(str(x) for x in (key, "dummy", "transcript",\
                1, size, ".", ".", ".", "ID=" + key))
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
Example #19
def push_to_s3(s3_store, obj_name):
    cmd = "sync" if op.isdir(obj_name) else "cp"
    s3address = "{0}/{1}".format(s3_store, obj_name)
    s3address = s3ify(s3address)
    cmd = "aws s3 {0} {1} {2} --sse".format(cmd, obj_name, s3address)
    sh(cmd)
    return s3address
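A usage sketch (hypothetical bucket and path); `cp` is chosen for a file, `sync` for a directory:

s3address = push_to_s3("s3://mybucket/store", "results.tar.gz")
# runs roughly: aws s3 cp results.tar.gz s3://mybucket/store/results.tar.gz --sse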
Example #20
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains, so no training
    model GFF file is needed.
    """
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    cmd = "ln -sf ../{0}".format(fastafile)
    sh(cmd)

    license = op.expanduser("~/.gm_key")
    assert op.exists(license), "License key ({0}) not found!".format(license)
    cmd = "{0}/gm_es.pl {1}".format(mhome, fastafile)
    sh(cmd)

    os.chdir(cwd)
    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(gmdir, species))
Example #21
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
Example #22
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from jcvi.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=opts.outfile, append=True)
Example #23
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from begin or end of reads.
    """
    p = OptionParser(trim.__doc__)
    p.add_option("-f", dest="first", default=0, type="int", help="First base to keep. Default is 1.")
    p.add_option("-l", dest="last", default=0, type="int", help="Last base to keep. Default is entire read.")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    obfastqfile = op.basename(fastqfile)
    fq = obfastqfile.rsplit(".", 1)[0] + ".ntrimmed.fastq"
    if fastqfile.endswith(".gz"):
        fq = obfastqfile.rsplit(".", 2)[0] + ".ntrimmed.fastq.gz"

    cmd = "fastx_trimmer -Q33 "
    if opts.first:
        cmd += "-f {0.first} ".format(opts)
    if opts.last:
        cmd += "-l {0.last} ".format(opts)

    sh(cmd, infile=fastqfile, outfile=fq)
Example #24
def pull_from_s3(s3_store, file_name=None):
    file_name = file_name or s3_store.split("/")[-1]
    if not op.exists(file_name):
        s3_store = s3ify(s3_store)
        cmd = "aws s3 cp {0} {1} --sse".format(s3_store, file_name)
        sh(cmd)
    return op.abspath(file_name)
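The converse helper, sketched with a hypothetical address; the download is skipped if the local file already exists:

local_path = pull_from_s3("s3://mybucket/store/results.tar.gz")
# file_name defaults to the last path component (results.tar.gz);
# the absolute local path is returned either way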
Example #25
File: bed.py Project: yangjl/jcvi
def mergeBed(bedfile, d=0, nms=False, s=False, scores=None):
    sort([bedfile, "-i"])
    cmd = "mergeBed -i {0}".format(bedfile)
    if d:
        cmd += " -d {0}".format(d)
    if nms:
        nargs = len(open(bedfile).readline().split())
        if nargs <= 3:
            logging.debug("Only {0} columns detected... ignoring nms=True"
                          .format(nargs))
        else:
            cmd += " -c 4 -o collapse"
    if s:
        cmd += " -s"
    if scores:
        valid_opts = ("sum", "min", "max", "mean", "median",
                "mode", "antimode", "collapse")
        if scores not in valid_opts:
            scores = "mean"
        cmd += " -scores {0}".format(scores)

    mergebedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".merge.bed"

    if need_update(bedfile, mergebedfile):
        sh(cmd, outfile=mergebedfile)
    return mergebedfile
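A usage sketch (hypothetical BED file) of the command this wrapper assembles:

merged = mergeBed("features.bed", d=100, nms=True)
# sorts features.bed in place, then (for a file with >3 columns) runs roughly:
#   mergeBed -i features.bed -d 100 -c 4 -o collapse
# writing features.merge.bed only when it is out of date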
Example #26
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]
    cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl")
    if not opts.nosuffix:
        cmd += " -suffix"
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        cmd += " -noindex"
    cmd += " -threads {0}".format(opts.cpus)
    offset = guessoffset([r1])
    if offset == 64:
        cmd += " -I"
    cmd += " ".join(("", pf, ref, r1, r2))
    sh(cmd)
Example #27
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-454` to remove duplicate reads.
    """
    p = OptionParser(deduplicate.__doc__)
    p.add_option("--identity", default=.98, type="float",
                 help="Sequence identity threshold [default: %default]")
    p.add_option("--cpus", default=0, type="int",
                 help="Number of CPUs to use, 0=unlimited [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args

    from jcvi.apps.command import CDPATH

    cmd = CDPATH("cd-hit-454")
    cmd += " -c {0}".format(opts.identity)
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd, grid=opts.grid)
Example #28
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    p = OptionParser(dust.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    dustfastafile = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        cmd = "dustmasker -in {0}".format(fastafile)
        cmd += " -out {0} -outfmt fasta".format(dustfastafile)
        sh(cmd)

    for name, seq in parse_fasta(dustfastafile):
        nlow = sum(1 for x in seq if x in "acgtN")
        pctlow = nlow * 100. / len(seq)
        if pctlow < 98:
            continue
        #print "{0}\t{1:.1f}".format(name, pctlow)
        print name
Example #29
def consensus(args):
    """
    %prog consensus fastafile bamfile

    Convert bam alignments to consensus FASTQ/FASTA.
    """
    p = OptionParser(consensus.__doc__)
    p.add_option("--fasta", default=False, action="store_true",
            help="Generate consensus FASTA sequences [default: %default]")
    p.add_option("--mask", default=0, type="int",
            help="Mask bases with quality lower than")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fasta = opts.fasta
    suffix = "fasta" if fasta else "fastq"
    pf = bamfile.rsplit(".", 1)[0]
    cnsfile = pf + ".cns.{0}".format(suffix)
    vcfgzfile = pf + ".vcf.gz"
    vcf([fastafile, bamfile, "-o", vcfgzfile])
    cmd = "zcat {0} | vcfutils.pl vcf2fq".format(vcfgzfile)
    if fasta:
        cmd += " | seqtk seq -q {0} -A -".format(opts.mask)

    sh(cmd, outfile=cnsfile)
Example #30
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    dd = fastafile + ".cdhit"
    return dd
Example #31
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
            help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
            help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        stddev = size / 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
Example #32
def rm_s3(store):
    cmd = "aws s3 rm {}".format(store)
    sh(cmd)
Example #33
def aws_configure(profile, key, value):
    sh('aws configure set profile.{0}.{1} {2}'.format(profile, key, value))
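These two AWS helpers are thin shells around the AWS CLI; a hypothetical sequence:

aws_configure("myprofile", "region", "us-east-1")
# runs: aws configure set profile.myprofile.region us-east-1
rm_s3("s3://mybucket/store/old_results.tar.gz")
# runs: aws s3 rm s3://mybucket/store/old_results.tar.gz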
Example #34
def last(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask",
                 default=False,
                 action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format",
                 default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen",
                 default=0,
                 type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid",
                 default=0,
                 type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask, \
              lastdb_bin=lastdb_bin)

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    mm = minid / (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
Example #35
def run_lastdb(infile=None, outfile=None, mask=False, lastdb_bin="lastdb"):
    outfilebase = outfile.rsplit(".", 1)[0]
    mask = "-c " if mask else ""
    cmd = "{0} {1}{2} {3}".format(lastdb_bin, mask, outfilebase, infile)
    sh(cmd)
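A sketch of the command produced (hypothetical file names):

run_lastdb(infile="genome.fasta", outfile="genome.prj", mask=True)
# runs: lastdb -c genome genome.fasta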
Example #36
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Converts the output to Fasta format.
        4. Uses this alignment info to align gene sequences using PAL2NAL.
        5. Runs PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
            zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write("%s\n" % (",".join(
                    str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                     ds_subs_ng, dn_subs_ng))))
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Example #37
def sample(args):
    """
    %prog sample bedfile sizesfile

    Sample bed file and remove high-coverage regions.

    When option --targetsize is used, this program uses a different mode: it
    first calculates the current total bases from all ranges, compares that to
    targetsize, and if larger, samples down as close to targetsize as possible.
    """
    import random
    from jcvi.assembly.coverage import Coverage

    p = OptionParser(sample.__doc__)
    p.add_option("--max",
                 default=10,
                 type="int",
                 help="Max depth allowed [default: %default]")
    p.add_option(
        "--targetsize",
        type="int",
        help="Sample bed file to get target base number [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, sizesfile = args
    pf = bedfile.rsplit(".", 1)[0]

    targetsize = opts.targetsize
    if targetsize:
        bed = Bed(bedfile)
        samplebed = pf + ".sample.bed"
        fw = open(samplebed, "w")
        nfeats = len(bed)
        nbases = bed.sum(unique=False)
        targetfeats = int(round(nfeats * targetsize / nbases))
        sub_bed = random.sample(bed, targetfeats)
        for b in sub_bed:
            print >> fw, b

        logging.debug("File written to `{0}`.".format(samplebed))
        return

    c = Coverage(bedfile, sizesfile)
    coveragefile = c.filename
    samplecoveragefile = pf + ".sample.coverage"
    fw = open(samplecoveragefile, "w")
    fp = open(coveragefile)
    for row in fp:
        seqid, start, end, cov = row.split()
        cov = int(cov)
        if cov <= opts.max:
            fw.write(row)
    fw.close()

    samplebedfile = pf + ".sample.bed"
    cmd = "intersectBed -a {0} -b {1} -wa -u".format(bedfile,
                                                     samplecoveragefile)
    sh(cmd, outfile=samplebedfile)
    logging.debug("Sampled bedfile written to `{0}`.".format(samplebedfile))
Example #38
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve a multi-FASTA file"

    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile,
                                                 idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile,
                                       idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".format(", ".join(
        sorted(excludecontigs))))

    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".format(", ".join(
        sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
Example #39
def faToTwoBit(fastafile):
    twobitfile = fastafile.rsplit(".", 1)[0] + ".2bit"
    cmd = "faToTwoBit {0} {1}".format(fastafile, twobitfile)
    if need_update(fastafile, twobitfile):
        sh(cmd)
    return twobitfile
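Usage sketch (hypothetical file name); the conversion is skipped when the .2bit file is already current:

twobitfile = faToTwoBit("scaffolds.fasta")  # -> "scaffolds.2bit"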
Example #40
def simulate(args):
    """
    %prog simulate run_dir 1 300

    Simulate BAMs with varying inserts with dwgsim. The above command will
    simulate between 1 to 300 CAGs in the HD region, in a directory called
    `run_dir`.
    """
    p = OptionParser(simulate.__doc__)
    p.add_option("--ref",
                 default="/Users/htang/projects/ref/hg38.upper.fa",
                 help="Reference genome sequence")
    add_simulate_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    rundir, startunits, endunits = args
    startunits, endunits = int(startunits), int(endunits)
    basecwd = os.getcwd()
    mkdir(rundir)
    os.chdir(rundir)
    cwd = os.getcwd()

    # Huntington region
    pad_left, pad_right = 1000, 10000
    chr, start, end = 'chr4', 3074877, 3074933
    fasta = Fasta(opts.ref)
    seq_left = fasta[chr][start - pad_left:start - 1]
    seq_right = fasta[chr][end:end + pad_right]
    motif = 'CAG'
    reffastafile = "ref.fasta"
    seq = str(fasta[chr][start - pad_left:end + pad_right])
    make_fasta(seq, reffastafile, id=chr.upper())

    # Write fake sequence
    for units in range(startunits, endunits + 1):
        pf = str(units)
        mkdir(pf)
        os.chdir(pf)
        seq = str(seq_left) + motif * units + str(seq_right)
        fastafile = pf + ".fasta"
        make_fasta(seq, fastafile, id=chr.upper())

        # Simulate reads on it
        wgsim([
            fastafile, "--depth={}".format(opts.depth),
            "--readlen={}".format(opts.readlen),
            "--distance={}".format(opts.distance), "--outfile={}".format(pf)
        ])

        read1 = pf + ".bwa.read1.fastq"
        read2 = pf + ".bwa.read2.fastq"
        samfile, _ = align(["../{}".format(reffastafile), read1, read2])
        indexed_samfile = index([samfile])

        sh("mv {} ../{}.bam".format(indexed_samfile, pf))
        sh("mv {}.bai ../{}.bam.bai".format(indexed_samfile, pf))

        os.chdir(cwd)
        shutil.rmtree(pf)

    os.chdir(basecwd)
Example #41
def draw(args):
    """
    %prog draw --input newicktrees [options]

    Draw phylogenetic trees into single or combined plots.
    Input trees should be one of the following:
    1.  single Newick format tree file
    2.  a dir containing *ONLY* the tree files to be drawn

    Newick format:
    http://evolution.genetics.washington.edu/phylip/newicktree.html

    This function wraps on jcvi.graphics.tree
    This function is better used for trees generated by jcvi.apps.phylo (rooted
    if possible). For drawing general Newick trees from external sources invoke
    jcvi.graphics.tree directly, which also gives more drawing options.
    """
    trunc_name_options = ['headn', 'oheadn', 'tailn', 'otailn']
    p = OptionParser(draw.__doc__)
    p.add_option("--input", help="path to single input tree file or a dir "\
                 "containing ONLY the input tree files")
    p.add_option("--combine", type="string", default="1x1", \
                 help="combine multiple trees into one plot in nrowxncol")
    p.add_option("--trunc_name", default=None, help="Options are: {0}. " \
                 "truncate first n chars, retain only first n chars, " \
                 "truncate last n chars, retain only last n chars. " \
                 "n=1~99. [default: %default]".format(trunc_name_options))
    p.add_option("--SH", default=None,
                 help="path to a file containing SH test p-values in format:" \
                 "tree_file_name<tab>p-values " \
                 "This file can be generated with jcvi.apps.phylo build [default: %default]")
    p.add_option("--scutoff", default=50, type="int",
                 help="cutoff for displaying node support, 0-100 [default: %default]")
    p.add_option("--barcode", default=None,
                 help="path to seq/taxon name barcode mapping file: " \
                 "barcode<tab>new_name " \
                 "This option is downstream of `--trunc_name` [default: %default]")
    p.add_option("--leafcolorfile", default=None,
                 help="path to a mapping file containing font colors " \
                 "for the OTUs: leafname<tab>color [default: %default]")
    p.set_outdir()
    opts, args, iopts = p.set_image_options(figsize="8x6")
    input = opts.input
    outdir = opts.outdir
    combine = opts.combine.split("x")
    trunc_name = opts.trunc_name
    SH = opts.SH

    mkdir(outdir)
    if not input:
        sys.exit(not p.print_help())
    elif op.isfile(input):
        trees_file = input
        treenames = [op.basename(input)]
    elif op.isdir(input):
        trees_file = op.join(outdir, "alltrees.dnd")
        treenames = []
        for f in sorted(os.listdir(input)):
            sh("cat {0}/{1} >> {2}".format(input, f, trees_file), log=False)
            treenames.append(f)
    else:
        sys.exit(not p.print_help())

    trees = OrderedDict()
    tree = ""
    i = 0
    for row in LineFile(trees_file, comment="#", load=True).lines:
        if i == len(treenames):
            break
        if not len(row):
            continue

        if ";" in row:
            # sanity check
            if row.index(";") != len(row)-1:
                ts = row.split(";")
                for ii in xrange(len(ts)-1):
                    ts[ii] += ";"
            else:
                ts = [row]
            for t in ts:
                if ";" in t:
                    tree += t
                    if tree:
                        trees[treenames[i]] = tree
                        tree = ""
                        i+=1
                else:
                    tree += t
        else:
            tree += row

    logging.debug("A total of {0} trees imported.".format(len(trees)))
    sh("rm {0}".format(op.join(outdir, "alltrees.dnd")))

    _draw_trees(trees, nrow=int(combine[0]), ncol=int(combine[1]), rmargin=.3,\
         iopts=iopts, outdir=outdir, shfile=SH, trunc_name=trunc_name, \
         scutoff=opts.scutoff, barcodefile = opts.barcode,
         leafcolorfile=opts.leafcolorfile)
Example #42
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        c.write_file("maker_opts.ctl")
        os.chdir(cwd)

    jobs = "jobs"
    fw = open(jobs, "w")
    print("\n".join(dirs), file=fw)
    fw.close()

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    if engine == "PBS":
        return

    # qsub script
    outfile = "maker.\$TASK_ID.out"
    p = GridProcess(runfile,
                    outfile=outfile,
                    errfile=outfile,
                    arr=ncmds,
                    grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
Example #43
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    p = OptionParser(refine.__doc__)
    p.add_option(
        "--closest",
        default=False,
        action="store_true",
        help="In case of no gaps, use closest",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    ncols = len(next(open(breakpointsbed)).split())
    logging.debug("File %s contains %d columns.", breakpointsbed, ncols)
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            assert gap[-3] == "."
            print("\t".join(b), file=nogapsfw)
            continue

        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print("\t".join(maxgap), file=largestgapsfw)

    nogapsfw.close()
    largestgapsfw.close()
    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
Example #44
    def __init__(self, filelist, verbose=True):

        filelist = [x for x in filelist if x and op.exists(x)]
        cmd = "rm -rf {0}".format(" ".join(filelist))
        sh(cmd, log=verbose)
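A usage sketch of this constructor (hypothetical file names), matching the FileShredder(toclean) call in Example #43:

FileShredder(["tmp.bed", "tmp.bed.gz", None])
# drops empty/missing entries, then runs on the rest: rm -rf tmp.bed tmp.bed.gz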
Example #45
def run_formatdb(infile=None, outfile=None, dbtype="nucl"):
    cmd = "makeblastdb"
    cmd += " -dbtype {0} -in {1}".format(dbtype, infile)
    sh(cmd)
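A sketch of the command produced (hypothetical file name):

run_formatdb(infile="contigs.fasta")
# runs: makeblastdb -dbtype nucl -in contigs.fasta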
Example #46
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* may be slow with large datasets

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail-truncated names can be provided so long as they are unique among the
    seqs. If not unique, the first occurrence will be used. For example, if you
    have two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are
    monophyletic, in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt", \
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: " \
                    "gene_name_pattern<tab>species_name [default: %default]")
    p.set_outdir()

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        assert stree and smap, "TreeFix requires stree and smap files."
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    p_recs = list(SeqIO.parse(open(protein_file), "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    n_recs = list(SeqIO.parse(open(dna_file), "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. " \
            "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta

    else:
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    if neighbor:
        out_file = op.join(treedir, op.basename(dna_file).rsplit(".", 1)[0] + \
                ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(alignment, \
                outfile=out_file, outgroup=outgroup, work_dir=treedir)
        except:
            print("NJ tree cannot be built for {0}".format(dna_file))

        if opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, op.basename(dna_file).rsplit(".", 1)[0] + \
                ".ML.unrooted.dnd")

        if opts.ml == "phyml":
            try:
                outfile, phy_file = build_ml_phyml\
                    (alignment, outfile=out_file, work_dir=treedir)
            except:
                print("ML tree cannot be built for {0}".format(dna_file))

        elif opts.ml == "raxml":
            try:
                outfile, phy_file = build_ml_raxml\
                    (alignment, outfile=out_file, work_dir=treedir)
            except:
                print("ML tree cannot be built for {0}".format(dna_file))

        if outgroup:
            new_out_file = out_file.replace(".unrooted", "")
            t = smart_reroot(treefile=out_file, outgroupfile=outgroup, \
                outfile=new_out_file)
            if t == new_out_file:
                sh("rm %s" % out_file)
                outfile = new_out_file

        if opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

        if opts.treefix:
            treefix_dir = op.join(treedir, "treefix")
            assert mkdir(treefix_dir, overwrite=True)

            sh("cp {0} {1}/".format(outfile, treefix_dir))
            input = op.join(treefix_dir, op.basename(outfile))
            aln_file = input.rsplit(".", 1)[0] + ".fasta"
            SeqIO.write(alignment, aln_file, "fasta")

            outfile = run_treefix(input=input, stree_file=stree, smap_file=smap, \
                        a_ext=".fasta", o_ext=".dnd", n_ext = ".treefix.dnd")

    return outfile
Example #47
    def run(self, cpus=1):
        cmd = "make -j {0} -f {1}".format(cpus, self.makefile)
        sh(cmd)
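A minimal sketch of a container class for the method above (hypothetical; the original shows only the method):

class MakeRunner(object):
    def __init__(self, makefile):
        self.makefile = makefile  # path to the Makefile driving the jobs

    def run(self, cpus=1):
        cmd = "make -j {0} -f {1}".format(cpus, self.makefile)
        sh(cmd)

MakeRunner("assembly.mk").run(cpus=8)  # runs: make -j 8 -f assembly.mk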
Example #48
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth",
                 default=3,
                 type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan",
                 default=30,
                 type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split",
                 default=False,
                 action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the deletions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletion calls whose intervals still show read depth (likely false positives)
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored)  UB={1:.2f}".format(
                lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that have sequencing gaps on their flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        branges = [Range(x.seqid, x.start, x.end, int(x.accn.rsplit("r")[-1]), i) \
                        for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".\
                        format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()
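The candidate deletions above come from `range_interleave`, which (assuming jcvi's semantics) returns the uncovered gaps between consecutive ranges of the same read. A self-contained sketch of that idea:

def interleave(branges):
    # branges: sorted (seqid, start, end) tuples from one read, same seqid.
    gaps = []
    for (seqid, _, aend), (_, bstart, _) in zip(branges, branges[1:]):
        if bstart > aend + 1:
            gaps.append((seqid, aend + 1, bstart - 1))
    return gaps

# A read mapping in two pieces implies a candidate deletion between them:
print(interleave([("chr1", 100, 200), ("chr1", 251, 400)]))
# [('chr1', 201, 250)]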
Example #49
def snap(args):
    """
    %prog snap species gffile fastafile

    Train SNAP model given gffile and fastafile. Whole procedure taken from:
    <http://gmod.org/wiki/MAKER_Tutorial_2012>
    """
    p = OptionParser(snap.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.maker_home
    snapdir = "snap"
    mkdir(snapdir)

    cwd = os.getcwd()
    os.chdir(snapdir)

    newgffile = "training.gff3"
    logging.debug("Construct GFF file combined with sequence ...")
    sh("cat ../{0} > {1}".format(gffile, newgffile))
    sh('echo "##FASTA" >> {0}'.format(newgffile))
    sh("cat ../{0} >> {1}".format(fastafile, newgffile))

    logging.debug("Make models ...")
    sh("{0}/src/bin/maker2zff training.gff3".format(mhome))
    sh("{0}/exe/snap/fathom -categorize 1000 genome.ann genome.dna".format(
        mhome))
    sh("{0}/exe/snap/fathom -export 1000 -plus uni.ann uni.dna".format(mhome))
    sh("{0}/exe/snap/forge export.ann export.dna".format(mhome))
    sh("{0}/exe/snap/hmm-assembler.pl {1} . > {1}.hmm".format(mhome, species))

    os.chdir(cwd)
    logging.debug("SNAP matrix written to `{0}/{1}.hmm`".format(
        snapdir, species))
Example #50
    if tag == "all":
        sh("qdel -u {0}".format(username))
        return

    valid_jobids = set()
    method = opts.method or guess_method(tag)
    if method == "jobid":
        jobids = tag.split(",")
        valid_jobids |= set(jobids)
    elif method == "pattern":
        qsxmlcmd = 'qstat -u "{0}" -j "{1}" -nenv -njd -xml'.\
                                format(username, tag)
        try:
            qsxml = check_output(shlex.split(qsxmlcmd)).strip()
        except CalledProcessError:
            qsxml = None
            logging.debug('No jobs matching the pattern "{0}"'.format(tag))

        if qsxml is not None:
            for job in ET.fromstring(qsxml).findall("djob_info"):
                for elem in job.findall("element"):
                    jobid = elem.find("JB_job_number").text
                    valid_jobids.add(jobid)

    if valid_jobids:
        sh("qdel {0}".format(",".join(valid_jobids)))


if __name__ == '__main__':
    main()
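The job-id extraction above depends on the exact shape of `qstat -xml` output. A self-contained sketch against a made-up fragment (the element names follow the code above; the sample document itself is an assumption):

import xml.etree.ElementTree as ET

# Hypothetical qstat -xml fragment with the element names used above.
qsxml = """<detailed_job_info>
  <djob_info>
    <element><JB_job_number>12345</JB_job_number></element>
    <element><JB_job_number>12346</JB_job_number></element>
  </djob_info>
</detailed_job_info>"""

valid_jobids = set()
for job in ET.fromstring(qsxml).findall("djob_info"):
    for elem in job.findall("element"):
        valid_jobids.add(elem.find("JB_job_number").text)

print(",".join(sorted(valid_jobids)))  # 12345,12346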
Example #51
def do_cleanup(minibam, realignbam):
    sh("rm -f {}* {}*".format(minibam, realignbam))
Example #52
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--autotrain",
                 default=False,
                 action="store_true",
                 help="Run autoAugTrain.pl to iteratively train AUGUSTUS")
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    target = "{0}/config/species/{1}".format(mhome, species)

    if op.exists(target):
        logging.debug("Removing existing target `{0}`".format(target))
        sh("rm -rf {0}".format(target))

    config_path = "{0}/config".format(mhome)
    sh("{0}/scripts/new_species.pl --species={1} --AUGUSTUS_CONFIG_PATH={2}".
       format(mhome, species, config_path))
    sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".\
            format(mhome, gffile, fastafile))
    sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".\
            format(mhome, species))
    sh("cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst"
       )
    sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".\
            format(mhome))
    sh("grep -c LOCUS raw.gb training.gb")

    # autoAugTrain failed to execute, disable for now
    if opts.autotrain:
        sh("rm -rf {0}".format(target))
        sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".\
                format(mhome, species))

    os.chdir(cwd)
    sh("cp -r {0} augustus/".format(target))
Example #53
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly",
                 default=False,
                 action="store_true",
                 help="Realign only")
    p.add_option("--svonly",
                 default=False,
                 action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support",
                 default=1,
                 type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        cmd = "samtools index {}".format(chrMfa)
        sh(cmd)

    if not bamfile.endswith(".bam"):
        bamfiles = [x.strip() for x in open(bamfile)]
    else:
        bamfiles = [bamfile]

    if store:
        computed = ls_s3(store)
        computed = [
            op.basename(x).split('.')[0] for x in computed
            if x.endswith(".depth")
        ]
        remaining_samples = [
            x for x in bamfiles if op.basename(x).split(".")[0] not in computed
        ]

        logging.debug("Already computed on `{}`: {}".format(
            store,
            len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(chrMfa,
                 bamfile,
                 opts,
                 realignonly=opts.realignonly,
                 svonly=opts.svonly,
                 store=store,
                 cleanup=cleanup)
Example #54
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K",
                 default=23,
                 type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage",
                 default=40,
                 type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix",
                 default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist",
                 default=False,
                 action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    gzip = fq.endswith(".gz")

    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".\
                    format(human_size(totalfilesize,
                           a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
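The `-s` hash size above is simply the total input size in bytes divided by the expected coverage, a rough proxy for the number of distinct k-mers. A quick worked example with hypothetical inputs:

totalfilesize = 40 * 10**9   # e.g. 40 GB of FASTQ (hypothetical)
coverage = 40                # expected sequence coverage
hashsize = totalfilesize // coverage
print(hashsize)              # 1000000000 -> jellyfish count ... -s 1000000000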
Example #55
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option(
        "--refcov",
        default=0.01,
        type="float",
        help="Minimum reference coverage",
    )
    p.add_option(
        "--all",
        default=False,
        action="store_true",
        help="Plot one pdf file per ref in refidsfile",
    )
    p.add_option(
        "--color",
        default="similarity",
        choices=("similarity", "direction", "none"),
        help="Color the dots based on",
    )
    p.add_option(
        "--nolayout",
        default=False,
        action="store_true",
        help="Do not rearrange contigs",
    )
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (deltafile,) = args
    reffasta, queryfasta = open(deltafile).readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter(
        [deltafile, "--pctid={0}".format(pctid), "--hitlen={0}".format(hitlen)]
    )

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries(
                [r],
                qsizes,
                rsizes,
                deltafile,
                refcov,
                prefix=prefix,
                color=color,
                layout=layout,
            )
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(
            refs,
            qsizes,
            rsizes,
            deltafile,
            refcov,
            prefix=prefix,
            color=color,
            layout=layout,
        )
Example #56
def run_mito(chrMfa,
             bamfile,
             opts,
             realignonly=False,
             svonly=False,
             store=None,
             cleanup=False):
    from jcvi.formats.sam import get_minibam
    region = "chrM"
    minibam = op.basename(bamfile).replace(".bam", ".{}.bam".format(region))
    if not op.exists(minibam):
        get_minibam(bamfile, region)
    else:
        logging.debug("{} found. Skipped.".format(minibam))

    speedseq_bin = op.join(opts.speedseq_home, "speedseq")

    realign = minibam.rsplit(".", 1)[0] + ".realign"
    realignbam = realign + ".bam"
    margs = " -v -t {} -o {}".format(opts.cpus, realign)
    if need_update(minibam, realignbam):
        cmd = speedseq_bin + " realign"
        cmd += margs
        cmd += " {} {}".format(chrMfa, minibam)
        sh(cmd)
    else:
        logging.debug("{} found. Skipped.".format(realignbam))

    if realignonly:
        return

    depthfile = realign + ".depth"
    if need_update(realignbam, depthfile):
        coverage([
            chrMfa, realignbam, "--nosort", "--format=coverage",
            "--outfile={}".format(depthfile)
        ])

    if store:
        push_to_s3(store, depthfile)

    vcffile = realign + ".sv.vcf.gz"
    if need_update(realignbam, vcffile):
        cmd = speedseq_bin + " sv"
        cmd += margs
        cmd += " -R {}".format(chrMfa)
        cmd += " -m {}".format(opts.support)
        cmd += " -B {} -D {} -S {}".format(realignbam,
                                           realign + ".discordants.bam",
                                           realign + ".splitters.bam")
        sh(cmd)
    else:
        logging.debug("{} found. Skipped.".format(vcffile))

    if store:
        push_to_s3(store, vcffile)

    if svonly:
        if cleanup:
            do_cleanup(minibam, realignbam)
        return

    piledriver = realign + ".piledriver"
    if need_update(realignbam, piledriver):
        cmd = "bamtools piledriver -fasta {}".format(chrMfa)
        cmd += " -in {}".format(realignbam)
        sh(cmd, outfile=piledriver)

    if store:
        push_to_s3(store, piledriver)

    if cleanup:
        do_cleanup(minibam, realignbam)
Example #57
def info(args):
    """
    %prog info casfile <fastafile>

    Wraps around `assembly_info` and get the following block.

    General info:
    Read info:
    Coverage info:

    In particular, the read info will be reorganized so that it shows the
    percentage of unmapped, mapped, unique and multi-hit reads.

    When --coverage is used, the program expects a second fastafile to replace
    the contig IDs with real ones.

    RPKM = 10^9 x C / (N x L)

    C = the number of mappable reads that fell within the gene's exons
    N = total number of mappable reads in the experiment
    L = the sum of the exon lengths in base pairs.
    """
    from jcvi.utils.cbook import percentage

    p = OptionParser(info.__doc__)
    p.add_option(
        "--coverage",
        default=False,
        action="store_true",
        help="Generate coverage output, replacing IDs [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    casfile = args[0]
    pf = casfile.rsplit(".", 1)[0]

    if opts.coverage:
        assert len(args) == 2, "You need a fastafile when using --coverage"
        coveragefile = pf + ".coverage"
        fw = open(coveragefile, "w")

    infofile = pf + ".info"
    cmd = "assembly_info {0}".format(casfile)
    if not op.exists(infofile):
        sh(cmd, outfile=infofile, grid=opts.grid)

    inreadblock = False
    incontigblock = False

    fp = open(infofile)
    row = fp.readline()
    while row:
        if row.startswith("Read info:"):
            inreadblock = True
        elif row.startswith("Contig info:"):
            incontigblock = True

        # Following looks like a hack, but to keep compatible between
        # CLC 3.20 and CLC 4.0 beta
        if inreadblock:
            atoms = row.split('s')

            last = atoms[-1].split()[0] if len(atoms) > 1 else "0"
            srow = row.strip()

            if srow.startswith("Reads"):
                reads = int(last)
            if srow.startswith("Unmapped") or srow.startswith("Unassembled"):
                unmapped = int(last)
            if srow.startswith("Mapped") or srow.startswith("Assembled"):
                mapped = int(last)
            if srow.startswith("Multi"):
                multihits = int(last)

            if row.startswith("Coverage info:"):
                # Print the Read info: block
                print "Read info:"
                assert mapped + unmapped == reads

                unique = mapped - multihits
                print
                print "Total reads: {0}".format(reads)
                print "Unmapped reads: {0}".format(
                    percentage(unmapped, reads, False))
                print "Mapped reads: {0}".format(
                    percentage(mapped, reads, False))
                print "Unique reads: {0}".format(
                    percentage(unique, reads, False))
                print "Multi hit reads: {0}".\
                        format(percentage(multihits, reads, False))
                print
                inreadblock = False

        if incontigblock and opts.coverage:

            fastafile = args[1]
            s = Sizes(fastafile)
            while row:
                atoms = row.split()
                if len(atoms) == 4 and atoms[0][0] != "C":  # Contig
                    # Contig       Sites       Reads     Coverage
                    contig, sites, reads, coverage = atoms
                    contig = int(contig) - 1
                    size = s.sizes[contig]
                    contig = s.ctgs[contig]
                    assert size == int(sites)

                    # See formula above
                    rpkm = 1e9 * int(reads) / (size * mapped)
                    print("\t".join(
                        (contig, sites, reads, "{0:.1f}".format(rpkm))), file=fw)

                row = fp.readline()

        row = fp.readline()
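A quick worked example of the RPKM formula from the docstring, matching the computation in the contig loop above (all numbers made up):

C = 500          # mappable reads on this contig (hypothetical)
N = 10000000     # total mappable reads in the experiment (hypothetical)
L = 2000         # contig length in bp (hypothetical)
rpkm = 1e9 * C / (N * L)
print("{0:.1f}".format(rpkm))  # 25.0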
Example #58
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang",
                 default=GoodOverhang,
                 type="int",
                 help="Maximum overhang length")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl,
                    a.component_span,
                    b.component_span,
                    cutoff,
                    qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".format(
        len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print("Remove {0}".format(c), file=sys.stderr)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that have a modified CLR
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
Example #59
def eagle(args):
    """
    %prog eagle fastafile

    """
    p = OptionParser(eagle.__doc__)
    p.add_option("--share", default="/usr/local/share/EAGLE/",
                 help="Default EAGLE share path")
    add_sim_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    share = opts.share
    depth = opts.depth
    readlen = opts.readlen
    distance = opts.distance
    pf = op.basename(fastafile).split(".")[0]

    # EAGLE does not natively support read lengths other than 100bp and
    # 250bp, so for an arbitrary read length we need to generate a set of
    # support files

    # First file is the Runinfo
    runinfo_readlen = "RunInfo_PairedReads2x{}Cycles1x1Tiles.xml".format(readlen)
    if not op.exists(runinfo_readlen):
        runinfo = op.join(share, "RunInfo/RunInfo_PairedReads2x251Cycles1x1Tiles.xml")
        runinfo_xml = open(runinfo).read()
        runinfo_xml = runinfo_xml.replace("251", str(readlen))\
                                 .replace("252", str(readlen + 1))\
                                 .replace("502", str(2 * readlen))
        fw = open(runinfo_readlen, "w")
        print(runinfo_xml.strip(), file=fw)
        fw.close()

    # Generate quality profiles
    quality_file1 = "QualityTable.read1.length{}.qval".format(readlen)
    quality_file2 = "QualityTable.read2.length{}.qval".format(readlen)
    if not (op.exists(quality_file1) and op.exists(quality_file2)):
        for i, qq in enumerate([quality_file1, quality_file2]):
            cmd  = "/usr/local/libexec/EAGLE/scaleQualityTable.pl"
            cmd += " --input {}".format(op.join(share,
                "QualityTables/DefaultQualityTable.read{}.length101.qval".format(i + 1)))
            cmd += " --cycles {}".format(readlen)
            cmd += " --output {}".format(qq)
            sh(cmd, silent=True)

    # Since distance is different from the default distribution which is
    # centered around 319, we shift our peak to the new peak
    template_lengths = op.join(share,
                "TemplateLengthTables/DefaultTemplateLengthTable.tsv")
    template_distance = "TemplateLengthTable{}.tsv".format(distance)
    shift = distance - 319
    if not op.exists(template_distance):
        fp = open(template_lengths)
        fw = open(template_distance, "w")
        for row in fp:
            size, counts = row.split()
            size = int(size)
            counts = int(counts)
            size += shift
            if size < readlen:
                continue
            print("\t".join(str(x) for x in (size, counts)), file=fw)
        fw.close()

    # All done, let's simulate!
    cmd  = "configureEAGLE.pl"
    cmd += " --reference-genome {}".format(fastafile)
    cmd += " --coverage-depth {}".format(depth)
    cmd += " --gc-coverage-fit-table {}".format(op.join(share,
                "GcCoverageFitTables/Homo_sapiens.example1.tsv"))
    cmd += " --run-info {}".format(runinfo_readlen)
    cmd += " --quality-table {}".format(quality_file1)
    cmd += " --quality-table {}".format(quality_file2)
    cmd += " --template-length-table {}".format(template_distance)
    cmd += " --random-seed {}".format(random.randint(1, 65535))
    sh(cmd, silent=True)

    # Retrieve results
    outpf = opts.outfile or "{0}.{1}bp.{2}x".format(pf, distance, depth)
    outpf += ".bwa"
    cwd = os.getcwd()
    eagle_dir = "EAGLE"
    os.chdir(eagle_dir)
    sh("make bam", silent=True)

    # Convert BAM to FASTQ
    from jcvi.formats.sam import fastq
    a, b = fastq(["eagle.bam", outpf])
    sh("mv {} {} ../".format(a, b))
    os.chdir(cwd)

    # Clean-up
    shutil.rmtree(eagle_dir)
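The template-length rewrite above translates the whole distribution so its default peak at 319 moves to the requested insert size. A tiny illustration with a hypothetical three-row table:

rows = [(318, 10), (319, 50), (320, 12)]  # (size, counts), peak at 319
distance, readlen = 500, 150              # hypothetical settings
shift = distance - 319

shifted = [(size + shift, counts) for size, counts in rows
           if size + shift >= readlen]
print(shifted)  # [(499, 10), (500, 50), (501, 12)] -- peak now at 500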
Example #60
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(
        p, "Input file options (required)",
        "Note: Please choose from and provide values for one of the following parameters"
    )
    g1.add_option("--input_file",
                  default=None,
                  help="input file of reads [default: %default]")
    g1.add_option(
        "--input_folder",
        default=None,
        help="input folder containing multi FASTA files of reads [default: %default]",
    )
    g1.add_option(
        "--input_file_list",
        default=None,
        help="list file containing paths to multi FASTA files of reads [default: %default]",
    )
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
                 "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
                 "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
                 "Same as cap3 `-s` parameter.")
    g2.add_option(
        "-x",
        "--prefix",
        dest="prefix",
        default="cap3",
        help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    p.set_params()

    opts, args = p.parse_args(args)

    if opts.max_gap_len and opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit()
    elif opts.ovl_pct_id and opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit()
    elif opts.ovl_sim_score and opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit()

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(
                opts.input_file_list))
            sys.exit()
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(
                opts.input_folder))
            sys.exit()

        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + f for f in os.listdir(opts.input_folder)
                     if f.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if len(file_list) == 0:
        logging.warning(
            "List of files to process is empty. Please check your input!")
        sys.exit()

    for fname in file_list:
        if not op.isfile(fname):
            logging.warning("Input file {0} does not exist".format(fname))
        else:
            cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
                fname, opts.max_gap_len, opts.ovl_pct_id,
                opts.ovl_sim_score, opts.prefix)
            if opts.extra:
                cmd += " {0}".format(opts.extra)
            logfile = "{0}.{1}.log".format(fname, opts.prefix)

            sh(cmd, outfile=logfile)
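Given the option handling above, a typical invocation and the `cap3` command line it expands to might look like this (paths hypothetical):

# Assemble every FASTA file under reads/ with a stricter overlap identity:
assemble(["--input_folder=reads/", "-p", "95", "--prefix", "cap3run"])

# For reads/sample1.fasta this runs, logging to reads/sample1.fasta.cap3run.log:
#   cap3 reads/sample1.fasta -f 20 -p 95 -s 900 -x cap3run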