Example #1
File: base.py  Project: linlifeng/jcvi
def must_open(filename, mode="r", checkexists=False, skipcheck=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.
    """
    if isinstance(filename, list):
        assert "r" in mode

        import fileinput
        return fileinput.input(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile
        fp = NamedTemporaryFile(delete=False)

    elif filename.endswith(".gz"):
        if 'r' in mode:
            cmd = "zcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif 'w' in mode:
            import gzip
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if 'r' in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif 'w' in mode:
            import bz2
            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = (not op.exists(filename)) if skipcheck \
                        else check_exists(filename)
            if overwrite:
                fp = open(filename, "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."\
                        .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
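A brief usage sketch of the must_open() helper above; the file names are hypothetical, the point being that the caller gets a line-iterable handle whether the input is plain text, gzip-compressed, or a list of files.

# Hypothetical inputs, for illustration only.
fp = must_open("annotations.gff3.gz")      # read .gz transparently via zcat
ngff = sum(1 for _ in fp)

fp = must_open(["a.bed", "b.bed"])         # fileinput over several plain files
nbed = sum(1 for _ in fp)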
Example #2
File: aws.py  Project: ascendo/jcvi
def ls_s3(s3_store_obj_name):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0}/".format(s3_store_obj_name)
    contents = []
    for row in popen(cmd):
        contents.append(row.split()[-1])
    return contents
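A hedged usage sketch (the S3 prefix is made up): ls_s3() simply wraps `aws s3 ls` and keeps the last whitespace-separated column of each output row, so it needs a configured aws CLI.

# Hypothetical prefix; prints one object or prefix name per line.
for name in ls_s3("s3://my-bucket/assemblies"):
    print(name)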
Example #3
File: sam.py  Project: sophy7074/jcvi
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    p = OptionParser(append.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    icmd = "samtools view -h {0}".format(bamfile)
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    p = Popen(ocmd, stdin=PIPE)
    for row in popen(icmd):
        if row[0] == '@':
            print >> p.stdin, row.strip()
        else:
            s = SamLine(row)
            s.update_readname()
            print >> p.stdin, s
Example #4
File: ks.py  Project: bennyyu/jcvi
def get_mixture(data, components):
    """
    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    probs, mus, sigmas = [], [], []
    fw = must_open("tmp", "w")
    log_data = [log(x) for x in data if x > .05]
    data = "\n".join(["%.4f" % x for x in log_data]).replace("inf\n", "")
    fw.write(data)
    fw.close()

    cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)
    pipe = popen(cmd)

    for row in pipe:
        if row[0] != '#':
            continue

        atoms = row.split(",")
        a, b, c = atoms[1:4]
        a = float(a)
        b = float(b)
        c = float(c)

        mus.append(a)
        sigmas.append(b)
        probs.append(c)

    os.remove(fw.name)
    return probs, mus, sigmas
Example #5
File: blast.py  Project: zengxiaofei/jcvi
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    to get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(top10.__doc__)
    p.add_option("--top", default=10, type="int",
                help="Top N taxa to extract [default: %default]")
    p.add_option("--ids", default=None,
                help="Two column ids file to query seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    mapping = DictFile(opts.ids, delimiter="\t") if opts.ids else {}

    cmd = "cut -f2 {0}".format(blastfile)
    cmd += " | sort | uniq -c | sort -k1,1nr | head -n {0}".format(opts.top)
    fp = popen(cmd)
    for row in fp:
        count, seqid = row.split()
        nseqid = mapping.get(seqid, seqid)
        print "\t".join((count, nseqid))
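The shell pipeline in top10() (`cut -f2 | sort | uniq -c | sort -k1,1nr | head`) can also be written in pure Python; a sketch, assuming a plain tab-separated BLAST tabular file:

from collections import Counter

def top_hits(blastfile, n=10):
    # Tally column 2 (the subject id) and return the n most frequent hits,
    # mirroring: cut -f2 | sort | uniq -c | sort -k1,1nr | head -n N
    counts = Counter(row.split("\t")[1] for row in open(blastfile))
    return counts.most_common(n)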
Example #6
File: fimo.py  Project: orionzhou/nf
def locate(args):
    fi, fo = args.fi, args.fo
    seq = args.seq
    mtfs = read_motif(fi, args.motif)
    #
    mtf_str = " ".join([f'--motif {mid}' for mid, wd, score in mtfs])
    pre = f"tmp.lc{random.randrange(1000)}"
    #
    sh(f'fimo --bfile --motif-- {mtf_str} --thresh 1e-4 --skip-matched-sequence --text {fi} {seq} > {pre}_0.txt'
       )
    for mid, wd, score in mtfs:
        sh(f'grep -P "^{mid}\t" {pre}_0.txt > {pre}_0a.txt')
        #
        score_thresh = score
        if not score:
            xh = popen(
                f'cut -f7 {pre}_0a.txt | sed \'1d\' | sort -k1,1nr | head')
            max_score = float(xh.readline().decode("utf-8").strip())
            score_thresh = max_score * args.score_thresh
        #
        sh("bioawk -tH '{if($7>%f) {print $1\"%%\"$3, $4-1, $5}}' %s_0a.txt > %s_1.bed"
           % (score_thresh, pre, pre))
        hwd = round(wd * args.motif_frac)
        if os.stat(f"{pre}_1.bed").st_size == 0:
            sh(f'touch {pre}_4_{mid}.bed')
        else:
            sh(f'sortBed -i {pre}_1.bed | mergeBed > {pre}_2.bed')
            sh(f'bedtools makewindows -w {wd} -b {pre}_2.bed > {pre}_3.bed')
            sh(f'bed.py filter --minsize {hwd} {pre}_3.bed > {pre}_4_{mid}.bed'
               )
    sh(f'cat {pre}_4_*.bed > {fo}')
    if not args.debug:
        sh(f'rm -rf {pre}_*')
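The `cut -f7 ... | sort -k1,1nr | head` pipe above exists only to find the maximum score in column 7 of the FIMO output; a pure-Python sketch of that single step (column position taken from the command, first line skipped like `sed '1d'`):

def max_score_col7(path):
    # Maximum of column 7 (1-based), skipping the single header line.
    with open(path) as fh:
        next(fh)
        return max(float(line.split("\t")[6]) for line in fh)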
Example #7
File: blast.py  Project: bennyyu/jcvi
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    to get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(top10.__doc__)
    p.add_option("--ids", default=None,
                help="Two column ids file to query seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    mapping = DictFile(opts.ids, delimiter="\t") if opts.ids else {}

    cmd = "cut -f2 {0}".format(blastfile)
    cmd += " | sort | uniq -c | sort -k1,1nr | head"
    fp = popen(cmd)
    for row in fp:
        count, seqid = row.split()
        nseqid = mapping.get(seqid, seqid)
        print "\t".join((count, nseqid))
Example #8
File: sam.py  Project: arvin580/jcvi
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    p = OptionParser(append.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    icmd = "samtools view -h {0}".format(bamfile)
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    p = Popen(ocmd, stdin=PIPE)
    for row in popen(icmd):
        if row[0] == '@':
            print >> p.stdin, row.strip()
        else:
            s = SamLine(row)
            s.update_readname()
            print >> p.stdin, s
Example #9
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    p = OptionParser(append.__doc__)
    p.add_option("--prepend", help="Prepend string to read names")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile,) = args
    prepend = opts.prepend

    icmd = "samtools view -h {0}".format(bamfile)
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    p = Popen(ocmd, stdin=PIPE)
    for row in popen(icmd):
        if row[0] == "@":
            print(row.strip(), file=p.stdin)
        else:
            s = SamLine(row)
            if prepend:
                s.qname = prepend + "_" + s.qname
            else:
                s.update_readname()
            print(s, file=p.stdin)
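One step these append() variants leave implicit is finishing the pipe: once all SAM lines have been streamed in, the samtools child normally needs its stdin closed and its exit status collected so the output BAM is fully written. A minimal sketch, assuming p is the Popen handle created above:

# Sketch only: close the pipe feeding samtools, then wait for it to finish.
p.stdin.close()
retcode = p.wait()
assert retcode == 0, "samtools view exited with code {}".format(retcode)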
Example #10
File: ks.py  Project: LongZhao1992/jcvi
def get_mixture(data, components):
    """
    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    probs, mus, sigmas = [], [], []
    fw = must_open("tmp", "w")
    log_data = [log(x) for x in data if x > .05]
    data = "\n".join(["%.4f" % x for x in log_data]).replace("inf\n", "")
    fw.write(data)
    fw.close()

    cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)
    pipe = popen(cmd)

    for row in pipe:
        if row[0] != '#':
            continue

        atoms = row.split(",")
        a, b, c = atoms[1:4]
        a = float(a)
        b = float(b)
        c = float(c)

        mus.append(a)
        sigmas.append(b)
        probs.append(c)

    os.remove(fw.name)
    return probs, mus, sigmas
Example #11
File: aws.py  Project: qiao-xin/jcvi
def ls_s3(s3_store_obj_name):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0}/".format(s3_store_obj_name)
    contents = []
    for row in popen(cmd):
        contents.append(row.split()[-1])
    return contents
Example #12
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed batch
    will be written to a directory for additional work.
    """
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    fw = open(failed, "w")
    print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)
    fw.close()

    nlines = sum(1 for x in open("FAILED"))
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".\
                    format(genome, failed_ids, failed_fasta)
        sh(cmd)
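The completion check above shells out to `tail maker.*.out | grep -c "now finished"`; a rough pure-Python equivalent, assuming the same maker.*.out log naming, would be:

import glob

def count_finished(pattern="maker.*.out"):
    # Roughly: tail maker.*.out | grep -c "now finished"
    # (counts log files whose text contains the MAKER completion marker).
    return sum(1 for f in glob.glob(pattern)
               if "now finished" in open(f).read())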
Example #13
File: maker.py  Project: tanghaibao/jcvi
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed batch
    will be written to a directory for additional work.
    """
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    fw = open(failed, "w")
    print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)
    fw.close()

    nlines = sum(1 for x in open("FAILED"))
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".\
                    format(genome, failed_ids, failed_fasta)
        sh(cmd)
Example #14
def ls_s3(s3_store_obj_name, recursive=False):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0}/".format(s3_store_obj_name)
    if recursive:
        cmd += " --recursive"
    contents = []
    for row in popen(cmd):
        contents.append(row.split()[-1])
    return contents
Example #15
def ls_s3(s3_store_obj_name, recursive=False):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0}/".format(s3_store_obj_name)
    if recursive:
        cmd += " --recursive"
    contents = []
    for row in popen(cmd):
        contents.append(row.split()[-1])
    return contents
Example #16
File: cnv.py  Project: zhimenggan/jcvi
def compare_worker(arg):
    cnvoutput, truths = arg
    cmd = "intersectBed -f .5 -F .5"
    cmd += " -a {} -b {} | wc -l".format(cnvoutput, truths)
    nlines = int(popen(cmd, debug=False).read())
    target_lines = len([x for x in open(cnvoutput)])
    truths_lines = len([x for x in open(truths)])
    precision = nlines * 100. / target_lines
    recall = nlines * 100. / truths_lines
    d = "\t".join(str(x) for x in (cnvoutput, truths,
                                   nlines, target_lines, truths_lines,
                                   precision, recall))
    return d
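For reference, the two percentages computed by compare_worker() are ordinary precision and recall over CNV calls; a standalone sketch with a divide-by-zero guard (the guard is an addition, not part of the example above):

def precision_recall(nshared, ncalled, ntruth):
    # precision = shared / called, recall = shared / truth, both in percent.
    precision = nshared * 100.0 / ncalled if ncalled else 0.0
    recall = nshared * 100.0 / ntruth if ntruth else 0.0
    return precision, recall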
Example #17
File: cnv.py  Project: xuanblo/jcvi
def compare_worker(arg):
    cnvoutput, truths = arg
    cmd = "intersectBed -f .5 -F .5"
    cmd += " -a {} -b {} | wc -l".format(cnvoutput, truths)
    nlines = int(popen(cmd, debug=False).read())
    target_lines = len([x for x in open(cnvoutput)])
    truths_lines = len([x for x in open(truths)])
    precision = nlines * 100. / target_lines
    recall = nlines * 100. / truths_lines
    d = "\t".join(str(x) for x in (cnvoutput, truths,
                                   nlines, target_lines, truths_lines,
                                   precision, recall))
    return d
Example #18
def consolidate(nbedfile, obedfile, cbedfile):
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        if ";" in b.accn:
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print(b, file=fp)
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
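One caveat in the accession merging above: iterating a set does not preserve the original order of the semicolon-joined names. If order matters, an order-preserving de-duplication can be sketched with dict.fromkeys (Python 3.7+):

def dedup_accns(accn_field):
    # Keep the first occurrence of each accession, in the original order.
    return ";".join(dict.fromkeys(accn_field.split(";")))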
Example #19
def get_splits(split_bed, gff_file, stype, key):
    """
    Use intersectBed to find the fused gene => split genes mappings.
    """
    bed_file = get_bed_file(gff_file, stype, key)
    cmd = "intersectBed -a {0} -b {1} -wao".format(split_bed, bed_file)
    cmd += " | cut -f4,10"
    p = popen(cmd)
    splits = defaultdict(set)
    for row in p:
        a, b = row.split()
        splits[a].add(b)

    return splits
Example #20
File: maker.py  Project: bennyyu/jcvi
def get_splits(split_bed, gff_file, stype, key):
    """
    Use intersectBed to find the fused gene => split genes mappings.
    """
    bed_file = get_bed_file(gff_file, stype, key)
    cmd = "intersectBed -a {0} -b {1} -wao".format(split_bed, bed_file)
    cmd += " | cut -f4,10"
    p = popen(cmd)
    splits = defaultdict(set)
    for row in p:
        a, b = row.split()
        splits[a].add(b)

    return splits
Example #21
File: reformat.py  Project: Hensonmw/jcvi
def consolidate(nbedfile, obedfile, cbedfile):
    from pybedtools import BedTool
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        if ";" in b.accn:
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print >> fp, b
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
Example #22
File: grid.py  Project: bennyyu/jcvi
    def start(self, path=sge):

        if self.is_defunct:
            return

        cwd = os.getcwd()
        if path:
            os.chdir(path)

        # Shell commands
        if "|" in self.cmd or "&&" in self.cmd or "||" in self.cmd:
            quote = "\"" if "'" in self.cmd else "'"
            self.cmd = "sh -c {1}{0}{1}".format(self.cmd, quote)

        # qsub command (the project code is specific to jcvi)
        qsub = "qsub -P {0} -cwd".format(PCODE)
        if self.queue != "default":
            qsub += " -l {0}".format(self.queue)
        if self.threaded:
            qsub += " -pe threaded {0}".format(self.threaded)
        if self.infile:
            qsub += " -i {0}".format(self.infile)
        if self.outfile:
            qsub += " -o {0}".format(self.outfile)
        if self.errfile:
            qsub += " -e {0}".format(self.errfile)

        cmd = " ".join((qsub, self.cmd))
        # run the command and get the job-ID (important)
        output = popen(cmd, debug=False).read()

        if output.strip() != "":
            self.jobid = re.search(self.pat, output).group("id")
        else:
            self.jobid = "-1"

        msg = "[{0}] {1}".format(self.jobid, self.cmd)
        if self.infile:
            msg += " < {0} ".format(self.infile)
        if self.outfile:
            backup(self.outfile)
            msg += " > {0} ".format(self.outfile)
        if self.errfile:
            backup(self.errfile)
            msg += " 2> {0} ".format(self.errfile)

        logging.debug(msg)

        os.chdir(cwd)
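The quote selection above (use double quotes when the command already contains a single quote) still breaks if a command contains both quote characters; shlex.quote builds the `sh -c` wrapper safely. A sketch, not what the jcvi code does:

import shlex

def wrap_shell(cmd):
    # Safely produce: sh -c '<cmd>' for arbitrary shell text.
    return "sh -c {}".format(shlex.quote(cmd))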
Example #23
File: grid.py  Project: linlifeng/jcvi
    def start(self, path=sge):

        if self.is_defunct:
            return

        cwd = os.getcwd()
        if path:
            os.chdir(path)

        # Shell commands
        if "|" in self.cmd or "&&" in self.cmd or "||" in self.cmd:
            quote = "\"" if "'" in self.cmd else "'"
            self.cmd = "sh -c {1}{0}{1}".format(self.cmd, quote)

        # qsub command (the project code is specific to jcvi)
        qsub = "qsub -P {0} -cwd".format(PCODE)
        if self.queue != "default":
            qsub += " -l {0}".format(self.queue)
        if self.threaded:
            qsub += " -pe threaded {0}".format(self.threaded)
        if self.infile:
            qsub += " -i {0}".format(self.infile)
        if self.outfile:
            qsub += " -o {0}".format(self.outfile)
        if self.errfile:
            qsub += " -e {0}".format(self.errfile)

        cmd = " ".join((qsub, self.cmd))
        # run the command and get the job-ID (important)
        output = popen(cmd, debug=False).read()

        if output.strip() != "":
            self.jobid = re.search(self.pat, output).group("id")
        else:
            self.jobid = "-1"

        msg = "[{0}] {1}".format(self.jobid, self.cmd)
        if self.infile:
            msg += " < {0} ".format(self.infile)
        if self.outfile:
            backup(self.outfile)
            msg += " > {0} ".format(self.outfile)
        if self.errfile:
            backup(self.errfile)
            msg += " 2> {0} ".format(self.errfile)

        logging.debug(msg)

        os.chdir(cwd)
Example #24
File: aws.py  Project: xuanblo/jcvi
def ls_s3(s3_store_obj_name, recursive=False):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0}/".format(s3_store_obj_name)
    contents = []
    for row in popen(cmd):
        f = row.split()[-1]
        f = op.join(s3_store_obj_name, f)
        contents.append(f)

    if recursive:
        que = [x for x in contents if x.endswith("/")]
        while que:
            f = que.pop(0).rstrip("/")
            contents += ls_s3(f, recursive=True)

    return contents
Example #25
def ls_s3(s3_store_obj_name, recursive=False):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0}/".format(s3_store_obj_name)
    contents = []
    for row in popen(cmd):
        f = row.split()[-1]
        f = op.join(s3_store_obj_name, f)
        contents.append(f)

    if recursive:
        que = [x for x in contents if x.endswith("/")]
        while que:
            f = que.pop(0).rstrip("/")
            contents += ls_s3(f, recursive=True)

    return contents
Example #26
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist",
                 default=0,
                 type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"
    fw = open(bedfile, "w")
    fp = open(idsfile)
    total = 0
    for row in fp:
        id = row.strip()
        hit = d[id]
        tag, pos = get_tag(hit, None)
        seqid, start, end = pos
        start, end = max(start - bdist, 1), end + bdist
        print >> fw, "\t".join(str(x) for x in (seqid, start - 1, end, id))
        total += 1
    fw.close()

    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = popen(cmd).read()
    not_in_gaps = int(not_in_gaps)
    in_gaps = total - not_in_gaps
    print >> sys.stderr, "Ids in gaps: {1}".\
            format(total, percentage(in_gaps, total))
Example #27
File: ca.py  Project: zjwang6/jcvi
def removecontains(args):
    """
    %prog removecontains 4-unitigger/best.contains asm.gkpStore

    Remove contained reads from gkpStore. This will improve assembly contiguity
    without sacrificing accuracy, when using bogart unitigger.
    """
    p = OptionParser(removecontains.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    contains, gkpStore = args

    s = set()
    fp = open(contains)
    for row in fp:
        if row[0] == "#":
            continue
        iid = int(row.split()[0])
        s.add(iid)

    cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore)
    gkpmsg = popen(cmd).read()
    last_iid = int(gkpmsg.strip().split()[-1])

    ndeleted = 0
    editfile = "delete.edit"
    fw = open(editfile, "w")
    for iid in range(1, last_iid + 1):
        if iid in s:
            print("frg iid {0} isdeleted 1".format(iid), file=fw)
            ndeleted += 1

    fw.close()
    assert len(s) == ndeleted
    logging.debug(
        "A total of {0} contained reads flagged as deleted.".format(ndeleted))
    print("Now you can run:", file=sys.stderr)
    print("$ gatekeeper --edit {0} {1}".format(editfile, gkpStore),
          file=sys.stderr)
Example #28
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist", default=0, type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"
    fw = open(bedfile, "w")
    fp = open(idsfile)
    total = 0
    for row in fp:
        id = row.strip()
        hit = d[id]
        tag, pos = get_tag(hit, None)
        seqid, start, end = pos
        start, end = max(start - bdist, 1), end + bdist
        print >> fw, "\t".join(str(x) for x in (seqid, start - 1, end, id))
        total += 1
    fw.close()

    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = popen(cmd).read()
    not_in_gaps = int(not_in_gaps)
    in_gaps = total - not_in_gaps
    print >> sys.stderr, "Ids in gaps: {1}".\
            format(total, percentage(in_gaps, total))
Example #29
File: ca.py  Project: Hensonmw/jcvi
def removecontains(args):
    """
    %prog removecontains 4-unitigger/best.contains asm.gkpStore

    Remove contained reads from gkpStore. This will improve assembly contiguity
    without sacrificing accuracy, when using bogart unitigger.
    """
    p = OptionParser(removecontains.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    contains, gkpStore = args

    s = set()
    fp = open(contains)
    for row in fp:
        if row[0] == '#':
            continue
        iid = int(row.split()[0])
        s.add(iid)

    cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore)
    gkpmsg = popen(cmd).read()
    last_iid = int(gkpmsg.strip().split()[-1])

    ndeleted = 0
    editfile = "delete.edit"
    fw = open(editfile, "w")
    for iid in xrange(1, last_iid + 1):
        if iid in s:
            print >> fw, "frg iid {0} isdeleted 1".format(iid)
            ndeleted += 1

    fw.close()
    assert len(s) == ndeleted
    logging.debug("A total of {0} contained reads flagged as deleted."\
                  .format(ndeleted))
    print >> sys.stderr, "Now you can run:"
    print >> sys.stderr, "$ gatekeeper --edit {0} {1}".format(editfile, gkpStore)
Example #30
File: grid.py  Project: radaniba/jcvi
    def start(self):
        cmd = self.build()
        # run the command and get the job-ID (important)
        output = popen(cmd, debug=False).read()

        if output.strip() != "":
            self.jobid = re.search(self.pat, output).group("id")
        else:
            self.jobid = "-1"

        msg = "[{0}] {1}".format(self.jobid, self.cmd)
        if self.infile:
            msg += " < {0} ".format(self.infile)
        if self.outfile:
            backup(self.outfile)
            msg += " > {0} ".format(self.outfile)
        if self.errfile:
            backup(self.errfile)
            msg += " 2> {0} ".format(self.errfile)

        logging.debug(msg)
Example #31
    def start(self):
        cmd = self.build()
        # run the command and get the job-ID (important)
        output = popen(cmd, debug=False).read()

        if output.strip() != "":
            self.jobid = re.search(self.pat, output).group("id")
        else:
            self.jobid = "-1"

        msg = "[{0}] {1}".format(self.jobid, self.cmd)
        if self.infile:
            msg += " < {0} ".format(self.infile)
        if self.outfile:
            backup(self.outfile)
            msg += " > {0} ".format(self.outfile)
        if self.errfile:
            backup(self.errfile)
            msg += " 2> {0} ".format(self.errfile)

        logging.debug(msg)
Example #32
File: bed.py  Project: radaniba/jcvi
def intersectBed_wao(abedfile, bbedfile, minOverlap=0):
    abed = Bed(abedfile)
    bbed = Bed(bbedfile)
    print >> sys.stderr, "`{0}` has {1} features.".format(abedfile, len(abed))
    print >> sys.stderr, "`{0}` has {1} features.".format(bbedfile, len(bbed))

    cmd = "intersectBed -wao -a {0} -b {1}".format(abedfile, bbedfile)
    acols = abed[0].nargs
    bcols = bbed[0].nargs
    fp = popen(cmd)
    for row in fp:
        atoms = row.split()
        aline = "\t".join(atoms[:acols])
        bline = "\t".join(atoms[acols:acols + bcols])
        c = int(atoms[-1])
        if c < minOverlap:
            continue
        a = BedLine(aline)
        try:
            b = BedLine(bline)
        except AssertionError:
            b = None

        yield a, b
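A hedged usage sketch of the generator above (the BED file names are made up); b comes back as None when an A feature has no overlapping B feature at all:

# Hypothetical inputs; requires bedtools' intersectBed on the PATH.
for a, b in intersectBed_wao("genes.bed", "repeats.bed"):
    if b is None:
        continue                  # A feature with no B partner
    print("{}\t{}".format(a.accn, b.accn))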
Example #33
File: bed.py  Project: linlifeng/jcvi
def intersectBed_wao(abedfile, bbedfile, minOverlap=0):
    abed = Bed(abedfile)
    bbed = Bed(bbedfile)
    print >> sys.stderr, "`{0}` has {1} features.".format(abedfile, len(abed))
    print >> sys.stderr, "`{0}` has {1} features.".format(bbedfile, len(bbed))

    cmd = "intersectBed -wao -a {0} -b {1}".format(abedfile, bbedfile)
    acols = abed[0].nargs
    bcols = bbed[0].nargs
    fp = popen(cmd)
    for row in fp:
        atoms = row.split()
        aline = "\t".join(atoms[:acols])
        bline = "\t".join(atoms[acols:acols + bcols])
        c = int(atoms[-1])
        if c < minOverlap:
            continue
        a = BedLine(aline)
        try:
            b = BedLine(bline)
        except AssertionError:
            b = None

        yield a, b
Example #34
def must_open(filename,
              mode="r",
              checkexists=False,
              skipcheck=False,
              oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.
    """
    if isinstance(filename, list):
        assert "r" in mode

        if filename[0].endswith((".gz", ".bz2")):
            filename = " ".join(
                filename)  # allow opening multiple gz/bz2 files
        else:
            import fileinput

            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3

        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        import gzip

        if "r" in mode:
            fp = gzip.open(filename, mode + "t")
        elif "w" in mode:
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif "w" in mode:
            import bz2

            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = ((not op.exists(filename))
                         if skipcheck else check_exists(filename, oappend))
            if overwrite:
                if oappend:
                    fp = open(filename, "a")
                else:
                    fp = open(filename, "w")
            else:
                logging.debug(
                    "File `{0}` already exists. Skipped.".format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
Example #35
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, afasta + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            fetch([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, bfasta + ".fasta")
        if not op.exists(bf):
            fetch([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn")
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= GoodOverlap]
    dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    besthsp = hsps[0]

    aid, asize = Fasta(afasta).itersizes().next()
    bid, bsize = Fasta(bfasta).itersizes().next()
    o = Overlap(besthsp, asize, bsize)
    o.print_graphic(qreverse=opts.qreverse)
    print >> sys.stderr, str(o)

    return o
Example #36
def check_exists_s3(s3_store_obj_name):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0} | wc -l".format(s3_store_obj_name)
    counts = int(popen(cmd).read())
    return counts != 0
Example #37
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
            help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
            help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
            help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, afasta + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            fetch([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, bfasta + ".fasta")
        if not op.exists(bf):
            fetch([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn")
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= GoodOverlap]
    dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    besthsp = hsps[0]

    aid, asize = Fasta(afasta).itersizes().next()
    bid, bsize = Fasta(bfasta).itersizes().next()
    o = Overlap(besthsp, asize, bsize)
    o.print_graphic(qreverse=opts.qreverse)
    print >> sys.stderr, str(o)

    return o
Example #38
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
            help="Download sequences to dir [default: %default]")
    p.add_option("--suffix", default="fasta",
            help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
            help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
            help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, ".".join((afasta, suffix)))
        if not op.exists(af):  # Check to avoid redownload
            entrez([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, ".".join((bfasta, suffix)))
        if not op.exists(bf):
            entrez([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    besthsp = hsps[0]

    aid, asize = Fasta(afasta).itersizes().next()
    bid, bsize = Fasta(bfasta).itersizes().next()
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print >> fw, str(o)
        fw.close()

    return o
Example #39
File: ca.py  Project: arvin580/jcvi
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    from jcvi.apps.console import green

    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        fp = open(bestcontains)
        fw = open(bestcontainscache, "w")
        exclude = set()
        for row in fp:
            if row[0] == "#":
                continue
            j = int(row.split()[0])
            exclude.add(j)
        cPickle.dump(exclude, fw)
        fw.close()

    exclude = cPickle.load(open(bestcontainscache))
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)
    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include the query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    row = fp.next()
    size = int(fp.next().split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    ratio = xsize / canvas

    fw = sys.stdout
    for f in frags:
        fsize = -f.ahang + size + f.bhang
        a = (f.ahang - xmin) / ratio
        b = fsize / ratio
        t = "-" * b
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        if f.ahang == 0 and f.bhang == 0:
            t = green(t)
        c = canvas - a - b
        fw.write(" " * a)
        fw.write(t)
        fw.write(" " * c)
        print >> fw, "{0} ({1})".format(str(f.bid).rjust(10), f.erate_adj)
Example #40
def get_grid_engine():
    cmd = "qsub --version"
    ret = popen(cmd, debug=False).read()
    return "PBS" if "PBS" in ret else "SGE"
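The same probe can be written with the standard library alone; a sketch using subprocess (illustrative, not how jcvi does it):

import subprocess

def get_grid_engine_stdlib():
    # Run `qsub --version` (qsub must be on the PATH) and guess the
    # scheduler from its output.
    out = subprocess.run(["qsub", "--version"],
                         capture_output=True, text=True).stdout
    return "PBS" if "PBS" in out else "SGE"

Depending on the scheduler, the version banner may land on stderr instead, so a more defensive sketch could check both streams.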
Example #41
    def allocate(self, info, chr, start_id, end_id, id_table):

        start_bp = info[0].start
        end_bp = info[-1].end

        current_chr = chr_number(chr)
        needed = info
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        spots = end_id - start_id - 1
        available = [
            x for x in xrange(start_id + 1, end_id)
            if (current_chr, x) not in self.black
        ]

        message = "{0} need {1} ids, has {2} spots ({3} available)".\
                format(chr, len(needed), spots, len(available))

        start_gene = gene_name(current_chr, start_id, prefix=self.prefix, \
                pad0=self.pad0, uc=self.uc)
        end_gene = gene_name(current_chr,
                             end_id,
                             prefix=self.prefix,
                             pad0=self.pad0,
                             uc=self.uc)
        message += " between {0} - {1}\n".format(start_gene, end_gene)

        assert end_bp > start_bp

        b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
        cmd = "echo '{0}' |".format(b)
        cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = list(BedLine(x) for x in popen(cmd, debug=False))
        ngaps = len(gaps)

        gapsexpanded = []
        GeneDensity = 10000.  # assume 10Kb per gene
        for gap in gaps:
            gap_bp = int(gap.score)
            gap_ids = int(round(gap_bp / GeneDensity))
            gapsexpanded += [gap] * gap_ids

        lines = sorted(info + gapsexpanded, key=lambda x: x.start)

        message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
                format(start_bp, end_bp, ngaps, len(lines))

        needed = lines
        stride = Stride(needed, available)
        conf = stride.conf
        message += " stride: {0}".format(conf)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is None:  # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            if rank > magic + firstdigit:
                rank -= firstdigit
            available = []
            while len(available) != nneeded:
                rank += step
                if (current_chr, rank) in self.black:  # avoid blacklisted ids
                    continue
                available.append(rank)

        else:  # follow the best stride
            available = stride.available
            if start_id == 0:  # follow right flank at start of chr
                available = available[-nneeded:]
            else:  # follow left flank otherwise
                available = available[:nneeded]

        # Finally assign the ids
        assert len(needed) == len(available)
        for b, rank in zip(needed, available):
            name = gene_name(current_chr, rank, prefix=self.prefix, \
                    pad0=self.pad0, uc=self.uc)
            print >> sys.stderr, "\t".join((str(b), name))
            id_table[b.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr
Example #42
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix",
                 default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, ".".join((afasta, suffix)))
        if not op.exists(af):  # Check to avoid redownload
            entrez([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, ".".join((bfasta, suffix)))
        if not op.exists(bf):
            entrez([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    besthsp = hsps[0]

    aid, asize = Fasta(afasta).itersizes().next()
    bid, bsize = Fasta(bfasta).itersizes().next()
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print >> fw, str(o)
        fw.close()

    return o
Example #43
File: aws.py  Project: xuanblo/jcvi
def check_exists_s3(s3_store_obj_name):
    s3_store_obj_name = s3ify(s3_store_obj_name)
    cmd = "aws s3 ls {0} | wc -l".format(s3_store_obj_name)
    counts = int(popen(cmd).read())
    return counts != 0
Example #44
File: ca.py  Project: zjwang6/jcvi
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        fp = open(bestcontains)
        fw = open(bestcontainscache, "w")
        exclude = set()
        for row in fp:
            if row[0] == "#":
                continue
            j = int(row.split()[0])
            exclude.add(j)
        dump(exclude, fw)
        fw.close()

    exclude = load(open(bestcontainscache))
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)
    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include the query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    row = next(fp)
    size = int(next(fp).split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    ratio = xsize / canvas

    for f in frags:
        fsize = -f.ahang + size + f.bhang
        a = int((f.ahang - xmin) / ratio)
        b = int(fsize / ratio)
        t = "-" * b
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        if f.ahang == 0 and f.bhang == 0:
            t = "[green]{}".format(t)
        c = canvas - a - b
        printf(
            "{}{}{}{} ({})".format(" " * a, t, " " * c,
                                   str(f.bid).rjust(10), f.erate_adj), )
Example #45
File: grid.py  Project: radaniba/jcvi
def get_grid_engine():
    cmd = "qsub --version"
    ret = popen(cmd, debug=False).read()
    return "PBS" if "PBS" in ret else "SGE"
Example #46
File: reformat.py  Project: Hensonmw/jcvi
    def allocate(self, info, chr, start_id, end_id, id_table, extended_stride=False):

        start_bp = info[0].start
        end_bp = info[-1].end

        current_chr = chr_number(chr)
        needed = info
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        spots = end_id - start_id - 1
        available = [x for x in xrange(start_id + 1, end_id) if
                            (current_chr, x) not in self.black]

        message = "{0} need {1} ids, has {2} spots ({3} available)".\
                format(chr, len(needed), spots, len(available))

        start_gene = gene_name(current_chr, start_id, prefix=self.prefix, \
                pad0=self.pad0, uc=self.uc)
        end_gene = gene_name(current_chr, end_id, prefix=self.prefix,
                pad0=self.pad0, uc=self.uc)
        message += " between {0} - {1}\n".format(start_gene, end_gene)

        assert end_bp > start_bp

        b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
        cmd = "echo '{0}' |".format(b)
        cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = list(BedLine(x) for x in popen(cmd, debug=False))
        ngaps = len(gaps)

        gapsexpanded = []
        GeneDensity = 10000.  # assume 10Kb per gene
        for gap in gaps:
            gap_bp = int(gap.score)
            gap_ids = int(round(gap_bp / GeneDensity))
            gapsexpanded += [gap] * gap_ids

        lines = sorted(info + gapsexpanded, key=lambda x: x.start)

        message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
                format(start_bp, end_bp, ngaps, len(lines))

        needed = lines
        stride = Stride(needed, available, extended=extended_stride)
        conf = stride.conf
        message += " stride: {0}".format(conf)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is None: # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            if rank > magic + firstdigit:
                rank -= firstdigit
            available = []
            while len(available) != nneeded:
                rank += step
                if (current_chr, rank) in self.black:  # avoid blacklisted ids
                    continue
                available.append(rank)

        else: # follow the best stride
            available = stride.available
            if start_id == 0:  # follow right flank at start of chr
                available = available[- nneeded:]
            else:  # follow left flank otherwise
                available = available[:nneeded]

        # Finally assign the ids
        assert len(needed) == len(available)
        for b, rank in zip(needed, available):
            name = gene_name(current_chr, rank, prefix=self.prefix, \
                    pad0=self.pad0, uc=self.uc)
            print >> sys.stderr, "\t".join((str(b), name))
            id_table[b.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr