Пример #1
0
def must_open(filename, mode="r", checkexists=False, skipcheck=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    filename: a path, a list of paths (read mode only, served through
        `fileinput`), or one of the special names "-"/"stdin", "stdout",
        "stderr" and "tmp" (with mode "w": a NamedTemporaryFile that is kept
        on disk after close).
    mode: file mode; the special names assert the matching read/write mode.
    checkexists: only valid with mode "w". The file is opened for writing
        only when the overwrite check passes; otherwise a debug message is
        logged and None is returned.
    skipcheck: with `checkexists`, do not call check_exists(); just skip
        any file that already exists.

    Raises ValueError for a .gz/.bz2 filename when mode is neither a read
    nor a write mode (this used to surface as an UnboundLocalError).
    """
    if isinstance(filename, list):
        assert "r" in mode

        import fileinput
        return fileinput.input(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile
        # delete=False: the caller is responsible for removing fp.name
        fp = NamedTemporaryFile(delete=False)

    elif filename.endswith(".gz"):
        if 'r' in mode:
            # Reading is delegated to the external `zcat` program
            cmd = "zcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif 'w' in mode:
            import gzip
            fp = gzip.open(filename, mode)
        else:
            raise ValueError("Unsupported mode `{0}` for `{1}`"
                             .format(mode, filename))

    elif filename.endswith(".bz2"):
        if 'r' in mode:
            # Reading is delegated to the external `bzcat` program
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif 'w' in mode:
            import bz2
            fp = bz2.BZ2File(filename, mode)
        else:
            raise ValueError("Unsupported mode `{0}` for `{1}`"
                             .format(mode, filename))

    else:
        if checkexists:
            assert mode == "w"
            overwrite = (not op.exists(filename)) if skipcheck \
                        else check_exists(filename)
            if overwrite:
                fp = open(filename, "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."\
                        .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
Пример #2
0
def ls_s3(s3_store_obj_name):
    """
    List an S3 location with `aws s3 ls` and return the last
    whitespace-separated field of every output row.
    """
    location = s3ify(s3_store_obj_name)
    listing = popen("aws s3 ls {0}/".format(location))
    return [line.split()[-1] for line in listing]
Пример #3
0
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    parser = OptionParser(append.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bamfile, = args
    reader_cmd = "samtools view -h {0}".format(bamfile)
    # Output goes next to the input: <stem>.append.bam
    outbam = bamfile.rsplit(".", 1)[0] + ".append.bam"
    writer_cmd = "samtools view -b -@ 64 - -o {0}".format(outbam)
    writer = Popen(writer_cmd, stdin=PIPE)
    sink = writer.stdin
    # NOTE(review): the writer's stdin is never closed nor the process waited
    # on here -- confirm that the output BAM is complete when this returns.
    for row in popen(reader_cmd):
        if row[0] != '@':
            record = SamLine(row)
            record.update_readname()
            text = str(record)
        else:
            # Header lines are passed through unchanged
            text = row.strip()
        sink.write(text + "\n")
Пример #4
0
Файл: ks.py Проект: bennyyu/jcvi
def get_mixture(data, components):
    """
    Run the external `gmm-bic` program on the log-transformed `data` and
    return three lists (probs, mus, sigmas) parsed from its `#` rows.

    Example values:
    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    fw = must_open("tmp", "w")
    # Only values above .05 are log-transformed and written out
    log_data = [log(x) for x in data if x > .05]
    payload = "\n".join("%.4f" % x for x in log_data)
    fw.write(payload.replace("inf\n", ""))
    fw.close()

    cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)

    probs, mus, sigmas = [], [], []
    for row in popen(cmd):
        # Result rows are the ones starting with '#'
        if row[0] == '#':
            mu, sigma, prob = [float(x) for x in row.split(",")[1:4]]
            mus.append(mu)
            sigmas.append(sigma)
            probs.append(prob)

    os.remove(fw.name)
    return probs, mus, sigmas
Пример #5
0
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    the get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    from jcvi.formats.base import DictFile

    parser = OptionParser(top10.__doc__)
    parser.add_option("--top", default=10, type="int",
                help="Top N taxa to extract [default: %default]")
    parser.add_option("--ids", default=None,
                help="Two column ids file to query seqid [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    blastfile, = args
    if opts.ids:
        mapping = DictFile(opts.ids, delimiter="\t")
    else:
        mapping = {}

    # Second column -> frequency table -> most frequent first -> top N
    stages = (
        "cut -f2 {0}".format(blastfile),
        "sort",
        "uniq -c",
        "sort -k1,1nr",
        "head -n {0}".format(opts.top),
    )
    for line in popen(" | ".join(stages)):
        count, seqid = line.split()
        # Fall back to the raw seqid when it has no entry in the ids file
        label = mapping.get(seqid, seqid)
        sys.stdout.write("\t".join((count, label)) + "\n")
Пример #6
0
def locate(args):
    """
    Run FIMO for the motifs returned by read_motif() against `args.seq` and
    write the concatenated per-motif BED windows to `args.fo`.

    For a motif without a score, the threshold is the top score found in the
    FIMO hits multiplied by `args.score_thresh`. Intermediate `tmp.lc*` files
    are removed unless `args.debug` is set.
    """
    fi, fo, seq = args.fi, args.fo, args.seq
    motifs = read_motif(fi, args.motif)
    #
    motif_opts = " ".join(f'--motif {mid}' for mid, _wd, _score in motifs)
    pre = f"tmp.lc{random.randrange(1000)}"
    #
    sh(f'fimo --bfile --motif-- {motif_opts} --thresh 1e-4 --skip-matched-sequence --text {fi} {seq} > {pre}_0.txt')
    for mid, wd, score in motifs:
        # Hits of the current motif only
        sh(f'grep -P "^{mid}\t" {pre}_0.txt > {pre}_0a.txt')
        #
        if score:
            score_thresh = score
        else:
            top_hits = popen(
                f"cut -f7 {pre}_0a.txt | sed '1d' | sort -k1,1nr | head")
            best = float(top_hits.readline().decode("utf-8").strip())
            score_thresh = best * args.score_thresh
        #
        sh("bioawk -tH '{if($7>%f) {print $1\"%%\"$3, $4-1, $5}}' %s_0a.txt > %s_1.bed"
           % (score_thresh, pre, pre))
        min_size = round(wd * args.motif_frac)
        if os.stat(f"{pre}_1.bed").st_size != 0:
            sh(f'sortBed -i {pre}_1.bed | mergeBed > {pre}_2.bed')
            sh(f'bedtools makewindows -w {wd} -b {pre}_2.bed > {pre}_3.bed')
            sh(f'bed.py filter --minsize {min_size} {pre}_3.bed > {pre}_4_{mid}.bed')
        else:
            # No hit passed the threshold: still create an empty per-motif file
            sh(f'touch {pre}_4_{mid}.bed')
    sh(f'cat {pre}_4_*.bed > {fo}')
    if not args.debug:
        sh(f'rm -rf {pre}_*')
Пример #7
0
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    the get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    from jcvi.formats.base import DictFile

    parser = OptionParser(top10.__doc__)
    parser.add_option("--ids", default=None,
                help="Two column ids file to query seqid [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    blastfile, = args
    if opts.ids:
        mapping = DictFile(opts.ids, delimiter="\t")
    else:
        mapping = {}

    # Second column -> frequency table -> most frequent first -> first rows
    stages = (
        "cut -f2 {0}".format(blastfile),
        "sort",
        "uniq -c",
        "sort -k1,1nr",
        "head",
    )
    for line in popen(" | ".join(stages)):
        count, seqid = line.split()
        # Fall back to the raw seqid when it has no entry in the ids file
        label = mapping.get(seqid, seqid)
        sys.stdout.write("\t".join((count, label)) + "\n")
Пример #8
0
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    parser = OptionParser(append.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bamfile, = args
    reader_cmd = "samtools view -h {0}".format(bamfile)
    # Output goes next to the input: <stem>.append.bam
    outbam = bamfile.rsplit(".", 1)[0] + ".append.bam"
    writer_cmd = "samtools view -b -@ 64 - -o {0}".format(outbam)
    writer = Popen(writer_cmd, stdin=PIPE)
    sink = writer.stdin
    # NOTE(review): the writer's stdin is never closed nor the process waited
    # on here -- confirm that the output BAM is complete when this returns.
    for row in popen(reader_cmd):
        if row[0] != '@':
            record = SamLine(row)
            record.update_readname()
            text = str(record)
        else:
            # Header lines are passed through unchanged
            text = row.strip()
        sink.write(text + "\n")
Пример #9
0
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    parser = OptionParser(append.__doc__)
    parser.add_option("--prepend", help="Prepend string to read names")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (bamfile,) = args
    prefix = opts.prepend

    reader_cmd = "samtools view -h {0}".format(bamfile)
    # Output goes next to the input: <stem>.append.bam
    outbam = bamfile.rsplit(".", 1)[0] + ".append.bam"
    writer_cmd = "samtools view -b -@ 64 - -o {0}".format(outbam)
    writer = Popen(writer_cmd, stdin=PIPE)
    # NOTE(review): the writer's stdin is never closed nor the process waited
    # on here -- confirm that the output BAM is complete when this returns.
    for row in popen(reader_cmd):
        if row[0] == "@":
            # Header lines are passed through unchanged
            print(row.strip(), file=writer.stdin)
            continue

        record = SamLine(row)
        if not prefix:
            record.update_readname()
        else:
            record.qname = prefix + "_" + record.qname
        print(record, file=writer.stdin)
Пример #10
0
def get_mixture(data, components):
    """
    Run the external `gmm-bic` program on the log-transformed `data` and
    return three lists (probs, mus, sigmas) parsed from its `#` rows.

    Example values:
    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    fw = must_open("tmp", "w")
    # Only values above .05 are log-transformed and written out
    log_data = [log(x) for x in data if x > .05]
    payload = "\n".join("%.4f" % x for x in log_data)
    fw.write(payload.replace("inf\n", ""))
    fw.close()

    cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)

    probs, mus, sigmas = [], [], []
    for row in popen(cmd):
        # Result rows are the ones starting with '#'
        if row[0] == '#':
            mu, sigma, prob = [float(x) for x in row.split(",")[1:4]]
            mus.append(mu)
            sigmas.append(sigma)
            probs.append(prob)

    os.remove(fw.name)
    return probs, mus, sigmas
Пример #11
0
def ls_s3(s3_store_obj_name):
    """
    List an S3 location with `aws s3 ls` and return the last
    whitespace-separated field of every output row.
    """
    location = s3ify(s3_store_obj_name)
    listing = popen("aws s3 ls {0}/".format(location))
    return [line.split()[-1] for line in listing]
Пример #12
0
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed batch
    will be written to a directory for additional work.
    """
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    # Tally scaffold statuses and collect (batch, scaffold) pairs that failed
    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    all_failed = []
    for f in fsnames:
        dslog = DatastoreIndexFile(dsfile.format(f, suffix))
        counter.update(dslog.scaffold_status.values())
        all_failed.extend((f, x) for x in dslog.failed)

    # The number of "now finished" lines in maker.*.out must match the number
    # of batches
    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    # Context managers guarantee both handles are closed, including the one
    # used for re-counting the lines (previously left open)
    with open(failed, "w") as fw:
        print("\n".join("\t".join((f, x)) for f, x in all_failed), file=fw)

    with open(failed) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".\
                    format(genome, failed_ids, failed_fasta)
        sh(cmd)
Пример #13
0
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed batch
    will be written to a directory for additional work.
    """
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    # Tally scaffold statuses and collect (batch, scaffold) pairs that failed
    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    all_failed = []
    for f in fsnames:
        dslog = DatastoreIndexFile(dsfile.format(f, suffix))
        counter.update(dslog.scaffold_status.values())
        all_failed.extend((f, x) for x in dslog.failed)

    # The number of "now finished" lines in maker.*.out must match the number
    # of batches
    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    # Context managers guarantee both handles are closed, including the one
    # used for re-counting the lines (previously left open)
    with open(failed, "w") as fw:
        print("\n".join("\t".join((f, x)) for f, x in all_failed), file=fw)

    with open(failed) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".\
                    format(genome, failed_ids, failed_fasta)
        sh(cmd)
Пример #14
0
def ls_s3(s3_store_obj_name, recursive=False):
    """
    List an S3 location with `aws s3 ls` (adding `--recursive` on request)
    and return the last whitespace-separated field of every output row.
    """
    location = s3ify(s3_store_obj_name)
    flag = " --recursive" if recursive else ""
    listing = popen("aws s3 ls {0}/".format(location) + flag)
    return [line.split()[-1] for line in listing]
Пример #15
0
def ls_s3(s3_store_obj_name, recursive=False):
    """
    List an S3 location with `aws s3 ls` (adding `--recursive` on request)
    and return the last whitespace-separated field of every output row.
    """
    location = s3ify(s3_store_obj_name)
    flag = " --recursive" if recursive else ""
    listing = popen("aws s3 ls {0}/".format(location) + flag)
    return [line.split()[-1] for line in listing]
Пример #16
0
def compare_worker(arg):
    """
    Compare one call set against a truth set with intersectBed.

    arg: a (cnvoutput, truths) pair of file paths.

    Returns a tab-joined string with: cnvoutput, truths, the number of
    intersecting lines, the line counts of both files, then precision and
    recall as percentages.

    Raises ZeroDivisionError when either input file has no lines.
    """
    cnvoutput, truths = arg
    cmd = "intersectBed -f .5 -F .5"
    cmd += " -a {} -b {} | wc -l".format(cnvoutput, truths)
    nlines = int(popen(cmd, debug=False).read())

    # Count lines by streaming, and close each handle as soon as it is counted
    with open(cnvoutput) as fp:
        target_lines = sum(1 for _ in fp)
    with open(truths) as fp:
        truths_lines = sum(1 for _ in fp)

    precision = nlines * 100. / target_lines
    recall = nlines * 100. / truths_lines
    d = "\t".join(str(x) for x in (cnvoutput, truths,
                                   nlines, target_lines, truths_lines,
                                   precision, recall))
    return d
Пример #17
0
def compare_worker(arg):
    """
    Compare one call set against a truth set with intersectBed.

    arg: a (cnvoutput, truths) pair of file paths.

    Returns a tab-joined string with: cnvoutput, truths, the number of
    intersecting lines, the line counts of both files, then precision and
    recall as percentages.

    Raises ZeroDivisionError when either input file has no lines.
    """
    cnvoutput, truths = arg
    cmd = "intersectBed -f .5 -F .5"
    cmd += " -a {} -b {} | wc -l".format(cnvoutput, truths)
    nlines = int(popen(cmd, debug=False).read())

    # Count lines by streaming, and close each handle as soon as it is counted
    with open(cnvoutput) as fp:
        target_lines = sum(1 for _ in fp)
    with open(truths) as fp:
        truths_lines = sum(1 for _ in fp)

    precision = nlines * 100. / target_lines
    recall = nlines * 100. / truths_lines
    d = "\t".join(str(x) for x in (cnvoutput, truths,
                                   nlines, target_lines, truths_lines,
                                   precision, recall))
    return d
Пример #18
0
def consolidate(nbedfile, obedfile, cbedfile):
    """
    Merge the same-strand overlapping features of `nbedfile` and `obedfile`,
    add the `nbedfile` features that overlap nothing, and write the result
    to `cbedfile`, which is then sorted in place.
    """
    from pybedtools import BedTool

    new_bt = BedTool(nbedfile)
    old_bt = BedTool(obedfile)

    # Features of either file that overlap the other one on the same strand
    new_hits = new_bt.intersect(old_bt, s=True, u=True)
    old_hits = old_bt.intersect(new_bt, s=True, u=True)

    pooled = popen(
        "cat {0} {1} | sort -k1,1 -k2,2n".format(new_hits.fn, old_hits.fn))
    ovl = BedTool(pooled.readlines())

    merged = ovl.merge(s=True, nms=True, scores="mean").sort()
    reread = popen("cat {0}".format(merged.fn), debug=False)
    ovl = BedTool(reread.readlines())

    notovl = new_bt.intersect(ovl.sort(), s=True, v=True)

    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    sh("sort -k1,1 -k2,2n",
       infile="{0} {1}".format(notovl.fn, ovl.fn),
       outfile=tmpfile)

    fp = open(cbedfile, "w")
    for b in Bed(tmpfile):
        if ";" in b.accn:
            # Collapse repeated names within a merged feature
            b.accn = ";".join(set(b.accn.split(";")))
        print(b, file=fp)
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
Пример #19
0
def get_splits(split_bed, gff_file, stype, key):
    """
    Run intersectBed to map every fused gene to the set of split genes.
    """
    bed_file = get_bed_file(gff_file, stype, key)
    stages = (
        "intersectBed -a {0} -b {1} -wao".format(split_bed, bed_file),
        "cut -f4,10",
    )
    splits = defaultdict(set)
    for line in popen(" | ".join(stages)):
        fused, part = line.split()
        splits[fused].add(part)

    return splits
Пример #20
0
def get_splits(split_bed, gff_file, stype, key):
    """
    Run intersectBed to map every fused gene to the set of split genes.
    """
    bed_file = get_bed_file(gff_file, stype, key)
    stages = (
        "intersectBed -a {0} -b {1} -wao".format(split_bed, bed_file),
        "cut -f4,10",
    )
    splits = defaultdict(set)
    for line in popen(" | ".join(stages)):
        fused, part = line.split()
        splits[fused].add(part)

    return splits
Пример #21
0
def consolidate(nbedfile, obedfile, cbedfile):
    """
    Merge the same-strand overlapping features of `nbedfile` and `obedfile`,
    add the `nbedfile` features that overlap nothing, and write the result
    to `cbedfile`, which is then sorted in place.
    """
    from pybedtools import BedTool
    new_bt = BedTool(nbedfile)
    old_bt = BedTool(obedfile)

    # Features of either file that overlap the other one on the same strand
    new_hits = new_bt.intersect(old_bt, s=True, u=True)
    old_hits = old_bt.intersect(new_bt, s=True, u=True)

    pooled = popen(
        "cat {0} {1} | sort -k1,1 -k2,2n".format(new_hits.fn, old_hits.fn))
    ovl = BedTool(pooled.readlines())

    merged = ovl.merge(s=True, nms=True, scores="mean").sort()
    reread = popen("cat {0}".format(merged.fn), debug=False)
    ovl = BedTool(reread.readlines())

    notovl = new_bt.intersect(ovl.sort(), s=True, v=True)

    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    sh("sort -k1,1 -k2,2n",
       infile="{0} {1}".format(notovl.fn, ovl.fn),
       outfile=tmpfile)

    fp = open(cbedfile, "w")
    for b in Bed(tmpfile):
        if ";" in b.accn:
            # Collapse repeated names within a merged feature
            b.accn = ";".join(set(b.accn.split(";")))
        fp.write(str(b) + "\n")
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
Пример #22
0
    def start(self, path=sge):
        """
        Submit `self.cmd` through qsub and store the job id in `self.jobid`.

        path: directory to submit from; the previous working directory is
            restored before returning, also when submission fails.

        Does nothing when `self.is_defunct` is set. `self.jobid` becomes
        "-1" when qsub prints nothing. Existing out/err files are passed to
        backup() and the submitted command is logged at debug level.
        """
        if self.is_defunct:
            return

        cwd = os.getcwd()
        if path:
            os.chdir(path)

        try:
            # Shell commands
            if "|" in self.cmd or "&&" in self.cmd or "||" in self.cmd:
                quote = "\"" if "'" in self.cmd else "'"
                self.cmd = "sh -c {1}{0}{1}".format(self.cmd, quote)

            # qsub command (the project code is specific to jcvi)
            qsub = "qsub -P {0} -cwd".format(PCODE)
            if self.queue != "default":
                qsub += " -l {0}".format(self.queue)
            if self.threaded:
                qsub += " -pe threaded {0}".format(self.threaded)
            if self.infile:
                qsub += " -i {0}".format(self.infile)
            if self.outfile:
                qsub += " -o {0}".format(self.outfile)
            if self.errfile:
                qsub += " -e {0}".format(self.errfile)

            cmd = " ".join((qsub, self.cmd))
            # run the command and get the job-ID (important)
            output = popen(cmd, debug=False).read()

            if output.strip() != "":
                self.jobid = re.search(self.pat, output).group("id")
            else:
                self.jobid = "-1"

            msg = "[{0}] {1}".format(self.jobid, self.cmd)
            if self.infile:
                msg += " < {0} ".format(self.infile)
            if self.outfile:
                backup(self.outfile)
                msg += " > {0} ".format(self.outfile)
            if self.errfile:
                backup(self.errfile)
                msg += " 2> {0} ".format(self.errfile)

            logging.debug(msg)
        finally:
            # Never leave the process in `path` if anything above raised
            os.chdir(cwd)
Пример #23
0
    def start(self, path=sge):
        """
        Submit `self.cmd` through qsub and store the job id in `self.jobid`.

        path: directory to submit from; the previous working directory is
            restored before returning, also when submission fails.

        Does nothing when `self.is_defunct` is set. `self.jobid` becomes
        "-1" when qsub prints nothing. Existing out/err files are passed to
        backup() and the submitted command is logged at debug level.
        """
        if self.is_defunct:
            return

        cwd = os.getcwd()
        if path:
            os.chdir(path)

        try:
            # Shell commands
            if "|" in self.cmd or "&&" in self.cmd or "||" in self.cmd:
                quote = "\"" if "'" in self.cmd else "'"
                self.cmd = "sh -c {1}{0}{1}".format(self.cmd, quote)

            # qsub command (the project code is specific to jcvi)
            qsub = "qsub -P {0} -cwd".format(PCODE)
            if self.queue != "default":
                qsub += " -l {0}".format(self.queue)
            if self.threaded:
                qsub += " -pe threaded {0}".format(self.threaded)
            if self.infile:
                qsub += " -i {0}".format(self.infile)
            if self.outfile:
                qsub += " -o {0}".format(self.outfile)
            if self.errfile:
                qsub += " -e {0}".format(self.errfile)

            cmd = " ".join((qsub, self.cmd))
            # run the command and get the job-ID (important)
            output = popen(cmd, debug=False).read()

            if output.strip() != "":
                self.jobid = re.search(self.pat, output).group("id")
            else:
                self.jobid = "-1"

            msg = "[{0}] {1}".format(self.jobid, self.cmd)
            if self.infile:
                msg += " < {0} ".format(self.infile)
            if self.outfile:
                backup(self.outfile)
                msg += " > {0} ".format(self.outfile)
            if self.errfile:
                backup(self.errfile)
                msg += " 2> {0} ".format(self.errfile)

            logging.debug(msg)
        finally:
            # Never leave the process in `path` if anything above raised
            os.chdir(cwd)
Пример #24
0
def ls_s3(s3_store_obj_name, recursive=False):
    """
    List an S3 location with `aws s3 ls` and return every entry joined onto
    that location. With `recursive`, entries ending in "/" are listed in
    turn and their contents appended.
    """
    location = s3ify(s3_store_obj_name)
    listing = popen("aws s3 ls {0}/".format(location))
    contents = [op.join(location, line.split()[-1]) for line in listing]

    if not recursive:
        return contents

    # Snapshot of the folders found at this level; each is walked once
    folders = [entry for entry in contents if entry.endswith("/")]
    for folder in folders:
        contents += ls_s3(folder.rstrip("/"), recursive=True)

    return contents
Пример #25
0
def ls_s3(s3_store_obj_name, recursive=False):
    """
    List an S3 location with `aws s3 ls` and return every entry joined onto
    that location. With `recursive`, entries ending in "/" are listed in
    turn and their contents appended.
    """
    location = s3ify(s3_store_obj_name)
    listing = popen("aws s3 ls {0}/".format(location))
    contents = [op.join(location, line.split()[-1]) for line in listing]

    if not recursive:
        return contents

    # Snapshot of the folders found at this level; each is walked once
    folders = [entry for entry in contents if entry.endswith("/")]
    for folder in folders:
        contents += ls_s3(folder.rstrip("/"), recursive=True)

    return contents
Пример #26
0
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist",
                 default=0,
                 type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"
    total = 0
    # Both handles are closed on exit, including when a lookup below raises
    with open(bedfile, "w") as fw, open(idsfile) as fp:
        for row in fp:
            gene_id = row.strip()
            hit = d[gene_id]
            tag, pos = get_tag(hit, None)
            seqid, start, end = pos
            # Extend the expected location by bdist on both sides, start >= 1
            start, end = max(start - bdist, 1), end + bdist
            fields = (seqid, start - 1, end, gene_id)
            fw.write("\t".join(str(x) for x in fields) + "\n")
            total += 1

    # Count the locations that do not overlap any feature in gapsbed
    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = int(popen(cmd).read())
    in_gaps = total - not_in_gaps
    sys.stderr.write("Ids in gaps: {1}".\
            format(total, percentage(in_gaps, total)) + "\n")
Пример #27
0
Файл: ca.py Проект: zjwang6/jcvi
def removecontains(args):
    """
    %prog removecontains 4-unitigger/best.contains asm.gkpStore

    Remove contained reads from gkpStore. This will improve assembly contiguity
    without sacrificing accuracy, when using bogart unitigger.
    """
    p = OptionParser(removecontains.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    contains, gkpStore = args

    # First column of every non-comment row is the iid of a contained read
    s = set()
    with open(contains) as fp:
        for row in fp:
            if row[0] == "#":
                continue
            s.add(int(row.split()[0]))

    cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore)
    gkpmsg = popen(cmd).read()
    last_iid = int(gkpmsg.strip().split()[-1])

    # Walk the flagged iids in ascending order instead of scanning every iid
    # up to last_iid; the output is the same
    flagged = [iid for iid in sorted(s) if 1 <= iid <= last_iid]
    ndeleted = len(flagged)
    editfile = "delete.edit"
    with open(editfile, "w") as fw:
        for iid in flagged:
            print("frg iid {0} isdeleted 1".format(iid), file=fw)

    # Every contained read must lie within the store's iid range
    assert len(s) == ndeleted
    logging.debug(
        "A total of {0} contained reads flagged as deleted.".format(ndeleted))
    print("Now you can run:", file=sys.stderr)
    print("$ gatekeeper --edit {0} {1}".format(editfile, gkpStore),
          file=sys.stderr)
Пример #28
0
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist", default=0, type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"
    total = 0
    # Both handles are closed on exit, including when a lookup below raises
    with open(bedfile, "w") as fw, open(idsfile) as fp:
        for row in fp:
            gene_id = row.strip()
            hit = d[gene_id]
            tag, pos = get_tag(hit, None)
            seqid, start, end = pos
            # Extend the expected location by bdist on both sides, start >= 1
            start, end = max(start - bdist, 1), end + bdist
            fields = (seqid, start - 1, end, gene_id)
            fw.write("\t".join(str(x) for x in fields) + "\n")
            total += 1

    # Count the locations that do not overlap any feature in gapsbed
    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = int(popen(cmd).read())
    in_gaps = total - not_in_gaps
    sys.stderr.write("Ids in gaps: {1}".\
            format(total, percentage(in_gaps, total)) + "\n")
Пример #29
0
def removecontains(args):
    """
    %prog removecontains 4-unitigger/best.contains asm.gkpStore

    Remove contained reads from gkpStore. This will improve assembly contiguity
    without sacrificing accuracy, when using bogart unitigger.
    """
    p = OptionParser(removecontains.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    contains, gkpStore = args

    # First column of every non-comment row is the iid of a contained read
    s = set()
    with open(contains) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            s.add(int(row.split()[0]))

    cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore)
    gkpmsg = popen(cmd).read()
    last_iid = int(gkpmsg.strip().split()[-1])

    # Walk the flagged iids in ascending order instead of scanning every iid
    # up to last_iid; the output is the same
    flagged = [iid for iid in sorted(s) if 1 <= iid <= last_iid]
    ndeleted = len(flagged)
    editfile = "delete.edit"
    with open(editfile, "w") as fw:
        for iid in flagged:
            fw.write("frg iid {0} isdeleted 1".format(iid) + "\n")

    # Every contained read must lie within the store's iid range
    assert len(s) == ndeleted
    logging.debug("A total of {0} contained reads flagged as deleted."\
                  .format(ndeleted))
    sys.stderr.write("Now you can run:" + "\n")
    sys.stderr.write(
        "$ gatekeeper --edit {0} {1}".format(editfile, gkpStore) + "\n")
Пример #30
0
    def start(self):
        """
        Run the command returned by `self.build()`, store the job id parsed
        from its output in `self.jobid` ("-1" when there is no output), back
        up existing out/err files and log the submitted command.
        """
        # run the command and get the job-ID (important)
        output = popen(self.build(), debug=False).read()

        if output.strip() == "":
            self.jobid = "-1"
        else:
            self.jobid = re.search(self.pat, output).group("id")

        pieces = ["[{0}] {1}".format(self.jobid, self.cmd)]
        if self.infile:
            pieces.append(" < {0} ".format(self.infile))
        if self.outfile:
            backup(self.outfile)
            pieces.append(" > {0} ".format(self.outfile))
        if self.errfile:
            backup(self.errfile)
            pieces.append(" 2> {0} ".format(self.errfile))

        logging.debug("".join(pieces))
Пример #31
0
    def start(self):
        """
        Run the command returned by `self.build()`, store the job id parsed
        from its output in `self.jobid` ("-1" when there is no output), back
        up existing out/err files and log the submitted command.
        """
        # run the command and get the job-ID (important)
        output = popen(self.build(), debug=False).read()

        if output.strip() == "":
            self.jobid = "-1"
        else:
            self.jobid = re.search(self.pat, output).group("id")

        pieces = ["[{0}] {1}".format(self.jobid, self.cmd)]
        if self.infile:
            pieces.append(" < {0} ".format(self.infile))
        if self.outfile:
            backup(self.outfile)
            pieces.append(" > {0} ".format(self.outfile))
        if self.errfile:
            backup(self.errfile)
            pieces.append(" 2> {0} ".format(self.errfile))

        logging.debug("".join(pieces))
Пример #32
0
def intersectBed_wao(abedfile, bbedfile, minOverlap=0):
    """
    Run `intersectBed -wao` on two BED files and yield (a, b) BedLine pairs
    for every output row whose last column is at least `minOverlap`.
    `b` is None when the B-side columns do not form a valid BedLine.
    """
    abed = Bed(abedfile)
    bbed = Bed(bbedfile)
    for path, bed in ((abedfile, abed), (bbedfile, bbed)):
        sys.stderr.write(
            "`{0}` has {1} features.".format(path, len(bed)) + "\n")

    cmd = "intersectBed -wao -a {0} -b {1}".format(abedfile, bbedfile)
    # Column counts of each input tell where to cut the joined output rows
    acols = abed[0].nargs
    bcols = bbed[0].nargs
    for row in popen(cmd):
        atoms = row.split()
        if int(atoms[-1]) < minOverlap:
            continue

        a = BedLine("\t".join(atoms[:acols]))
        try:
            b = BedLine("\t".join(atoms[acols:acols + bcols]))
        except AssertionError:
            b = None

        yield a, b
Пример #33
0
def intersectBed_wao(abedfile, bbedfile, minOverlap=0):
    """
    Run `intersectBed -wao` on two BED files and yield (a, b) BedLine pairs
    for every output row whose last column is at least `minOverlap`.
    `b` is None when the B-side columns do not form a valid BedLine.
    """
    abed = Bed(abedfile)
    bbed = Bed(bbedfile)
    for path, bed in ((abedfile, abed), (bbedfile, bbed)):
        sys.stderr.write(
            "`{0}` has {1} features.".format(path, len(bed)) + "\n")

    cmd = "intersectBed -wao -a {0} -b {1}".format(abedfile, bbedfile)
    # Column counts of each input tell where to cut the joined output rows
    acols = abed[0].nargs
    bcols = bbed[0].nargs
    for row in popen(cmd):
        atoms = row.split()
        if int(atoms[-1]) < minOverlap:
            continue

        a = BedLine("\t".join(atoms[:acols]))
        try:
            b = BedLine("\t".join(atoms[acols:acols + bcols]))
        except AssertionError:
            b = None

        yield a, b
Пример #34
0
def must_open(filename,
              mode="r",
              checkexists=False,
              skipcheck=False,
              oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    Args:
        filename: a path, a list of paths (read-only), an `s3://` URL, or one
            of the special names "-", "stdin", "stdout", "stderr", "tmp".
        mode: file mode; lists and stdin require a mode containing "r",
            stdout/stderr require one containing "w".
        checkexists: when True (mode must be "w"), consult `check_exists` /
            `skipcheck` before overwriting an existing plain file.
        skipcheck: with `checkexists`, never prompt; only write when the file
            does not exist yet.
        oappend: with `checkexists`, open for appending instead of truncating.

    Returns:
        An open file-like object, or None when `checkexists` decided that the
        existing file must not be touched.
    """
    if isinstance(filename, list):
        assert "r" in mode

        import fileinput

        first = filename[0]
        if first.endswith(".bz2"):
            # bzcat accepts several paths, so the joined string is handled by
            # the .bz2 branch below
            filename = " ".join(filename)
        elif first.endswith(".gz"):
            import gzip

            # gzip.open() takes exactly one path, so a joined string can never
            # be opened; chain the members through fileinput instead
            return fileinput.input(
                filename,
                openhook=lambda path, _mode: gzip.open(path, "rt"))
        else:
            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3

        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        import gzip

        if "r" in mode and "t" not in mode and "b" not in mode:
            # Plain "r" means text for callers; gzip defaults to binary
            fp = gzip.open(filename, mode + "t")
        else:
            # Explicit "rt"/"rb" and every write/append mode are passed as is
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        else:
            import bz2

            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = ((not op.exists(filename))
                         if skipcheck else check_exists(filename, oappend))
            if overwrite:
                fp = open(filename, "a" if oappend else "w")
            else:
                logging.debug(
                    "File `{0}` already exists. Skipped.".format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
Пример #35
0
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir = opts.dir

    # Every argument is either an existing file or an accession to download
    fastas = []
    for name in args:
        if not op.exists(name):
            local = op.join(outdir, name + ".fasta")
            if not op.exists(local):  # already downloaded, do not fetch again
                fetch([name, "--skipcheck", "--outdir=" + outdir])
            name = local
        fastas.append(name)
    afasta, bfasta = fastas

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn") \
        + " -query {0} -subject {1}".format(afasta, bfasta) \
        + " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)

    # Parse the tabular BLAST output, keeping only sufficiently long hits
    records = [BlastLine(line) for line in popen(cmd).readlines()]
    hsps = [rec for rec in records if rec.hitlen >= GoodOverlap]
    chain_dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if not opts.nochain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=chain_dist, ydist=chain_dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    # The first HSP is the one that gets reported
    _, asize = Fasta(afasta).itersizes().next()
    _, bsize = Fasta(bfasta).itersizes().next()
    result = Overlap(hsps[0], asize, bsize)
    result.print_graphic(qreverse=opts.qreverse)
    print >> sys.stderr, str(result)

    return result
Пример #36
0
def check_exists_s3(s3_store_obj_name):
    """Return True when `aws s3 ls` prints at least one line for the object."""
    target = s3ify(s3_store_obj_name)
    # `wc -l` reduces the listing to a single integer line count
    listing = popen("aws s3 ls {0} | wc -l".format(target))
    return int(listing.read()) != 0
Пример #37
0
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # The docstring above is handed to OptionParser as the usage message.
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    workdir = opts.dir
    do_chain = not opts.nochain

    def locate(name):
        # Use the path as given when it exists; otherwise treat it as an
        # accession and download it into workdir unless already present
        if op.exists(name):
            return name
        local = op.join(workdir, name + ".fasta")
        if not op.exists(local):
            fetch([name, "--skipcheck", "--outdir=" + workdir])
        return local

    afasta = locate(afasta)
    bfasta = locate(bfasta)

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn")
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)

    blast_rows = popen(cmd).readlines()
    parsed = [BlastLine(row) for row in blast_rows]
    hsps = [hit for hit in parsed if hit.hitlen >= GoodOverlap]
    dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if do_chain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    top = hsps[0]

    # Only the sequence sizes are needed; the record ids are ignored
    asize = Fasta(afasta).itersizes().next()[1]
    bsize = Fasta(bfasta).itersizes().next()[1]
    o = Overlap(top, asize, bsize)
    o.print_graphic(qreverse=opts.qreverse)
    print >> sys.stderr, str(o)

    return o
Пример #38
0
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring above is also the usage text shown by print_help().
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix",
                 default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir = opts.dir
    suffix = opts.suffix
    evalue, pctid, hitlen = opts.evalue, opts.pctid, opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Each argument is a file when it exists, otherwise an accession that is
    # downloaded into outdir unless a previous run already left it there
    resolved = []
    for name in args:
        if not op.exists(name):
            local = op.join(outdir, ".".join((name, suffix)))
            if not op.exists(local):
                entrez([name, "--skipcheck", "--outdir=" + outdir])
            name = local
        resolved.append(name)
    afasta, bfasta = resolved

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no" \
        + " -query {0} -subject {1}".format(afasta, bfasta) \
        + " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    # Parse the tabular output and drop hits shorter than the cutoff
    records = [BlastLine(line) for line in popen(cmd).readlines()]
    hsps = [rec for rec in records if rec.hitlen >= hitlen]
    if not opts.nochain:
        logging.debug("Chain HSPs in the Blast output.")
        span = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=span, ydist=span)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    _, asize = Fasta(afasta).itersizes().next()
    _, bsize = Fasta(bfasta).itersizes().next()
    result = Overlap(hsps[0], asize, bsize, cutoff, qreverse=opts.qreverse)
    result.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print >> fw, str(result)
        fw.close()

    return result
Пример #39
0
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    # The docstring above is passed to OptionParser as the usage text.
    from jcvi.apps.console import green

    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    # Cache the first column of best.contains as a pickled set of read ids
    cache = bestcontains + ".cache"
    if need_update(bestcontains, cache):
        fp = open(bestcontains)
        fw = open(cache, "w")
        exclude = set(int(row.split()[0]) for row in fp if row[0] != "#")
        cPickle.dump(exclude, fw)
        fw.close()

    exclude = cPickle.load(open(cache))
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid) \
        + " -E {0}".format(opts.maxerr)
    parsed = (OverlapLine(row) for row in popen(cmd))
    frags = [r for r in parsed if r.bid not in exclude]

    # Also include to query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda frag: frag.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid) \
        + " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    fp.next()  # the first output line is skipped
    size = int(fp.next().split()[-1])

    # Determine size of canvas
    xmin = min(frag.ahang for frag in frags)
    xmax = max(frag.bhang for frag in frags)
    ratio = (-xmin + size + xmax) / canvas

    for frag in frags:
        span = -frag.ahang + size + frag.bhang
        lead = (frag.ahang - xmin) / ratio
        width = span / ratio
        bar = "-" * width
        # The arrow head marks the orientation of the fragment
        bar = bar[:-1] + ">" if frag.orientation == "N" else "<" + bar[1:]
        if frag.ahang == 0 and frag.bhang == 0:
            bar = green(bar)
        trail = canvas - lead - width
        sys.stdout.write(" " * lead + bar + " " * trail)
        print >> sys.stdout, \
            "{0} ({1})".format(str(frag.bid).rjust(10), frag.erate_adj)
Пример #40
0
def get_grid_engine():
    """Return "PBS" when `qsub --version` output mentions PBS, else "SGE"."""
    version_text = popen("qsub --version", debug=False).read()
    if "PBS" in version_text:
        return "PBS"
    return "SGE"
Пример #41
0
    def allocate(self, info, chr, start_id, end_id, id_table):
        """
        Assign gene ids to the records in `info` that fall between two
        flanking ids on one chromosome.

        info: ordered records; the code reads `.start` of the first, `.end`
            of the last, and `.accn` of each (presumably BedLine objects --
            confirm against the caller).
        chr: chromosome name, converted to a number with chr_number().
        start_id, end_id: flanking ids; only ids strictly between them are
            considered free spots. end_id must be greater than start_id.
        id_table: mapping updated in place with accn -> new gene name.

        Ids handed out are also added to self.black so they are not reused.
        Progress is reported on stderr.
        """

        start_bp = info[0].start
        end_bp = info[-1].end

        current_chr = chr_number(chr)
        needed = info
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        # Free ids are those strictly between the flanks that are not taken
        spots = end_id - start_id - 1
        available = [
            x for x in xrange(start_id + 1, end_id)
            if (current_chr, x) not in self.black
        ]

        message = "{0} need {1} ids, has {2} spots ({3} available)".\
                format(chr, len(needed), spots, len(available))

        start_gene = gene_name(current_chr, start_id, prefix=self.prefix, \
                pad0=self.pad0, uc=self.uc)
        end_gene = gene_name(current_chr,
                             end_id,
                             prefix=self.prefix,
                             pad0=self.pad0,
                             uc=self.uc)
        message += " between {0} - {1}\n".format(start_gene, end_gene)

        assert end_bp > start_bp

        # Intersect the gap file with the single interval spanned by `info`
        b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
        cmd = "echo '{0}' |".format(b)
        cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = list(BedLine(x) for x in popen(cmd, debug=False))
        ngaps = len(gaps)

        # Each gap is repeated once per id it should reserve, derived from
        # its score column (presumably the gap length in bp -- confirm)
        gapsexpanded = []
        GeneDensity = 10000.  # assume 10Kb per gene
        for gap in gaps:
            gap_bp = int(gap.score)
            gap_ids = int(round(gap_bp / GeneDensity))
            gapsexpanded += [gap] * gap_ids

        # Genes and gap placeholders compete for ids in positional order
        lines = sorted(info + gapsexpanded, key=lambda x: x.start)

        message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
                format(start_bp, end_bp, ngaps, len(lines))

        needed = lines
        stride = Stride(needed, available)
        conf = stride.conf
        message += " stride: {0}".format(conf)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is None:  # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            if rank > magic + firstdigit:
                rank -= firstdigit
            available = []
            # Walk forward in fixed steps until enough unused ids are found
            while len(available) != nneeded:
                rank += step
                if (current_chr, rank) in self.black:  # avoid blacklisted ids
                    continue
                available.append(rank)

        else:  # follow the best stride
            available = stride.available
            if start_id == 0:  # follow right flank at start of chr
                available = available[-nneeded:]
            else:  # follow left flank otherwise
                available = available[:nneeded]

        # Finally assign the ids
        assert len(needed) == len(available)
        for b, rank in zip(needed, available):
            name = gene_name(current_chr, rank, prefix=self.prefix, \
                    pad0=self.pad0, uc=self.uc)
            print >> sys.stderr, "\t".join((str(b), name))
            # Gap placeholders are in `needed` too, so they also get entries
            id_table[b.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr
Пример #42
0
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # The docstring above is handed to OptionParser as the usage message.
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix", default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    workdir = opts.dir
    do_chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    def locate(name):
        # An existing path is used as is; anything else is taken to be an
        # accession and downloaded unless workdir already holds the file
        if op.exists(name):
            return name
        local = op.join(workdir, ".".join((name, suffix)))
        if not op.exists(local):
            entrez([name, "--skipcheck", "--outdir=" + workdir])
        return local

    afasta = locate(afasta)
    bfasta = locate(bfasta)

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    blast_rows = popen(cmd).readlines()
    parsed = [BlastLine(row) for row in blast_rows]
    hsps = [hit for hit in parsed if hit.hitlen >= hitlen]
    if do_chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    top = hsps[0]

    # Only the sequence sizes are needed; the record ids are ignored
    asize = Fasta(afasta).itersizes().next()[1]
    bsize = Fasta(bfasta).itersizes().next()[1]
    o = Overlap(top, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print >> fw, str(o)
        fw.close()

    return o
Пример #43
0
def check_exists_s3(s3_store_obj_name):
    """Tell whether `aws s3 ls` reports any entry for the given object."""
    s3_path = s3ify(s3_store_obj_name)
    pipe = popen("aws s3 ls {0} | wc -l".format(s3_path))
    # The pipeline prints a single number: how many lines `ls` produced
    n_lines = int(pipe.read())
    return n_lines != 0
Пример #44
0
Файл: ca.py Проект: zjwang6/jcvi
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    # Cache the first column of best.contains as a serialized set of read ids
    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        with open(bestcontains) as fp:
            exclude = {int(row.split()[0]) for row in fp if row[0] != "#"}
        # NOTE(review): dump/load are assumed to be pickle's, which require
        # binary file objects under Python 3 -- confirm against the imports
        with open(bestcontainscache, "wb") as fw:
            dump(exclude, fw)

    with open(bestcontainscache, "rb") as fp:
        exclude = load(fp)
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)
    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include to query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    next(fp)  # the first output line is skipped
    # Python 3 iterators have no .next() method; use the builtin instead
    size = int(next(fp).split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    # Floor division keeps every width an integer number of characters; the
    # lower bound avoids a zero ratio when the span is narrower than the canvas
    ratio = max(1, xsize // canvas)

    for f in frags:
        fsize = -f.ahang + size + f.bhang
        # String repetition needs ints, hence floor division plus int()
        a = int((f.ahang - xmin) // ratio)
        b = int(fsize // ratio)
        t = "-" * b
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        if f.ahang == 0 and f.bhang == 0:
            t = "[green]{}".format(t)
        c = canvas - a - b
        printf(
            "{}{}{}{} ({})".format(" " * a, t, " " * c,
                                   str(f.bid).rjust(10), f.erate_adj), )
Пример #45
0
def get_grid_engine():
    """
    Guess the grid engine from the output of `qsub --version`.

    Returns "PBS" when the command output contains "PBS", otherwise "SGE".
    """
    cmd = "qsub --version"
    # Inspect what qsub printed, not the command string, which never
    # contains "PBS"
    ret = popen(cmd, debug=False).read()
    return "PBS" if "PBS" in ret else "SGE"
Пример #46
0
    def allocate(self, info, chr, start_id, end_id, id_table, extended_stride=False):
        """
        Assign gene ids to the records in `info` that fall between two
        flanking ids on one chromosome.

        info: ordered records; the code reads `.start` of the first, `.end`
            of the last, and `.accn` of each (presumably BedLine objects --
            confirm against the caller).
        chr: chromosome name, converted to a number with chr_number().
        start_id, end_id: flanking ids; only ids strictly between them are
            considered free spots. end_id must be greater than start_id.
        id_table: mapping updated in place with accn -> new gene name.
        extended_stride: forwarded to Stride as its `extended` argument.

        Ids handed out are also added to self.black so they are not reused.
        Progress is reported on stderr.
        """

        start_bp = info[0].start
        end_bp = info[-1].end

        current_chr = chr_number(chr)
        needed = info
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        # Free ids are those strictly between the flanks that are not taken
        spots = end_id - start_id - 1
        available = [x for x in xrange(start_id + 1, end_id) if
                            (current_chr, x) not in self.black]

        message = "{0} need {1} ids, has {2} spots ({3} available)".\
                format(chr, len(needed), spots, len(available))

        start_gene = gene_name(current_chr, start_id, prefix=self.prefix, \
                pad0=self.pad0, uc=self.uc)
        end_gene = gene_name(current_chr, end_id, prefix=self.prefix,
                pad0=self.pad0, uc=self.uc)
        message += " between {0} - {1}\n".format(start_gene, end_gene)

        assert end_bp > start_bp

        # Intersect the gap file with the single interval spanned by `info`
        b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
        cmd = "echo '{0}' |".format(b)
        cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = list(BedLine(x) for x in popen(cmd, debug=False))
        ngaps = len(gaps)

        # Each gap is repeated once per id it should reserve, derived from
        # its score column (presumably the gap length in bp -- confirm)
        gapsexpanded = []
        GeneDensity = 10000.  # assume 10Kb per gene
        for gap in gaps:
            gap_bp = int(gap.score)
            gap_ids = int(round(gap_bp / GeneDensity))
            gapsexpanded += [gap] * gap_ids

        # Genes and gap placeholders compete for ids in positional order
        lines = sorted(info + gapsexpanded, key=lambda x: x.start)

        message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
                format(start_bp, end_bp, ngaps, len(lines))

        needed = lines
        stride = Stride(needed, available, extended=extended_stride)
        conf = stride.conf
        message += " stride: {0}".format(conf)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is None: # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            if rank > magic + firstdigit:
                rank -= firstdigit
            available = []
            # Walk forward in fixed steps until enough unused ids are found
            while len(available) != nneeded:
                rank += step
                if (current_chr, rank) in self.black:  # avoid blacklisted ids
                    continue
                available.append(rank)

        else: # follow the best stride
            available = stride.available
            if start_id == 0:  # follow right flank at start of chr
                available = available[- nneeded:]
            else:  # follow left flank otherwise
                available = available[:nneeded]

        # Finally assign the ids
        assert len(needed) == len(available)
        for b, rank in zip(needed, available):
            name = gene_name(current_chr, rank, prefix=self.prefix, \
                    pad0=self.pad0, uc=self.uc)
            print >> sys.stderr, "\t".join((str(b), name))
            # Gap placeholders are in `needed` too, so they also get entries
            id_table[b.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr