def write_filtered(vcffile, lhome, store=None):
    """
    Run lobSTR's `lobSTR_filter_vcf.py` on a VCF and return the output name.

    vcffile: path of the VCF to filter. An `s3://` location is first passed
        through `pull_from_s3` and the value it returns is used instead.
    lhome: lobSTR home directory; the filter script is looked up under
        `<lhome>/scripts/`.
    store: when truthy, the filtered file is also handed to `push_to_s3`.

    The returned name is the basename of the input with `.vcf` replaced by
    `.filtered.vcf`.
    """
    is_remote = vcffile.startswith("s3://")
    if is_remote:
        vcffile = pull_from_s3(vcffile)

    filteredvcf = op.basename(vcffile).replace(".vcf", ".filtered.vcf")

    # Only the locus-level coverage and score filters are active. The stricter
    # options (--loc-call-rate 0.8 --loc-max-ref-length 80, and
    # --call-cov 5 --call-log-score 0.8 --call-dist-end 20) are left disabled.
    pieces = (
        "python {}/scripts/lobSTR_filter_vcf.py".format(lhome),
        "--vcf {}".format(vcffile),
        "--loc-cov 5 --loc-log-score 0.8",
    )
    sh(" ".join(pieces), outfile=filteredvcf)

    if store:
        push_to_s3(store, filteredvcf)
    return filteredvcf
def must_open(filename, mode="r", checkexists=False, skipcheck=False, oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    filename: a path, an `s3://` location (fetched with `pull_from_s3` and
        the returned value opened), one of the special names `-`/`stdin`,
        `stdout`, `stderr`, `tmp` (only with mode exactly "w"), or a list of
        paths (read mode only).
    mode: open mode. `.gz` files opened for reading default to text unless
        the mode already says "b" or "t".
    checkexists: for plain files, guard against overwriting; requires
        mode == "w".
    skipcheck: with checkexists, write only when the file does not exist yet
        instead of consulting `check_exists`.
    oappend: with checkexists, open in append mode ("a") rather than "w".

    Returns a file-like object, a `fileinput.FileInput` for lists of plain or
    `.gz` files, or None when checkexists decided the file must be skipped.
    Raises ValueError when a compressed file is requested with a mode that is
    neither read nor write.
    """
    if isinstance(filename, list):
        assert "r" in mode
        import fileinput

        first = filename[0]
        if first.endswith(".gz"):
            # gzip.open() takes a single path, so a space-joined name can never
            # work here. Chain the members, opening each through must_open().
            return fileinput.input(
                filename, openhook=lambda path, _mode: must_open(path, mode)
            )
        elif first.endswith(".bz2"):
            # bzcat accepts several files on one command line
            filename = " ".join(filename)
        else:
            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3

        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        # delete=False: the file is kept on disk after the handle is closed
        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        import gzip

        if "r" in mode:
            # gzip.open() is binary by default; only add "t" when the caller
            # did not pick binary/text explicitly ("rbt" is an invalid mode).
            explicit = "b" in mode or "t" in mode
            fp = gzip.open(filename, mode if explicit else mode + "t")
        elif "w" in mode:
            fp = gzip.open(filename, mode)
        else:
            raise ValueError(
                "Unsupported mode `{0}` for `{1}`".format(mode, filename)
            )

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif "w" in mode:
            import bz2

            fp = bz2.BZ2File(filename, mode)
        else:
            raise ValueError(
                "Unsupported mode `{0}` for `{1}`".format(mode, filename)
            )

    else:
        if checkexists:
            assert mode == "w"
            if skipcheck:
                overwrite = not op.exists(filename)
            else:
                overwrite = check_exists(filename, oappend)

            if overwrite:
                fp = open(filename, "a" if oappend else "w")
            else:
                logging.debug(
                    "File `{0}` already exists. Skipped.".format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output rather than free-form documentation.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid", default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation", default=False, action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    # The BAM comes from an option, not a positional argument; presumably
    # registered by set_aws_opts() -- confirm against OptionParser.
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        # Fixed chromosome, lobSTR location and index; the VCF is moved into
        # `lobstr_results/` and the stats file is removed.
        cmd, vcf_file = allelotype_on_chr(bamfile, "chr4", "/mnt/software/lobSTR/",
                                          "TREDs", haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    # Require the full scheme so that a local file whose name merely starts
    # with "s3" is not mistaken for a remote object.
    s3mode = bamfile.startswith("s3://")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)  # everything below uses paths relative to workdir

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: "<exec_id>_<sample_id>" from whichever of the two is set,
    # otherwise the BAM basename up to its first dot.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Only the output of the LAST index is probed to decide whether the
        # whole computation can be skipped.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try `x.bam.bai` first, then `x.bai`, then build the index.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # maxsplit=1 strips only the final extension; without it a
                # dot anywhere earlier in the URL truncates the key.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list(...) keeps this working on Python 3, where range() is not a list
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chrom in chrs:
            # Two chained steps per chromosome: allelotype, then filter
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # Merge the per-chromosome filtered VCFs into one bgzipped file
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    # Cleanup runs once, after every index has been processed, because it
    # deletes the BAM that each index still needs.
    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output rather than free-form documentation.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    # The BAM comes from an option, not a positional argument; presumably
    # registered by set_aws_opts() -- confirm against OptionParser.
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    # Require the full scheme so that a local file whose name merely starts
    # with "s3" is not mistaken for a remote object.
    s3mode = bamfile.startswith("s3://")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)  # everything below uses paths relative to workdir

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: "<exec_id>_<sample_id>" from whichever of the two is set,
    # otherwise the BAM basename up to its first dot.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Only the output of the LAST index is probed to decide whether the
        # whole computation can be skipped.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try `x.bam.bai` first, then `x.bai`, then build the index.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # maxsplit=1 strips only the final extension; without it a
                # dot anywhere earlier in the URL truncates the key.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list(...) keeps this working on Python 3, where range() is not a list
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chrom in chrs:
            # Two chained steps per chromosome: allelotype, then filter
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # Merge the per-chromosome filtered VCFs into one bgzipped file
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    # Cleanup runs once, after every index has been processed, because it
    # deletes the BAM that each index still needs.
    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
def must_open(filename, mode="r", checkexists=False, skipcheck=False,
              oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    filename: a path, an `s3://` location (fetched with `pull_from_s3` and
        the returned value opened), one of the special names `-`/`stdin`,
        `stdout`, `stderr`, `tmp` (only with mode exactly "w"), or a list of
        paths (read mode only).
    mode: open mode; compressed files are read through `zcat`/`bzcat`.
    checkexists: for plain files, guard against overwriting; requires
        mode == "w".
    skipcheck: with checkexists, write only when the file does not exist yet
        instead of consulting `check_exists`.
    oappend: with checkexists, open in append mode ("a") rather than "w".

    Returns a file-like object, a `fileinput.FileInput` for a list of
    uncompressed files, or None when checkexists decided the file must be
    skipped. Raises ValueError when a compressed file is requested with a
    mode that is neither read nor write.
    """
    if isinstance(filename, list):
        assert "r" in mode

        if filename[0].endswith((".gz", ".bz2")):
            # zcat/bzcat accept several files, so a space-joined name works
            filename = " ".join(filename)  # allow opening multiple gz/bz2 files
        else:
            import fileinput
            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3
        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile
        # Pass the mode through: the default is binary ("w+b"), which rejects
        # text writes on Python 3. delete=False keeps the file after close.
        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        if "r" in mode:
            cmd = "zcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif "w" in mode:
            import gzip
            fp = gzip.open(filename, mode)
        else:
            raise ValueError(
                "Unsupported mode `{0}` for `{1}`".format(mode, filename))

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif "w" in mode:
            import bz2
            fp = bz2.BZ2File(filename, mode)
        else:
            raise ValueError(
                "Unsupported mode `{0}` for `{1}`".format(mode, filename))

    else:
        if checkexists:
            assert mode == "w"
            if skipcheck:
                overwrite = not op.exists(filename)
            else:
                overwrite = check_exists(filename, oappend)

            if overwrite:
                fp = open(filename, "a" if oappend else "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."
                              .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output rather than free-form documentation.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    # Require the full scheme so that a local file whose name merely starts
    # with "s3" is not mistaken for a remote object.
    s3mode = bamfile.startswith("s3://")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)  # everything below uses paths relative to workdir

    # Output prefix: --prefix if given, else the BAM basename up to its
    # first dot.
    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Only the output of the LAST index is probed to decide whether the
        # whole computation can be skipped.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try `x.bam.bai` first, then `x.bai`, then build the index.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # maxsplit=1 strips only the final extension; without it a
                # dot anywhere earlier in the URL truncates the key.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    # list(...) keeps this working on Python 3, where range() is not a list
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chrom in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        # Merge the per-chromosome VCFs into one bgzipped file
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    # Cleanup runs once, after every index has been processed: it wipes every
    # file in the current (work) directory, including the BAM.
    if opts.cleanup:
        sh("rm -f *")
def must_open(filename, mode="r", checkexists=False, skipcheck=False,
              oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    filename: a path, an `s3://` location (fetched with `pull_from_s3` and
        the returned value opened), one of the special names `-`/`stdin`,
        `stdout`, `stderr`, `tmp` (only with mode exactly "w"), or a list of
        paths (read mode only).
    mode: open mode; compressed files are read through a `gunzip -c` /
        `bzcat -c` subprocess whose stdout is returned.
    checkexists: for plain files, guard against overwriting; requires
        mode == "w".
    skipcheck: with checkexists, write only when the file does not exist yet
        instead of consulting `check_exists`.
    oappend: with checkexists, open in append mode ("a") rather than "w".

    Returns a file-like object, a `fileinput.FileInput` for a list of
    uncompressed files, or None when checkexists decided the file must be
    skipped. Raises ValueError when a compressed file is requested with a
    mode that is neither read nor write.
    """
    if isinstance(filename, list):
        assert "r" in mode

        if filename[0].endswith((".gz", ".bz2")):
            # The name is interpolated into a shell command below, so a
            # space-joined list decompresses all members in order.
            filename = " ".join(filename)  # allow opening multiple gz/bz2 files
        else:
            import fileinput

            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3

        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        # Pass the mode through: the default is binary ("w+b"), which rejects
        # text writes. delete=False keeps the file on disk after close.
        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        if "r" in mode:
            cmd = "gunzip -c %s" % filename
            from subprocess import Popen, PIPE

            # universal_newlines=True makes the pipe yield text lines
            fp = Popen(cmd, bufsize=1, stdout=PIPE, shell=True,
                       universal_newlines=True).stdout
        elif "w" in mode:
            import gzip

            fp = gzip.open(filename, mode)
        else:
            raise ValueError(
                "Unsupported mode `{0}` for `{1}`".format(mode, filename))

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat -c %s" % filename
            from subprocess import Popen, PIPE

            fp = Popen(cmd, bufsize=1, stdout=PIPE, shell=True,
                       universal_newlines=True).stdout
        elif "w" in mode:
            import bz2

            fp = bz2.open(filename, mode)
        else:
            raise ValueError(
                "Unsupported mode `{0}` for `{1}`".format(mode, filename))

    else:
        if checkexists:
            assert mode == "w"
            if skipcheck:
                overwrite = not op.exists(filename)
            else:
                overwrite = check_exists(filename, oappend)

            if overwrite:
                fp = open(filename, "a" if oappend else "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."
                              .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp