def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so its wording is deliberately left untouched.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid", default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation", default=False, action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    # NOTE(review): input_bam_path, output_path, workdir, nocleanup,
    # workflow_execution_id and sample_id are presumably registered by
    # set_aws_opts() -- confirm against the option parser.
    bamfile = opts.input_bam_path

    # At least one lobSTR index and a BAM path are required.
    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:
        # Simulation mode: genotype chr4 only against the fixed "TREDs" index,
        # keep the VCF under lobstr_results/ and drop the stats file.
        cmd, vcf_file = allelotype_on_chr(bamfile, "chr4",
                                          "/mnt/software/lobSTR/", "TREDs",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    # A remote lobSTR home is replaced by whatever pull_from_s3() returns.
    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: "<exec_id>_<sample_id>" (whichever are set), otherwise
    # the BAM basename up to its first dot.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # The result for the LAST index is used as the "already done" marker.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try "<name>.bam.bai" first, then "<name>.bai"; index locally
            # with samtools when neither exists remotely.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # Strip only the final extension; an unbounded rsplit(".")
                # would cut the key at the first dot anywhere in the URL.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list() keeps the concatenation valid on Python 3, where range() is
    # no longer a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chrom in chrs:
            # Per chromosome: allelotype, then filter the resulting VCF.
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # Merge the per-chromosome filtered VCFs into one sorted, bgzipped VCF.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        # Runs once after all indices; `mm` is the manager of the last index.
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so its wording is deliberately left untouched.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    # Need the BAM file plus at least one lobSTR index.
    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]

    s3mode = bamfile.startswith("s3")
    # NOTE(review): store, workdir and cleanup are presumably registered by
    # set_aws_opts() -- confirm against the option parser.
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    # Output prefix: --prefix if given, else the BAM basename up to its
    # first dot.
    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # The result for the LAST index is used as the "already done" marker.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try "<name>.bam.bai" first, then "<name>.bai"; index locally
            # with samtools when neither exists remotely.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # Strip only the final extension; an unbounded rsplit(".")
                # would cut the key at the first dot anywhere in the URL.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    # list() keeps the concatenation valid on Python 3, where range() is
    # no longer a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chrom in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        # Merge the per-chromosome VCFs into one sorted, bgzipped VCF.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        # Wipes every file in the current directory, i.e. the workdir
        # entered via os.chdir() above.
        sh("rm -f *")
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so its wording is deliberately left untouched.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    # NOTE(review): input_bam_path, output_path, workdir, nocleanup,
    # workflow_execution_id and sample_id are presumably registered by
    # set_aws_opts() -- confirm against the option parser.
    bamfile = opts.input_bam_path

    # At least one lobSTR index and a BAM path are required.
    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    # A remote lobSTR home is replaced by whatever pull_from_s3() returns.
    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: "<exec_id>_<sample_id>" (whichever are set), otherwise
    # the BAM basename up to its first dot.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # The result for the LAST index is used as the "already done" marker.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try "<name>.bam.bai" first, then "<name>.bai"; index locally
            # with samtools when neither exists remotely.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # Strip only the final extension; an unbounded rsplit(".")
                # would cut the key at the first dot anywhere in the URL.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list() keeps the concatenation valid on Python 3, where range() is
    # no longer a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chrom in chrs:
            # Per chromosome: allelotype, then filter the resulting VCF.
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # Merge the per-chromosome filtered VCFs into one sorted, bgzipped VCF.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        # Runs once after all indices; `mm` is the manager of the last index.
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))