def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    lhome = opts.lobstr_home
    store = opts.output_path

    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]

    vcffiles = [x for x in vcffiles if ".filtered." not in x]

    run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for _ in p.map_async(run_filter, run_args).get():
        continue

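# A minimal usage sketch (paths hypothetical): filtervcf accepts either a
# single VCF or a text file listing one VCF path per line, and fans the work
# out over a Pool; run_filter receives (vcffile, lobstr_home, store) tuples,
# with store set to False for files that are not on S3.
#
#   python -m jcvi.variation.str filtervcf NA12878.hg38.vcf.gz \
#       --lobstr_home /mnt/software/lobSTR
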
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option("--svonly", default=False, action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        # FASTA indexing uses faidx (samtools index is for BAM/CRAM)
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    if not bamfile.endswith(".bam"):
        bamfiles = [x.strip() for x in open(bamfile)]
    else:
        bamfiles = [bamfile]

    if store:
        computed = ls_s3(store)
        computed = [op.basename(x).split(".")[0] for x in computed
                    if x.endswith(".depth")]
        remaining_samples = [x for x in bamfiles
                             if op.basename(x).split(".")[0] not in computed]
        logging.debug("Already computed on `{}`: {}".format(
            store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(chrMfa, bamfile, opts,
                 realignonly=opts.realignonly, svonly=opts.svonly,
                 store=store, cleanup=cleanup)

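# When the second argument does not end in ".bam", it is treated as a manifest
# listing one BAM per line, e.g. (hypothetical paths):
#
#   s3://bucket/project/sample1.bam
#   s3://bucket/project/sample2.bam
#
# Samples whose <sample>.depth output already exists under the store are
# skipped, so an interrupted batch can simply be rerun.
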
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print("\n".join(uids), file=fw)
        fw.close()

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for _ in p.map_async(run_compile, run_args).get():
        continue

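# Usage sketch (file names hypothetical): vcfs.txt lists one lobSTR VCF per
# line; STR.ids is built once in --workdir from the requested databases and
# reused on subsequent runs.
#
#   python -m jcvi.variation.str compilevcf vcfs.txt --db hg38,hg38-named
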
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.output_path
    computed = ls_s3(store)
    fp = open(samplesfile)
    skipped = total = 0
    for row in fp:
        total += 1
        sample, s3file = row.strip().split(",")[:2]
        exec_id, sample_id = sample.split("_")
        bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")
        gzfile = sample + ".{0}.vcf.gz".format("hg38")
        if gzfile in computed:
            skipped += 1
            continue

        print(
            opts.sep.join(
                "python -m jcvi.variation.str lobstr".split()
                + [
                    "hg38",
                    "--input_bam_path",
                    bamfile,
                    "--output_path",
                    store,
                    "--sample_id",
                    sample_id,
                    "--workflow_execution_id",
                    exec_id,
                    "--lobstr_home",
                    opts.lobstr_home,
                    "--workdir",
                    opts.workdir,
                ]
            )
        )
    fp.close()
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))

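# Each samples.csv row is "sample-name,s3-location", where sample-name joins
# workflow_execution_id and sample_id with an underscore. A hypothetical row:
#
#   12345_NA12878,s3://bucket/project/NA12878.hg38.vcf.gz
#
# The VCF location is rewritten to the matching .bam, and rows whose
# <sample>.hg38.vcf.gz already exists in the store are skipped.
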
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print("\n".join(uids), file=fw)
        fw.close()

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        fw = open("header.ids", "w")
        print(",".join(dipuids), file=fw)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    # run(run_args[0])
    for _ in p.map_async(run, run_args).get():
        continue

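# The diploid header carries two allele columns per locus, so a hypothetical
# STR.ids entry "chr1_100_CAG" contributes the header.ids columns
# "chr1_100_CAG.1" and "chr1_100_CAG.2".
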
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print("\n".join(uids), file=fw)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    for _ in p.map_async(run, run_args).get():
        continue

def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.store
    computed = ls_s3(store)
    fp = open(samplesfile)
    skipped = total = 0
    for row in fp:
        total += 1
        sample, s3file = row.strip().split(",")[:2]
        bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")
        gzfile = sample + ".{0}.vcf.gz".format("hg38")
        if gzfile in computed:
            skipped += 1
            continue

        print(opts.sep.join(
            "python -m jcvi.variation.str lobstr".split()
            + [bamfile, "hg38", "--prefix", sample,
               "--workdir", opts.workdir, "--cleanup",
               "--store", opts.store]))
    fp.close()
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))

def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid", default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation", default=False, action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        cmd, vcf_file = allelotype_on_chr(bamfile, "chr4", "/mnt/software/lobSTR/",
                                          "TREDs", haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # Swap only the final extension (.bam -> .bai)
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))

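# Sketch of the per-index dependency chain that MakeManager builds (file
# names hypothetical, for prefix "NA12878" and index "hg38"):
#
#   NA12878.bam    -> <chr>.vcf            (allelotype, one target per chromosome)
#   <chr>.vcf      -> <chr>.filtered.vcf   (filtervcf)
#   *.filtered.vcf -> NA12878.hg38.vcf.gz  (vcf-concat | vcf-sort | bgzip)
#
# mm.run(cpus=opts.cpus) then executes these targets in parallel, and the
# final .vcf.gz is pushed back to S3 when the input came from S3.
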
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # Swap only the final extension (.bam -> .bai)
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))

def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # Swap only the final extension (.bam -> .bai)
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        sh("rm -f *")