def make_trio_vcfs(vcfmap, options):
    """Merge the per-sample VCFs of each graph into one VCF using rtg.

    The pedigree text in ``options.ped`` is written to
    ``<out_dir>/predigree.ped``.  Then, for every region and graph, each
    sample VCF is filtered and normalised with bcftools into three kinds
    ("snp", "indel", "all"), bgzipped and tabix-indexed.  When a graph has
    at least three samples, the files of each kind are merged with
    ``rtg vcfmerge``.

    Args:
        vcfmap: nested mapping ``region -> sample -> graph -> vcf path``.
        options: object providing ``out_dir``, ``ped``, ``clip`` (value
            handed to ``bcftools view -R``, or None) and ``chrom_fa_path``.

    Returns:
        Nested dict ``region -> graph -> kind -> merged vcf.gz path``.
        Graphs with fewer than three samples get no entry.
    """
    robust_makedirs(options.out_dir)
    # The "predigree.ped" spelling is deliberate here: do_mendel reads the
    # file back under exactly this name.
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    # bcftools view selector per kind; looked up by equality (the previous
    # code compared strings with "is", which tests identity, not value)
    view_flags = {"snp": "-v snps,mnps", "indel": "-V snps,mnps", "all": ""}

    mergetable = dict()
    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)
        work_dir = os.path.join(region_dir, "input_vcf")
        merge_dir = os.path.join(region_dir, "merged_vcf")

        # round up all samples for each graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                bygraph.setdefault(graph, dict())[sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)
            input_vcfs = {"snp": [], "indel": [], "all": []}
            for sample, pvcf in sd.items():
                for kind in input_vcfs:
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    vstr = view_flags[kind]
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}"
                        .format(pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            # only merge complete trios (or larger families)
            complete = all(len(v) == len(sd) for v in input_vcfs.values())
            if len(sd) >= 3 and complete:
                mergetable[region][graph] = dict()
                for kind in input_vcfs:
                    out_vcf = os.path.join(
                        merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    # remove any stale output from a previous run first
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(
                        " ".join(input_vcfs[kind]), out_vcf), fail_hard=True)
                    mergetable[region][graph][kind] = out_vcf

    return mergetable
def make_trio_vcfs(vcfmap, options):
    """Build merged multi-sample VCFs (one per region, graph and kind) with rtg.

    Steps, as performed below:
      1. write ``options.ped`` to ``<out_dir>/predigree.ped``;
      2. regroup ``vcfmap`` (``region -> sample -> graph -> vcf``) by graph;
      3. for each sample VCF produce "snp", "indel" and "all" versions via
         ``bcftools view | bcftools norm``, then bgzip + tabix them;
      4. if a graph has three or more samples, ``rtg vcfmerge`` each kind.

    Args:
        vcfmap: nested mapping ``region -> sample -> graph -> vcf path``.
        options: needs ``out_dir``, ``ped``, ``clip`` (may be None) and
            ``chrom_fa_path``.

    Returns:
        dict ``region -> graph -> kind -> path of merged vcf.gz``.
    """

    def bcftools_selector(kind):
        """Return the bcftools view type filter for *kind*."""
        # equality, not identity: "is" on strings only works by accident
        # of interning
        if kind == "snp":
            return "-v snps,mnps"
        if kind == "indel":
            return "-V snps,mnps"
        return ""

    robust_makedirs(options.out_dir)
    # file name (including its spelling) is shared with do_mendel
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    mergetable = dict()
    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)

        # round up all samples for each graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                if graph not in bygraph:
                    bygraph[graph] = dict()
                bygraph[graph][sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            work_dir = os.path.join(region_dir, "input_vcf")
            merge_dir = os.path.join(region_dir, "merged_vcf")
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)

            input_vcfs = {"snp": [], "indel": [], "all": []}
            for sample, pvcf in sd.items():
                for kind in input_vcfs:
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    vstr = bcftools_selector(kind)
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}".format(
                        pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            n_samples = len(sd)
            if n_samples < 3:
                # not a full trio: nothing to merge for this graph
                continue
            if any(len(paths) != n_samples for paths in input_vcfs.values()):
                continue

            mergetable[region][graph] = dict()
            for kind in input_vcfs:
                out_vcf = os.path.join(
                    merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                run("rm -f {}".format(out_vcf))
                run("rtg vcfmerge {} -o {}".format(" ".join(input_vcfs[kind]), out_vcf),
                    fail_hard=True)
                mergetable[region][graph][kind] = out_vcf

    return mergetable
def trio_stats(sample_vcf, filter_xref, ignore_genotype, options):
    """Compute trio concordance statistics for a sample VCF.

    The trio is hard-coded: ``sample_vcf`` must belong to NA12878 and the
    other two VCF paths are derived by substituting NA12879 and NA12877
    into its path.  ``scripts/trioConcordance.py`` is run on the three
    files and the first three tokens of its first output line are parsed.

    Args:
        sample_vcf: path of the NA12878 VCF.
        filter_xref: when True, lines containing "XREF" are removed (via
            grep) from each VCF that exists before comparison.
        ignore_genotype: when truthy, ``-g`` is passed to trioConcordance.
        options: object whose ``sample`` attribute must equal "NA12878".

    Returns:
        dict with integer "GOOD" and "BAD" and float "RATIO"; all three
        are -1 when the concordance run or its parsing fails.
    """
    # we are hardcoding trio information here
    # (== rather than "is": identity of equal strings is not guaranteed)
    assert options.sample == "NA12878"
    child = sample_vcf.replace("NA12878", "NA12879")
    p1 = sample_vcf
    p2 = sample_vcf.replace("NA12878", "NA12877")

    out_base = tempfile.mkdtemp(prefix="callStats_", dir=".")
    try:
        if filter_xref is True:
            sys.stderr.write("Filtering {}\n".format(sample_vcf))
            filtered = []
            for vcf, filter_name in ((child, "child_filter.vcf"),
                                     (p1, "p1_filter.vcf"),
                                     (p2, "p2_filter.vcf")):
                # a missing input is passed through untouched
                if os.path.isfile(vcf):
                    filter_vcf = os.path.join(out_base, filter_name)
                    os.system("grep -v XREF {} > {}".format(vcf, filter_vcf))
                    vcf = filter_vcf
                filtered.append(vcf)
            child, p1, p2 = filtered

        trio_res = os.path.join(out_base, "ts.txt")
        ts = dict()
        try:
            ig = "-g" if ignore_genotype else ""
            cmd = "scripts/trioConcordance.py {} {} {} {} > {}".format(
                child, p1, p2, ig, trio_res)
            sys.stderr.write("\n{}\n".format(cmd))
            run(cmd)
            with open(trio_res) as f:
                toks = f.readline().split()
            res = toks[0:3]
            sys.stderr.write(" === {}\n".format(str(res)))
            ts["GOOD"] = int(res[0])
            ts["BAD"] = int(res[1])
            ts["RATIO"] = float(res[2])
        except Exception:
            # best effort: report the failure and return sentinel values
            sys.stderr.write("trio concordance failed for {} {} {}".format(child, p1, p2))
            ts = dict()
            ts["GOOD"] = -1
            ts["BAD"] = -1
            ts["RATIO"] = -1.0
    finally:
        # always drop the scratch directory, even on an unexpected exit
        os.system("rm -rf {}".format(out_base))
    return ts
def plot_kmer_comp(tsv_path, options):
    """Plot a kmer comparison table.

    Two plots are written under ``<comp_dir>/comp_plots``: a Jaccard
    boxplot built from table columns 1 and 2, and a recall/precision
    scatter plot built from columns 1, 4 and 3.  The header row of the
    table is skipped.  The region shown in the titles is the second-to-last
    dash-separated token of the table's base name.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)

    base_name = os.path.basename(os.path.splitext(tsv_path)[0])
    prefix = os.path.join(plot_dir, base_name)
    name_toks = base_name.split("-")
    region = name_toks[-2].upper()
    params = " ".join(PLOT_PARAMS)

    # jaccard boxplot
    jac_tsv = prefix + "_jac.tsv"
    jac_png = prefix + "_jac.png"
    jac_awk = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run("{} {} > {}".format(jac_awk, tsv_path, jac_tsv))
    boxplot_cmd = 'scripts/boxplot.py {} --save {} --title "{} KMER Set Jaccard" --x_label "Graph" --y_label "Jaccard Index" --x_sideways {}'
    run(boxplot_cmd.format(jac_tsv, jac_png, region, params))

    # precision recall scatter plot
    acc_tsv = prefix + "_acc.tsv"
    acc_png = prefix + "_acc.png"
    acc_awk = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run("{} {} > {}".format(acc_awk, tsv_path, acc_tsv))
    scatter_cmd = 'scripts/scatter.py {} --save {} --title "{} KMER Set Accuracy" --x_label "Recall" --y_label "Precision" --width 12 --height 9 --lines {}'
    run(scatter_cmd.format(acc_tsv, acc_png, region, params))
def plot_kmer_comp(tsv_path, options):
    """Make the Jaccard boxplot and recall/precision scatter for a kmer table.

    Output goes to ``<comp_dir>/comp_plots`` and is named after the input
    table.  The region used in the plot titles is the last dash-separated
    token of the table's base name, upper-cased.  The first row of the
    table is treated as a header and skipped by the awk extraction.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)

    stem = os.path.basename(os.path.splitext(tsv_path)[0])
    stem_path = os.path.join(plot_dir, stem)
    region = stem.split("-")[-1].upper()
    params = " ".join(PLOT_PARAMS)

    def extract(awk_prog, dest):
        """Run the awk column extraction on the input table into *dest*."""
        run("{} {} > {}".format(awk_prog, tsv_path, dest))

    # jaccard boxplot: label column plus column 2
    jac_tsv, jac_png = stem_path + "_jac.tsv", stem_path + "_jac.png"
    extract('''awk '{if (NR!=1) print $1 "\t" $2}' ''', jac_tsv)
    run("scripts/boxplot.py {} --save {} --title \"{} KMER Set Jaccard\" --x_label \"Graph\" --y_label \"Jaccard Index\" --x_sideways {}".format(
        jac_tsv, jac_png, region, params))

    # precision recall scatter plot: label column, then columns 4 and 3
    acc_tsv, acc_png = stem_path + "_acc.tsv", stem_path + "_acc.png"
    extract('''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' ''', acc_tsv)
    run("scripts/scatter.py {} --save {} --title \"{} KMER Set Accuracy\" --x_label \"Recall\" --y_label \"Precision\" --width 12 --height 9 --lines {}".format(
        acc_tsv, acc_png, region, params))
def sompy_stats(sample_vcf, truth_vcf, filter_xref, options):
    """Run som.py on a call set and return its TP/FP counts.

    (copied from computeVariantsDistances, mostly)

    Args:
        sample_vcf: path of the VCF being evaluated.
        truth_vcf: path of the truth VCF handed to som.py.
        filter_xref: when True, lines containing "XREF" are removed from
            ``sample_vcf`` (via grep) before the comparison.
        options: object providing ``chrom_fa_path`` (passed to ``-r``).

    Returns:
        dict with integer "SNP-TP", "SNP-FP", "INDEL-TP", "INDEL-FP",
        "TOTAL-TP" and "TOTAL-FP".  Missing SNV or indel rows count as 0.

    Raises:
        RuntimeError: if the stats file has no header row or no
            "records" row (previously this surfaced as a NameError).
    """
    out_base = tempfile.mkdtemp(prefix="callStats_", dir=".")
    rows = dict()
    tp_idx = fp_idx = None
    try:
        if filter_xref is True:
            filter_vcf = os.path.join(out_base, "filter.vcf")
            os.system("grep -v XREF {} > {}".format(sample_vcf, filter_vcf))
            sample_vcf = filter_vcf

        run("som.py {} {} -P -o {} -r {} > /dev/null".format(
            truth_vcf, sample_vcf, os.path.join(out_base, "sp_out"),
            options.chrom_fa_path), fail_hard=True)

        with open(os.path.join(out_base, "sp_out.stats.csv")) as sp_result:
            for line in sp_result:
                toks = line.split(",")
                if len(toks) < 2:
                    continue
                if toks[1] == "type":
                    # header row: locate the tp / fp columns
                    tp_idx = toks.index("tp")
                    fp_idx = toks.index("fp")
                elif toks[1] in ("indels", "SNVs", "records"):
                    rows[toks[1]] = toks
    finally:
        # remove the scratch directory whether or not som.py succeeded
        os.system("rm -rf {}".format(out_base))

    if tp_idx is None:
        raise RuntimeError("som.py stats file has no header row")
    if "records" not in rows:
        raise RuntimeError("som.py stats file has no 'records' row")

    def count(row_name, idx):
        """Return the integer in column *idx* of a row, 0 if the row is absent."""
        row = rows.get(row_name)
        return 0 if row is None else int(row[idx])

    ret = dict()
    # snp and indel rows are optional
    ret["SNP-TP"] = count("SNVs", tp_idx)
    ret["SNP-FP"] = count("SNVs", fp_idx)
    ret["INDEL-TP"] = count("indels", tp_idx)
    ret["INDEL-FP"] = count("indels", fp_idx)
    ret["TOTAL-TP"] = count("records", tp_idx)
    ret["TOTAL-FP"] = count("records", fp_idx)
    return ret
def do_mendel(mergetable, options):
    """Run ``rtg mendelian`` on every merged VCF and tabulate the results.

    ``mergetable`` maps ``region -> graph -> kind -> merged vcf``.  For each
    merged file the annotated, consistent and inconsistent VCFs plus the
    tool's stdout are written under ``<out_dir>/mendel/<region>/<graph>``.
    The value scraped from stdout by ``scrape_mendel`` is collected into
    ``<out_dir>/mendel-<region>.tsv`` with columns graph/all/snp/indel.
    Rows containing a None value are left out of the table.
    """
    columns = ["graph", "all", "snp", "indel"]
    sdf_path = os.path.join(options.comp_dir, "chrom.sdf")
    ped_path = os.path.join(options.out_dir, "predigree.ped")
    mendel_cmd = ("rtg mendelian -l -i {} -t {} --pedigree {} --output {} "
                  "--output-consistent {} --output-inconsistent {} > {}")

    for region, graph_map in mergetable.items():
        rows = []
        for graph, merged in graph_map.items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)

            scores = dict()
            for kind, merged_vcf in merged.items():
                annotated = os.path.join(annot_dir, "mendel_{}.vcf.gz".format(kind))
                consistent = os.path.join(annot_dir, "consistent_{}.vcf.gz".format(kind))
                inconsistent = os.path.join(annot_dir, "inconsistent_{}.vcf.gz".format(kind))
                stdout_path = os.path.join(annot_dir, "mendel_{}.stdout".format(kind))
                run(mendel_cmd.format(merged_vcf, sdf_path, ped_path, annotated,
                                      consistent, inconsistent, stdout_path))
                scores[kind] = scrape_mendel(stdout_path)

            rows.append([graph] + [scores[name] for name in columns[1:]])

        # write the tsv for this region
        region_tsv = os.path.join(options.out_dir, "mendel-{}.tsv".format(region))
        with open(region_tsv, "w") as out:
            out.write("\t".join(columns) + "\n")
            for row in rows:
                if None in row:
                    continue
                out.write("\t".join(str(cell) for cell in row) + "\n")
def do_mendel(mergetable, options):
    """Annotate all merged VCFs with ``rtg mendelian``; write one TSV per region.

    Input is the ``region -> graph -> kind -> vcf`` table.  Outputs for each
    graph land in ``<out_dir>/mendel/<region>/<graph>``.  The per-region
    summary ``<out_dir>/mendel-<region>.tsv`` holds, for every graph, the
    values ``scrape_mendel`` extracts for the "all", "snp" and "indel"
    kinds; graphs where any of these is None are not written.
    """

    def mendelian(mergefile, annot_dir, kind):
        """Run rtg mendelian for one merged VCF; return the scraped result."""
        def target(pattern):
            return os.path.join(annot_dir, pattern.format(kind))

        stdout_file = target("mendel_{}.stdout")
        run("rtg mendelian -l -i {} -t {} --pedigree {} --output {} --output-consistent {} --output-inconsistent {} > {}".format(
            mergefile,
            os.path.join(options.comp_dir, "chrom.sdf"),
            os.path.join(options.out_dir, "predigree.ped"),
            target("mendel_{}.vcf.gz"),
            target("consistent_{}.vcf.gz"),
            target("inconsistent_{}.vcf.gz"),
            stdout_file))
        return scrape_mendel(stdout_file)

    header = ["graph", "all", "snp", "indel"]
    for region in mergetable:
        table = []
        for graph, mergefiles in mergetable[region].items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)
            concordance = dict()
            for kind, mergefile in mergefiles.items():
                concordance[kind] = mendelian(mergefile, annot_dir, kind)
            table.append([graph,
                          concordance["all"],
                          concordance["snp"],
                          concordance["indel"]])

        # write the tsv for this region
        lines = ["\t".join(header) + "\n"]
        for row in table:
            if None not in row:
                lines.append("\t".join([str(s) for s in row]) + "\n")
        with open(os.path.join(options.out_dir, "mendel-{}.tsv".format(region)), "w") as f:
            for text in lines:
                f.write(text)
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall plots for each category of the vcf compare table.

    The table's base name is expected to end in ``-<region>-<sample>``.
    For every Precision/Recall column pair in ``vcf_dist_header(options)``
    whose category is TOT, SNP or INDEL, the recall, precision and quality
    columns are extracted with awk, then a scatter plot, a max-F1 bar
    chart, a max-F1 precision/recall scatter and a quality bar chart are
    drawn.  When ``options.top`` is True, zoomed variants are drawn too.

    Outputs go under ``<comp_dir>/comp_plots/<tag>/`` (all TSVs go to the
    ``tsv`` subdirectory).  Each plotting command is echoed to stdout
    before it is executed.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    name_toks = out_name.split("-")
    sample = name_toks[-1].upper()
    region = name_toks[-2].upper()
    params = " ".join(PLOT_PARAMS)

    def out_base_path(tag, label, extension):
        """Return (and create the directory for) an output path."""
        bd = tag if extension != ".tsv" else "tsv"
        ret = os.path.join(
            out_dir, bd,
            "-".join(name_toks[:-1]) + "-{}-{}-".format(sample, tag) + region
        ) + "_" + label + extension
        robust_makedirs(os.path.dirname(ret))
        return ret

    def scatter_cmd(tsv, png, title, lo=None, hi=None, width="18", height="9"):
        """Build a scatter.py command; lo/hi set both axis limits when given."""
        cmd = ("scripts/scatter.py {} --save {} --title \"{}\" --x_label \"Recall\""
               " --y_label \"Precision\" --width {} --height {} {} --lines --no_n"
               " --line_width 1.5 --marker_size 5").format(
                   tsv, png, title, width, height, params)
        if lo is not None:
            cmd += " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(lo, hi)
        return cmd

    def barchart_cmd(tsv, png, title, y_label, min_val=None):
        """Build a barchart.py command, optionally with a lower bound."""
        cmd = ("scripts/barchart.py {} --ascending --no_n --save {} --title \"{}\""
               " --x_sideways --x_label \"Graph\" --y_label \"{}\" {}").format(
                   tsv, png, title, y_label, params)
        if min_val is not None:
            cmd += " --min {}".format(min_val)
        return cmd

    def show_and_run(cmd):
        """Echo a shell command, then execute it."""
        print(cmd)
        os.system(cmd)

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # the quality column follows all precision/recall pairs
    qual_idx = len(header)
    # floor division keeps this an int on Python 3 as well
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates.
        # +1 again since header doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk \'{" + awkcmd + "}\'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        title = sample.upper() + " "
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        show_and_run(scatter_cmd(acc_tsv, acc_png, title, "-0.01", "1.01"))

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        show_and_run(barchart_cmd(f1_tsv, f1_png, title, "Max F1"))
        show_and_run(scatter_cmd(f1_pr_tsv, f1_pr_png, title))
        show_and_run(barchart_cmd(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1"))

        if options.top is True:
            # top 20
            show_and_run(scatter_cmd(
                acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002"))
            # top 20 (smaller inset figure)
            show_and_run(scatter_cmd(
                acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title,
                "0.796", "1.004", width="11", height="5.5"))
            # top 40
            show_and_run(scatter_cmd(
                acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004"))
            # top .5 / .6 / .7 / .85 bars
            for suffix, min_val in (("_top50.png", "0.5"), ("_top60.png", "0.6"),
                                    ("_top70.png", "0.7"), ("_top85.png", "0.85")):
                show_and_run(barchart_cmd(
                    f1_tsv, f1_png.replace(".png", suffix), title, "Max F1", min_val))
            # top .25 / .50 / .65 f1pr scatters
            for suffix, lo in (("_top25.png", "0.746"), ("_top50.png", "0.496"),
                               ("_top65.png", "0.646")):
                show_and_run(scatter_cmd(
                    f1_pr_tsv, f1_pr_png.replace(".png", suffix), title, lo, "1.004"))
def main(args):
    """Concatenate per-region VCFs into whole-sample VCFs.

    Two directory layouts are supported:

    * ``options.classic``: ``call_dir/<SAMPLE>/<REGION>.vcf``.  The five
      known region files of each sample are sorted, compressed and indexed,
      then concatenated into ``call_dir/<SAMPLE>/TOTAL.vcf``.
    * otherwise: ``call_dir/<region>/<graph>/<SAMPLE>_sample.vcf``.  For
      each graph present in every region and each sample present in every
      region of that graph, the region VCFs are concatenated into
      ``call_dir/<options.name>/<graph>/<SAMPLE>_sample.vcf``.

    Every concatenated VCF is sorted, bgzipped and tabix-indexed.

    Args:
        args: command line arguments, handed to ``parse_args``.

    Returns:
        0 on completion.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    def cat_sort_index(inputs, out_vcf):
        """vt-cat *inputs* into *out_vcf*, sort it in place, bgzip and tabix it."""
        run("vt cat {} > {}".format(" ".join(inputs), out_vcf), fail_hard=True)
        run("vcfsort {} > {}.sort".format(out_vcf, out_vcf), fail_hard=True)
        run("mv {}.sort {}".format(out_vcf, out_vcf), fail_hard=True)
        run("bgzip -c {} > {}.gz".format(out_vcf, out_vcf), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(out_vcf), fail_hard=True)

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        region_vcfs = ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf"]
        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if not os.path.isdir(sampledir):
                continue
            vcfs = []
            outfile = os.path.join(sampledir, "TOTAL.vcf")
            for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                if os.path.basename(vcf) not in region_vcfs:
                    continue
                run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
                run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
                run("rm -f {}.sort".format(vcf))
                run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
                vcfs.append("{}.gz".format(vcf))
            if vcfs:
                cat_sort_index(vcfs, outfile)
        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        if os.path.isdir(regiondir):
            region = os.path.basename(regiondir)
            # avoid crufty directories (including outputs of previous runs of this script)
            if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]:
                regions.add(region)
    print(regions)

    # count up graphs (that are present in every region)
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1
    graphs = set(graph for graph, count in gcount.items() if count == len(regions))
    print(graphs)

    # collect every sample name seen in any region / graph
    samples = set()
    for region in regions:
        for graph in graphs:
            pattern = os.path.join(options.call_dir, region, graph, "*_sample.vcf")
            for vcf in glob.glob(pattern):
                samples.add(os.path.basename(vcf).split("_")[0])
    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            vcf_files = []
            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph,
                                   "{}_sample.vcf".format(sample))
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph, "{}_sample.vcf".format(sample))

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge, then make an index just in case
            cat_sort_index(input_files, merge_vcf_path)

    return 0
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall plots for each category of the vcf compare table.

    The region is the last dash-separated token of the table's base name.
    For every Precision/Recall column pair in ``vcf_dist_header(options)``
    whose category is TOT, SNP or INDEL, the recall, precision and quality
    columns are extracted with awk into ``<comp_dir>/comp_plots``.  A
    scatter plot, a max-F1 bar chart, a max-F1 precision/recall scatter
    and a quality bar chart are then drawn.  With ``options.top`` set to
    True, three zoomed scatter plots are produced as well.

    Each plotting command is echoed to stdout before it is executed.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    out_base_path = os.path.join(out_dir, out_name)
    name_toks = out_name.split("-")
    region = name_toks[-1].upper()
    out_base_path_f1 = os.path.join(
        out_dir, "-".join(name_toks[:-1]) + "--f1-" + region)
    params = " ".join(PLOT_PARAMS)

    def scatter_cmd(tsv, png, title, lo, hi, width="18", height="9"):
        """Build a scatter.py command with both axes limited to [lo, hi]."""
        return ("scripts/scatter.py {} --save {} --title \"{}\" --x_label \"Recall\""
                " --y_label \"Precision\" --width {} --height {} {} --lines --no_n"
                " --line_width 1.5 --marker_size 5"
                " --min_x {} --max_x {} --min_y {} --max_y {}").format(
                    tsv, png, title, width, height, params, lo, hi, lo, hi)

    def barchart_cmd(tsv, png, title, y_label, extra=""):
        """Build a barchart.py command; *extra* is appended verbatim."""
        return ("scripts/barchart.py {} --save {} --title \"{}\" --x_sideways"
                " --x_label \"Graph\" --y_label \"{}\" {}").format(
                    tsv, png, title, y_label, params) + extra

    def show_and_run(cmd):
        """Echo a shell command, then execute it."""
        print(cmd)
        os.system(cmd)

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # the quality column follows all precision/recall pairs
    qual_idx = len(header)
    # floor division keeps this an int on Python 3 as well
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path + "_" + label + ".tsv"
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates.
        # +1 again since header doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk \'{" + awkcmd + "}\'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path + "_" + label + ".png"

        title = "VCF"
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat)
        title += " for {}".format(region)

        show_and_run(scatter_cmd(acc_tsv, acc_png, title, "-0.01", "1.01"))

        # flatten to max f1 tsv and plot as bars
        tail = "_" + label
        f1_tsv = out_base_path_f1 + tail + ".tsv"
        f1_png = out_base_path_f1 + tail + ".png"
        f1_pr_base = out_base_path_f1.replace("-f1-", "-f1--pr-")
        f1_pr_tsv = f1_pr_base + tail + ".tsv"
        f1_pr_png = f1_pr_base + tail + ".png"
        f1_qual_base = out_base_path_f1.replace("-f1-", "-f1-qual-")
        f1_qual_tsv = f1_qual_base + tail + ".tsv"
        f1_qual_png = f1_qual_base + tail + ".png"

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        show_and_run(barchart_cmd(f1_tsv, f1_png, title, "Max F1"))
        show_and_run(scatter_cmd(f1_pr_tsv, f1_pr_png, title, "-0.01", "1.01"))
        show_and_run(barchart_cmd(f1_qual_tsv, f1_qual_png, title,
                                  "Quality for Max F1", extra=" --max 20"))

        if options.top is True:
            # top 20
            show_and_run(scatter_cmd(
                acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002"))
            # top 20 (smaller inset figure)
            show_and_run(scatter_cmd(
                acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title,
                "0.796", "1.004", width="11", height="5.5"))
            # top 40
            show_and_run(scatter_cmd(
                acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004"))
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall plots for each category of the vcf compare table.

    The table's base name must end in ``-<region>-<sample>``.  Each
    TOT/SNP/INDEL Precision/Recall column pair named by
    ``vcf_dist_header(options)`` is extracted (with the quality column)
    into its own TSV and plotted as:

    * a precision/recall scatter (tag ``pr``),
    * a max-F1 bar chart (``f1bar``),
    * a max-F1 precision/recall scatter (``f1pr``),
    * a quality-at-max-F1 bar chart (``f1qual``).

    If ``options.top`` is True, zoomed versions of the scatter and bar
    plots are added.  PNGs go to ``<comp_dir>/comp_plots/<tag>/`` and TSVs
    to ``<comp_dir>/comp_plots/tsv/``.  Every plotting command is echoed to
    stdout before it runs.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    sample = out_name.split("-")[-1].upper()
    region = out_name.split("-")[-2].upper()
    name_prefix = "-".join(out_name.split("-")[:-1])

    def out_base_path(tag, label, extension):
        """Return an output path for tag/label, creating its directory."""
        bd = tag if extension != ".tsv" else "tsv"
        ret = (
            os.path.join(out_dir, bd, name_prefix + "-{}-{}-".format(sample, tag) + region)
            + "_" + label + extension
        )
        robust_makedirs(os.path.dirname(ret))
        return ret

    params = " ".join(PLOT_PARAMS)

    scatter_fmt = (
        'scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
        '--y_label "Precision" --width {} --height {} {} --lines --no_n '
        "--line_width 1.5 --marker_size 5"
    )
    limits_fmt = " --min_x {0} --max_x {1} --min_y {0} --max_y {1}"
    barchart_fmt = (
        'scripts/barchart.py {} --ascending --no_n --save {} --title "{}" '
        '--x_sideways --x_label "Graph" --y_label "{}" {}'
    )

    def scatter(tsv, png, title, limits=None, size=("18", "9")):
        """Build a scatter.py command; limits is an optional (lo, hi) pair."""
        cmd = scatter_fmt.format(tsv, png, title, size[0], size[1], params)
        if limits is not None:
            cmd += limits_fmt.format(limits[0], limits[1])
        return cmd

    def barchart(tsv, png, title, y_label, minimum=None):
        """Build a barchart.py command, optionally with a --min bound."""
        cmd = barchart_fmt.format(tsv, png, title, y_label, params)
        if minimum is not None:
            cmd += " --min {}".format(minimum)
        return cmd

    def echo_and_run(cmd):
        """Print a shell command and execute it."""
        print(cmd)
        os.system(cmd)

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # quality sits right after the last precision/recall pair
    qual_idx = len(header)
    # "//" so the pair count stays an int under Python 3
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates.
        # +1 again since header doesnt include row_label col
        awkcmd = """if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}""".format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        title = sample.upper() + " "
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        echo_and_run(scatter(acc_tsv, acc_png, title, ("-0.01", "1.01")))

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        echo_and_run(barchart(f1_tsv, f1_png, title, "Max F1"))
        echo_and_run(scatter(f1_pr_tsv, f1_pr_png, title))
        echo_and_run(barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1"))

        if options.top is True:
            def png_variant(png, suffix):
                return png.replace(".png", suffix)

            zoomed = [
                # top 20, its small inset version, and top 40
                scatter(acc_tsv, png_variant(acc_png, "_top20.png"), title,
                        ("0.798", "1.002")),
                scatter(acc_tsv, png_variant(acc_png, "_top20_inset.png"), title,
                        ("0.796", "1.004"), size=("11", "5.5")),
                scatter(acc_tsv, png_variant(acc_png, "_top40.png"), title,
                        ("0.596", "1.004")),
                # top .5 / .6 / .7 / .85 bars
                barchart(f1_tsv, png_variant(f1_png, "_top50.png"), title, "Max F1", "0.5"),
                barchart(f1_tsv, png_variant(f1_png, "_top60.png"), title, "Max F1", "0.6"),
                barchart(f1_tsv, png_variant(f1_png, "_top70.png"), title, "Max F1", "0.7"),
                barchart(f1_tsv, png_variant(f1_png, "_top85.png"), title, "Max F1", "0.85"),
                # top .25 / .50 / .65 f1pr scatters
                scatter(f1_pr_tsv, png_variant(f1_pr_png, "_top25.png"), title,
                        ("0.746", "1.004")),
                scatter(f1_pr_tsv, png_variant(f1_pr_png, "_top50.png"), title,
                        ("0.496", "1.004")),
                scatter(f1_pr_tsv, png_variant(f1_pr_png, "_top65.png"), title,
                        ("0.646", "1.004")),
            ]
            for cmd in zoomed:
                echo_and_run(cmd)
# Per-region VCF basenames that classic mode folds into TOTAL.vcf.
_CLASSIC_REGION_VCFS = ("BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf")

# Directory names under call_dir that are treated as regions in the default
# mode; anything else (including outputs of previous runs) is ignored.
_KNOWN_REGIONS = ("brca1", "brca2", "mhc", "lrc_kir", "sma")


def _sort_compress_index(vcf_path):
    """Sort vcf_path in place, then write vcf_path.gz and a tabix index for it."""
    run("vcfsort {} > {}.sort".format(vcf_path, vcf_path), fail_hard=True)
    run("mv {}.sort {}".format(vcf_path, vcf_path), fail_hard=True)
    # -c keeps the uncompressed file next to the .gz copy
    run("bgzip -c {} > {}.gz".format(vcf_path, vcf_path), fail_hard=True)
    run("tabix -f -p vcf {}.gz".format(vcf_path), fail_hard=True)


def _merge_classic(call_dir):
    """Build call_dir/SAMPLE/TOTAL.vcf from the call_dir/SAMPLE/<REGION>.vcf files."""
    for sampledir in glob.glob(os.path.join(call_dir, "*")):
        if not os.path.isdir(sampledir):
            continue

        # sort, compress and index each recognised region vcf
        vcfs = []
        for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
            if os.path.basename(vcf) not in _CLASSIC_REGION_VCFS:
                continue
            run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
            run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
            run("rm -f {}.sort".format(vcf))
            run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
            vcfs.append("{}.gz".format(vcf))

        # nothing to concatenate for this sample
        if not vcfs:
            continue

        outfile = os.path.join(sampledir, "TOTAL.vcf")
        run("vt cat {} > {}".format(" ".join(vcfs), outfile), fail_hard=True)
        _sort_compress_index(outfile)


def _find_regions(call_dir):
    """Return the set of known region names that exist as directories in call_dir."""
    regions = set()
    for regiondir in glob.glob(os.path.join(call_dir, "*")):
        region = os.path.basename(regiondir)
        # avoid crufty directories (including outputs of previous runs of this script)
        if os.path.isdir(regiondir) and region in _KNOWN_REGIONS:
            regions.add(region)
    return regions


def _find_shared_graphs(call_dir, regions):
    """Return the set of graph directory names present under every region."""
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1
    return set(graph for graph, count in gcount.items() if count == len(regions))


def _find_samples(call_dir, regions, graphs):
    """Return the set of sample names seen in any region/graph *_sample.vcf file."""
    samples = set()
    for region in regions:
        for graph in graphs:
            pattern = os.path.join(call_dir, region, graph, "*_sample.vcf")
            for vcf in glob.glob(pattern):
                # NOTE(review): the name is the text before the FIRST underscore,
                # so a sample name that itself contains "_" is truncated here and
                # will later be skipped as not spanning all regions - confirm
                # that sample names never contain underscores.
                samples.add(os.path.basename(vcf).split("_")[0])
    return samples


def _merge_by_graph(call_dir, name):
    """Write call_dir/name/GRAPH/SAMPLE_sample.vcf by merging the region vcfs."""
    regions = _find_regions(call_dir)
    print(regions)
    graphs = _find_shared_graphs(call_dir, regions)
    print(graphs)
    samples = _find_samples(call_dir, regions, graphs)
    print(samples)

    # make our output directory
    out_dir = os.path.join(call_dir, name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            sample_vcf = "{}_sample.vcf".format(sample)

            vcf_files = []
            for region in regions:
                vcf = os.path.join(call_dir, region, graph, sample_vcf)
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph, sample_vcf)

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge, then sort it and make an index just in case
            run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path),
                fail_hard=True)
            _sort_compress_index(merge_vcf_path)


def main(args):
    """Concatenate per-region call vcfs into one vcf per sample.

    Two input layouts are handled, selected by options.classic:

    * classic: call_dir/SAMPLE/<REGION>.vcf files are concatenated into
      call_dir/SAMPLE/TOTAL.vcf (plus TOTAL.vcf.gz and its tabix index).
    * default: call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf files are
      concatenated, for every graph found in all regions, into
      call_dir/<options.name>/<GRAPH>/<SAMPLE>_sample.vcf (plus .gz and
      index).  Samples missing from any region are reported and skipped.

    All the work is done by shelling out (vcfsort, bgzip, tabix, vt) through
    run(); paths are interpolated into the command strings unquoted.

    args -- command line arguments, handed to parse_args()
    Returns 0.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        _merge_classic(options.call_dir)
    else:
        # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf
        _merge_by_graph(options.call_dir, options.name)

    return 0