def plot_vcf_comp(tsv_path, options):
    """Make precision/recall plots from the big vcf compare table.

    For every Precision/Recall column pair in ``vcf_dist_header(options)``
    whose category is TOT, SNP or INDEL:

    * the row label, recall, precision and quality columns of ``tsv_path``
      are cut out with awk into a per-category tsv,
    * that tsv is drawn with scripts/scatter.py,
    * it is passed to ``make_max_f1_tsv`` and the resulting tsvs are drawn
      with scripts/barchart.py and scripts/scatter.py,
    * if ``options.top is True``, zoomed-in scatter plots are drawn too.

    All outputs are written below ``<options.comp_dir>/comp_plots``. The
    region shown in the plot titles is the last "-"-separated token of the
    input file's base name, upper-cased.

    NOTE(review): paths and titles are interpolated unquoted/unescaped into
    shell command lines, and the exit status of the plotting scripts is
    ignored.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    out_base_path = os.path.join(out_dir, out_name)
    name_toks = out_name.split("-")
    region = name_toks[-1].upper()
    name_prefix = "-".join(name_toks[:-1])
    params = " ".join(PLOT_PARAMS)

    def f1_base(tag):
        # Build the path from its parts instead of str.replace()-ing
        # "-f1-" in a finished path, which would also rewrite any "-f1-"
        # that happens to occur in the directory or file name prefix.
        return os.path.join(out_dir, name_prefix + "--" + tag + "-" + region)

    def plot(cmd):
        # echo the command line, then hand it to the shell
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, lo, hi, width="18", height="9"):
        # precision/recall scatter plot; both axes are clamped to [lo, hi]
        plot(
            (
                'scripts/scatter.py {} --save {} --title "{}"'
                ' --x_label "Recall" --y_label "Precision"'
                " --width {} --height {} {}"
                " --lines --no_n --line_width 1.5 --marker_size 5"
                " --min_x {} --max_x {} --min_y {} --max_y {}"
            ).format(tsv, png, title, width, height, params, lo, hi, lo, hi)
        )

    def barchart(tsv, png, title, y_label, extra=""):
        # bar chart with one bar per graph; extra is appended verbatim
        plot(
            (
                'scripts/barchart.py {} --save {} --title "{}"'
                ' --x_sideways --x_label "Graph" --y_label "{}" {}{}'
            ).format(tsv, png, title, y_label, params, extra)
        )

    # precision recall scatter plot
    # strip qual
    header = vcf_dist_header(options)[:-1]
    # the stripped qual column comes right after the last remaining column
    qual_idx = len(header)
    # floor division: same result as "/" on Python 2, still an int on Python 3
    for pair in range(len(header) // 2):
        prec_idx = 2 * pair
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx],
                                   rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue

        label = header[prec_idx].replace("Precision", "acc")
        acc_base = out_base_path + "_" + label
        acc_tsv = acc_base + ".tsv"
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header
        # doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))

        title = "VCF"
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat)
        title += " for {}".format(region)

        scatter(acc_tsv, acc_base + ".png", title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_stem = f1_base("f1") + "_" + label
        f1_pr_stem = f1_base("f1--pr") + "_" + label
        f1_qual_stem = f1_base("f1-qual") + "_" + label
        f1_tsv = f1_stem + ".tsv"
        f1_pr_tsv = f1_pr_stem + ".tsv"
        f1_qual_tsv = f1_qual_stem + ".tsv"
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)

        barchart(f1_tsv, f1_stem + ".png", title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_stem + ".png", title, "-0.01", "1.01")
        barchart(f1_qual_tsv, f1_qual_stem + ".png", title,
                 "Quality for Max F1", extra=" --max 20")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_base + "_top20.png", title, "0.798", "1.002")
            # top 20 (smaller figure, intended as an inset)
            scatter(acc_tsv, acc_base + "_top20_inset.png", title,
                    "0.796", "1.004", width="11", height="5.5")
            # top 40
            scatter(acc_tsv, acc_base + "_top40.png", title, "0.596", "1.004")
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall plots from the big vcf compare table.

    For every Precision/Recall column pair in ``vcf_dist_header(options)``
    whose category is TOT, SNP or INDEL:

    * the row label, recall, precision and quality columns of ``tsv_path``
      are cut out with awk into a per-category tsv,
    * that tsv is drawn with scripts/scatter.py,
    * it is passed to ``make_max_f1_tsv`` and the resulting tsvs are drawn
      with scripts/barchart.py and scripts/scatter.py,
    * if ``options.top is True``, zoomed-in variants of the scatter plots
      and bar charts are drawn too.

    Outputs go below ``<options.comp_dir>/comp_plots``: every tsv into a
    shared ``tsv`` sub-directory, every image into a sub-directory named
    after its plot tag. The base name of ``tsv_path`` is split on "-"; its
    last token (upper-cased) is used as the sample and the one before it as
    the region.

    NOTE(review): paths and titles are interpolated unquoted/unescaped into
    shell command lines, and the exit status of the plotting scripts is
    ignored.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    name_toks = out_name.split("-")
    sample = name_toks[-1].upper()
    region = name_toks[-2].upper()
    name_prefix = "-".join(name_toks[:-1])
    params = " ".join(PLOT_PARAMS)

    def out_base_path(tag, label, extension):
        # tsvs share one "tsv" directory; anything else goes in a directory
        # named after the tag. The directory is created on demand.
        bd = tag if extension != ".tsv" else "tsv"
        fname = "{}-{}-{}-{}_{}{}".format(
            name_prefix, sample, tag, region, label, extension)
        ret = os.path.join(out_dir, bd, fname)
        robust_makedirs(os.path.dirname(ret))
        return ret

    def zoomed(png, suffix):
        # Insert suffix before the extension. splitext only touches the
        # final extension, unlike str.replace(".png", ...) which would also
        # rewrite a ".png" occurring earlier in the path.
        root, ext = os.path.splitext(png)
        return root + suffix + ext

    def plot(cmd):
        # echo the command line, then hand it to the shell
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, lo=None, hi=None, width="18", height="9"):
        # precision/recall scatter plot; when lo/hi are given both axes are
        # clamped to [lo, hi], otherwise no axis limits are passed
        cmd = (
            'scripts/scatter.py {} --save {} --title "{}"'
            ' --x_label "Recall" --y_label "Precision"'
            " --width {} --height {} {}"
            " --lines --no_n --line_width 1.5 --marker_size 5"
        ).format(tsv, png, title, width, height, params)
        if lo is not None:
            cmd += " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(
                lo, hi)
        plot(cmd)

    def barchart(tsv, png, title, y_label, extra=""):
        # bar chart with one bar per graph; extra is appended verbatim
        plot(
            (
                'scripts/barchart.py {} --ascending --no_n --save {}'
                ' --title "{}" --x_sideways --x_label "Graph"'
                ' --y_label "{}" {}{}'
            ).format(tsv, png, title, y_label, params, extra)
        )

    # precision recall scatter plot
    # strip qual
    header = vcf_dist_header(options)[:-1]
    # the stripped qual column comes right after the last remaining column
    qual_idx = len(header)
    # floor division: same result as "/" on Python 2, still an int on Python 3
    for pair in range(len(header) // 2):
        prec_idx = 2 * pair
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx],
                                   rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue

        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header
        # doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        # sample is already upper-cased; join the parts with a single space
        # (the pieces used to be glued together with two)
        if comp_cat == "TOT":
            title = sample + " Total Accuracy"
        else:
            title = sample + " {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)

        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title)
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1")

        if options.top is True:
            # top 20
            scatter(acc_tsv, zoomed(acc_png, "_top20"), title,
                    "0.798", "1.002")
            # top 20 (smaller figure, intended as an inset)
            scatter(acc_tsv, zoomed(acc_png, "_top20_inset"), title,
                    "0.796", "1.004", width="11", height="5.5")
            # top 40
            scatter(acc_tsv, zoomed(acc_png, "_top40"), title,
                    "0.596", "1.004")
            # top .5 bar
            barchart(f1_tsv, zoomed(f1_png, "_top50"), title, "Max F1",
                     extra=" --min 0.5")
            # top .6 bar
            barchart(f1_tsv, zoomed(f1_png, "_top60"), title, "Max F1",
                     extra=" --min 0.6")
            # top .7 bar
            barchart(f1_tsv, zoomed(f1_png, "_top70"), title, "Max F1",
                     extra=" --min 0.7")
            # top .85 bar
            barchart(f1_tsv, zoomed(f1_png, "_top85"), title, "Max F1",
                     extra=" --min 0.85")
            # top .25 f1pr scatter
            scatter(f1_pr_tsv, zoomed(f1_pr_png, "_top25"), title,
                    "0.746", "1.004")
            # top .50 f1pr scatter
            scatter(f1_pr_tsv, zoomed(f1_pr_png, "_top50"), title,
                    "0.496", "1.004")
            # top .65 f1pr scatter
            scatter(f1_pr_tsv, zoomed(f1_pr_png, "_top65"), title,
                    "0.646", "1.004")
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall plots from the big vcf compare table.

    The header returned by ``vcf_dist_header(options)`` is walked as
    Precision/Recall column pairs. For each pair whose category is TOT, SNP
    or INDEL this function

    * extracts the row label, recall, precision and quality columns of
      ``tsv_path`` with awk into a per-category tsv,
    * draws that tsv with scripts/scatter.py,
    * calls ``make_max_f1_tsv`` on it and draws the three tsvs it is given
      with scripts/barchart.py and scripts/scatter.py,
    * when ``options.top is True``, also draws zoomed-in variants of the
      scatter plots and of the max-F1 bar chart.

    Everything is written below ``<options.comp_dir>/comp_plots``; tsvs go
    into a shared ``tsv`` sub-directory and images into a sub-directory per
    plot tag. The sample and region used in file names and titles are the
    last and second-to-last "-"-separated tokens of the input file's base
    name, upper-cased.

    NOTE(review): paths and titles are interpolated unquoted/unescaped into
    shell command lines, and the exit status of the plotting scripts is
    ignored.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    name_toks = out_name.split("-")
    sample = name_toks[-1].upper()
    region = name_toks[-2].upper()
    name_prefix = "-".join(name_toks[:-1])
    params = " ".join(PLOT_PARAMS)

    def out_base_path(tag, label, extension):
        # tsvs share one "tsv" directory; anything else goes in a directory
        # named after the tag. The directory is created on demand.
        bd = "tsv" if extension == ".tsv" else tag
        fname = "{}-{}-{}-{}_{}{}".format(
            name_prefix, sample, tag, region, label, extension)
        ret = os.path.join(out_dir, bd, fname)
        robust_makedirs(os.path.dirname(ret))
        return ret

    def zoomed(png, suffix):
        # Insert suffix before the extension. splitext only touches the
        # final extension, unlike str.replace(".png", ...) which would also
        # rewrite a ".png" occurring earlier in the path.
        root, ext = os.path.splitext(png)
        return root + suffix + ext

    def plot(cmd):
        # echo the command line, then hand it to the shell
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, lo=None, hi=None, width="18", height="9"):
        # precision/recall scatter plot; when lo/hi are given both axes are
        # clamped to [lo, hi], otherwise no axis limits are passed
        cmd = (
            'scripts/scatter.py {} --save {} --title "{}"'
            ' --x_label "Recall" --y_label "Precision"'
            " --width {} --height {} {}"
            " --lines --no_n --line_width 1.5 --marker_size 5"
        ).format(tsv, png, title, width, height, params)
        if lo is not None:
            cmd += " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(
                lo, hi)
        plot(cmd)

    def barchart(tsv, png, title, y_label, extra=""):
        # bar chart with one bar per graph; extra is appended verbatim
        plot(
            (
                'scripts/barchart.py {} --ascending --no_n --save {}'
                ' --title "{}" --x_sideways --x_label "Graph"'
                ' --y_label "{}" {}{}'
            ).format(tsv, png, title, y_label, params, extra)
        )

    # Zoomed views drawn when options.top is set. Numbers are kept as
    # strings so they reach the command line exactly as written here.
    # (file suffix, axis min, axis max, width, height)
    acc_zooms = (
        ("_top20", "0.798", "1.002", "18", "9"),
        ("_top20_inset", "0.796", "1.004", "11", "5.5"),
        ("_top40", "0.596", "1.004", "18", "9"),
    )
    # (file suffix, value passed to --min)
    f1_bar_zooms = (
        ("_top50", "0.5"),
        ("_top60", "0.6"),
        ("_top70", "0.7"),
        ("_top85", "0.85"),
    )
    # (file suffix, axis min); axis max is always 1.004
    f1_pr_zooms = (
        ("_top25", "0.746"),
        ("_top50", "0.496"),
        ("_top65", "0.646"),
    )

    # precision recall scatter plot
    # strip qual
    header = vcf_dist_header(options)[:-1]
    # the stripped qual column comes right after the last remaining column
    qual_idx = len(header)
    # floor division: same result as "/" on Python 2, still an int on Python 3
    for pair in range(len(header) // 2):
        prec_idx = 2 * pair
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx],
                                   rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue

        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header
        # doesnt include row_label col
        awkcmd = """if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}""".format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        # sample is already upper-cased; join the parts with a single space
        # (the pieces used to be glued together with two)
        category = "Total" if comp_cat == "TOT" else comp_cat.title()
        where = "all regions" if region == "TOTAL" else region
        title = "{} {} Accuracy, {}".format(sample, category, where)

        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)

        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title)
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1")

        if options.top is True:
            for suffix, lo, hi, width, height in acc_zooms:
                scatter(acc_tsv, zoomed(acc_png, suffix), title, lo, hi,
                        width=width, height=height)
            for suffix, floor in f1_bar_zooms:
                barchart(f1_tsv, zoomed(f1_png, suffix), title, "Max F1",
                         extra=" --min " + floor)
            for suffix, lo in f1_pr_zooms:
                scatter(f1_pr_tsv, zoomed(f1_pr_png, suffix), title,
                        lo, "1.004")