def quantify(args):
    """ Run quantify and write tables """
    vcf_name = args.in_vcf[0]

    if not vcf_name or not os.path.exists(vcf_name):
        raise Exception("Cannot read input VCF.")

    logging.info("Counting variants...")

    truth_or_query_is_bcf = False
    try:
        truth_or_query_is_bcf = args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf")
    except:
        # args.vcf1 and args.vcf2 are only available when we're running
        # inside hap.py.
        pass

    if args.bcf or truth_or_query_is_bcf:
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    output_vcf = args.reports_prefix + internal_format_suffix
    roc_table = args.reports_prefix + ".roc.tsv"

    qfyregions = {}

    if args.fp_bedfile:
        if not os.path.exists(args.fp_bedfile):
            raise Exception("FP / confident region file not found at %s" % args.fp_bedfile)
        qfyregions["CONF"] = args.fp_bedfile

    if args.strat_tsv:
        with open(args.strat_tsv) as sf:
            for l in sf:
                n, _, f = l.strip().partition("\t")
                if n in qfyregions:
                    raise Exception("Duplicate stratification region ID: %s" % n)
                if not f:
                    if n:
                        raise Exception("No file for stratification region %s" % n)
                    else:
                        continue
                if not os.path.exists(f):
                    f = os.path.join(os.path.abspath(os.path.dirname(args.strat_tsv)), f)
                if not os.path.exists(f):
                    raise Exception("Quantification region file %s not found" % f)
                qfyregions[n] = f

    if args.strat_regions:
        for r in args.strat_regions:
            n, _, f = r.partition(":")
            if not os.path.exists(f):
                raise Exception("Quantification region file %s not found" % f)
            qfyregions[n] = f

    if vcf_name == output_vcf or vcf_name == output_vcf + internal_format_suffix:
        raise Exception("Cannot overwrite input VCF: %s would be overwritten with output name %s." %
                        (vcf_name, output_vcf))

    roc_header = args.roc
    try:
        roc_header = args.roc_header
    except:
        pass

    Haplo.quantify.run_quantify(vcf_name,
                                roc_table,
                                output_vcf if args.write_vcf else False,
                                qfyregions,
                                args.ref,
                                threads=args.threads,
                                output_vtc=args.output_vtc,
                                output_rocs=args.do_roc,
                                qtype=args.type,
                                roc_val=args.roc,
                                roc_header=roc_header,
                                roc_filter=args.roc_filter,
                                roc_delta=args.roc_delta,
                                roc_regions=args.roc_regions,
                                clean_info=not args.preserve_info,
                                strat_fixchr=args.strat_fixchr)

    metrics_output = makeMetricsObject("%s.comparison" % args.runner)

    filter_handling = None
    try:
        if args.engine == "vcfeval" or not args.usefiltered:
            filter_handling = "ALL" if args.usefiltered else "PASS"
    except AttributeError:
        # if we run this through qfy, these arguments are not present
        pass

    total_region_size = None
    headers = Tools.vcfextract.extractHeadersJSON(vcf_name)
    try:
        contigs_to_use = ",".join(headers["tabix"]["chromosomes"])
        contig_lengths = fastasize.fastaNonNContigLengths(args.ref)
        total_region_size = fastasize.calculateLength(contig_lengths, contigs_to_use)
        logging.info("Subset.Size for * is %i, based on these contigs: %s " %
                     (total_region_size, str(contigs_to_use)))
    except:
        pass

    res = Haplo.happyroc.roc(roc_table, args.reports_prefix + ".roc",
                             filter_handling=filter_handling,
                             ci_alpha=args.ci_alpha,
                             total_region_size=total_region_size)

    df = res["all"]

    # only use summary numbers
    df = df[(df["QQ"] == "*") & (df["Filter"].isin(["ALL", "PASS"]))]

    summary_columns = ["Type",
                       "Filter",
                       ]

    for additional_column in ["TRUTH.TOTAL",
                              "TRUTH.TP",
                              "TRUTH.FN",
                              "QUERY.TOTAL",
                              "QUERY.FP",
                              "QUERY.UNK",
                              "FP.gt",
                              "FP.al",
                              "METRIC.Recall",
                              "METRIC.Precision",
                              "METRIC.Frac_NA",
                              "METRIC.F1_Score",
                              "TRUTH.TOTAL.TiTv_ratio",
                              "QUERY.TOTAL.TiTv_ratio",
                              "TRUTH.TOTAL.het_hom_ratio",
                              "QUERY.TOTAL.het_hom_ratio"]:
        summary_columns.append(additional_column)

    # Remove subtype
    summary_df = df[(df["Subtype"] == "*") & (df["Genotype"] == "*") & (df["Subset"] == "*")]

    summary_df[summary_columns].to_csv(args.reports_prefix + ".summary.csv", index=False)

    metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                                             summary_df[summary_columns]))

    if args.write_counts:
        df.to_csv(args.reports_prefix + ".extended.csv", index=False)
        metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

    essential_numbers = summary_df[summary_columns]

    pandas.set_option('display.max_columns', 500)
    pandas.set_option('display.width', 1000)

    essential_numbers = essential_numbers[essential_numbers["Type"].isin(["SNP", "INDEL"])]

    logging.info("\n" + essential_numbers.to_string(index=False))

    # in default mode, print result summary to stdout
    if not args.quiet and not args.verbose:
        print "Benchmarking Summary:"
        print essential_numbers.to_string(index=False)

    # keep this for verbose output
    if not args.verbose:
        try:
            os.unlink(roc_table)
        except:
            pass

    for t in res.iterkeys():
        metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, res[t]))

    # gzip JSON output
    if args.write_json:
        with gzip.open(args.reports_prefix + ".metrics.json.gz", "w") as fp:
            json.dump(metrics_output, fp)
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, '
                             'chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="fp_bedfile",
                        default=None, type=str,
                        help="False positive / confident call regions (.bed or .bed.gz).")

    parser.add_argument("-r", "--reference", dest="ref", default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix", default=None,
                        help="Filename prefix for report output.")

    parser.add_argument("-V", "--write-vcf", dest="write_vcf", default=False, action="store_true",
                        help="Write an annotated VCF.")

    parser.add_argument("-B", "--write-bed", dest="write_bed", default=False, action="store_true",
                        help="Write a bed file with the haplotype blocks that were used.")

    parser.add_argument("-X", "--write-counts", dest="write_counts", default=True, action="store_true",
                        help="Write advanced counts and metrics.")

    parser.add_argument("--no-write-counts", dest="write_counts", default=True, action="store_false",
                        help="Do not write advanced counts and metrics.")

    parser.add_argument("--raw-counts", dest="raw_counts", default=False, action="store_true",
                        help="Count variants in unprocessed input VCFs and output as TOTAL.*.RAW.")

    parser.add_argument("--roc", dest="roc", default=False,
                        help="Select an INFO feature to produce a ROC on. This works best with "
                             "--no-internal-preprocessing and --no-internal-leftshift since these "
                             "flags preserve the most INFO flags from the input files.")

    parser.add_argument("--roc-filter", dest="roc_filter", default=False,
                        help="Select a filter to ignore when making ROCs.")

    parser.add_argument("--roc-reversed", dest="roc_reversed", default=False,
                        help="Change the meaning of the ROC feature to count the other way around "
                             "(higher values=bad).")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false",
                        help="Keep scratch files instead of deleting them after the run.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             "preprocessing to be switched on).")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Disable chr replacement for query (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Do not give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Enable xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Enable xcmp's internal VCF preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF preprocessing.")

    parser.add_argument("--match-raw", dest="int_match_raw", action="store_true", default=False,
                        help="Add a matching step in xcmp which also matches raw variant calls. This helps"
                             " when comparing files with very different representations.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given "
                             "locations, this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window", default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different "
                             "haplotype blocks.")

    parser.add_argument("--enumeration-threshold", dest="max_enum", default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand", default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")

    parser.add_argument("--threads", dest="threads", default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine", default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")

    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment).")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]

    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(0)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True
    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" %
                 ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                  "splitting" if args.int_preprocessing else "raw calls",
                  "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o ")

        if not os.path.exists(os.path.dirname(args.reports_prefix)):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save "
                         "time here." % args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save "
                         "time here." % args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = False
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            refids_found = 0
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower() and refid in v1_ref.lower() and refid in v2_ref.lower():
                    if args.verbose:
                        logging.info("Reference matches pattern: %s" % refid)
                    refids_found += 1
            if refids_found == 1:
                ref_check = True
        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         "accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes (chr1-22, chrX, chrY, as documented for --location)
            args.locations = ["chr" + x for x in map(str, range(1, 23)) + ["X", "Y"]]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,  # pass_only
                          args.fixchr_truth,           # chrprefix
                          args.preprocessing_norm,     # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          not args.usefiltered,     # pass_only
                          args.fixchr_query,        # chrprefix
                          args.preprocessing_norm,  # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Query file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s"
                            % str(args.locations))

        args.locations = newlocations

        if args.threads > 1:
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth input file")

        if args.raw_counts:
            counts_truth = Haplo.quantify.run_quantify(args.vcf1,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h1["samples"][0],
                                                       locations=args.locations)
        else:
            counts_truth = None

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query input file")

        if args.raw_counts:
            counts_query = Haplo.quantify.run_quantify(args.vcf2,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h2["samples"][0],
                                                       locations=args.locations)
        else:
            counts_query = None

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]  # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]  # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            bedfiles = [x[1] for x in res if x is not None and x[1] is not None]
            if args.write_bed and bedfiles:
                runme = " ".join(["cat"] +
                                 bedfiles +
                                 [">", args.reports_prefix.replace(" ", "\\ ") + ".blocks.bed"])
                logging.info("Concatenating block files: %s..." % runme)
                subprocess.check_call(runme, shell=True)

            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.write_counts:
            json_name = args.reports_prefix + ".counts.json"
        else:
            tf = tempfile.NamedTemporaryFile(delete=False,
                                             dir=args.scratch_prefix,
                                             prefix="counts.",
                                             suffix=".json")
            tf.close()
            json_name = tf.name

        logging.info("Counting variants...")

        counts = Haplo.quantify.run_quantify(output_name,
                                             json_name,
                                             args.reports_prefix + ".vcf.gz" if args.write_vcf else False,
                                             {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                             args.ref)

        df = pandas.DataFrame(counts)
        if args.write_counts:
            df.to_csv(args.reports_prefix + ".counts.csv")

        metrics_output = makeMetricsObject("hap.py.comparison")

        if args.write_counts:
            metrics_output["metrics"].append(dataframeToMetricsTable("raw.counts", df))

        # calculate precision / recall
        count_types = []

        if args.raw_counts:
            simplified_truth_counts = Haplo.quantify.simplify_counts(counts_truth, h1["samples"][0:1])
            simplified_query_counts = Haplo.quantify.simplify_counts(counts_query, h2["samples"][0:1])

            count_types += simplified_truth_counts.keys()
            count_types += simplified_query_counts.keys()
        else:
            simplified_truth_counts = None
            simplified_query_counts = None

        simplified_numbers = Haplo.quantify.simplify_counts(counts)

        count_types += simplified_numbers.keys()
        count_types = sorted(list(set(count_types)))

        for vtype in count_types:
            if vtype not in simplified_numbers:
                simplified_numbers[vtype] = {}

            simplified_numbers[vtype]["METRIC.Recall"] = 0
            simplified_numbers[vtype]["METRIC.Recall2"] = 0
            simplified_numbers[vtype]["METRIC.Precision"] = 0
            simplified_numbers[vtype]["METRIC.Frac_NA"] = 0

            try:
                simplified_numbers[vtype]["METRIC.Recall"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Recall2"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Precision"] = \
                    float(simplified_numbers[vtype]["QUERY.TP"]) / \
                    float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Frac_NA"] = \
                    float(simplified_numbers[vtype]["QUERY.UNK"]) / \
                    float(simplified_numbers[vtype]["QUERY.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["TRUTH.TOTAL.RAW"] = \
                    simplified_truth_counts[vtype][h1["samples"][0] + ".TOTAL"]
            except:
                pass

            try:
                simplified_numbers[vtype]["QUERY.TOTAL.RAW"] = \
                    simplified_query_counts[vtype][h2["samples"][0] + ".TOTAL"]
            except:
                pass

        pandas.set_option("display.width", 120)
        pandas.set_option("display.max_columns", 1000)
        df = pandas.DataFrame(simplified_numbers).transpose()

        vstring = "hap.py-%s" % Tools.version
        # separate the version tag from the command line
        vstring += " " + " ".join(sys.argv)

        df.loc[vstring] = 0

        # for x in df:
        #     # everything not a metric is a count
        #     if not x.startswith("METRIC"):
        #         df[x] = df[x].astype("int64")

        df[["TRUTH.TOTAL", "QUERY.TOTAL",
            "METRIC.Recall", "METRIC.Precision",
            "METRIC.Frac_NA"]].to_csv(args.reports_prefix + ".summary.csv")

        metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                                                 df[["TRUTH.TOTAL", "QUERY.TOTAL",
                                                                     "METRIC.Recall", "METRIC.Precision",
                                                                     "METRIC.Frac_NA"]]))

        if args.write_counts:
            df.to_csv(args.reports_prefix + ".extended.csv")
            metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

        essential_numbers = df[["TRUTH.TOTAL", "QUERY.TOTAL",
                                "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA"]]

        pandas.set_option('display.max_columns', 500)
        pandas.set_option('display.width', 1000)

        essential_numbers = essential_numbers[essential_numbers.index.isin(["Locations.SNP", "Locations.INDEL"])]

        logging.info("\n" + str(essential_numbers))

        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "Benchmarking Summary:"
            print str(essential_numbers)

        if args.roc:
            vcf = args.reports_prefix + ".vcf.gz"
            res = Haplo.happyroc.roc(vcf, args.roc, args.roc_filter, args.reports_prefix + ".roc", args.roc_reversed)
            for t in res.iterkeys():
                rocdf = pandas.read_table(res[t])
                metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, rocdf))

        with open(args.reports_prefix + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)
    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
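
# --- illustrative sketch, not part of the original tool ----------------------
# main() above converts the BED records produced by blocksplit into region
# strings, adding 1 to the start coordinate because BED intervals are 0-based
# and half-open while tabix-style "chrom:start-end" regions are 1-based and
# inclusive. The standalone helper below isolates that conversion; the
# function name is hypothetical and the logic mirrors the loop in main().
def _bed_to_locations(bed_lines):
    """Convert BED lines (0-based, half-open) into the 1-based, inclusive
    "chrom:start-end" location strings used by the comparison engines."""
    locations = []
    for l in bed_lines:
        ll = l.strip().split("\t", 3)
        if len(ll) < 3:
            # skip malformed / empty records, as main() does
            continue
        locations.append("%s:%i-%i" % (ll[0], int(ll[1]) + 1, int(ll[2])))
    return locations

# Example: _bed_to_locations(["chr1\t99\t200\n"]) would return ["chr1:100-200"].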
def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        help="Output file prefix for statistics and feature table (when selected)",
    )

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument(
        "-R",
        "--restrict-regions",
        dest="regions_bedfile",
        default=None,
        type=str,
        help="Restrict analysis to given (sparse) regions (using -R in bcftools).",
    )

    parser.add_argument(
        "-T",
        "--target-regions",
        dest="targets_bedfile",
        default=None,
        type=str,
        help="Restrict analysis to given (dense) regions (using -T in bcftools).",
    )

    parser.add_argument(
        "-f", "--false-positives", dest="FP",
        help="False-positive region bed file to distinguish UNK from FP",
    )

    parser.add_argument(
        "-a",
        "--ambiguous",
        dest="ambi",
        action="append",
        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
             "in some replicates)",
    )

    parser.add_argument(
        "--ambiguous-fp",
        dest="ambi_fp",
        action="store_true",
        default=False,
        help="Use FP calls from ambiguous region files also.",
    )

    parser.add_argument(
        "-e",
        "--explain_ambiguous",
        dest="explain_ambiguous",
        required=False,
        default=False,
        action="store_true",
        help="Print a table giving the number of ambiguous events per category",
    )

    parser.add_argument(
        "-r", "--reference", dest="ref", default=Tools.defaultReference(),
        help="Specify a reference file.",
    )

    parser.add_argument(
        "--scratch-prefix", dest="scratch_prefix", default=None,
        help="Filename prefix for scratch report output.",
    )

    parser.add_argument(
        "--keep-scratch",
        dest="delete_scratch",
        default=True,
        action="store_false",
        help="Keep scratch files instead of deleting them after the run.",
    )

    parser.add_argument(
        "--continue",
        dest="cont",
        default=False,
        action="store_true",
        help="Continue from scratch space (i.e. use VCFs in there if they already exist).",
    )

    parser.add_argument(
        "-P",
        "--include-nonpass",
        dest="inc_nonpass",
        action="store_true",
        default=False,
        help="Use to include failing variants in comparison.",
    )

    parser.add_argument(
        "--feature-table",
        dest="features",
        default=False,
        choices=Somatic.FeatureSet.sets.keys(),
        help="Select a feature table to output.",
    )

    parser.add_argument(
        "--bam",
        dest="bams",
        default=[],
        action="append",
        help="Pass one or more BAM files for feature table extraction",
    )

    parser.add_argument(
        "--normalize-truth",
        dest="normalize_truth",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the truth file.",
    )

    parser.add_argument(
        "--normalize-query",
        dest="normalize_query",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the query file.",
    )

    parser.add_argument(
        "-N",
        "--normalize-all",
        dest="normalize_all",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on both truth and query file.",
    )

    parser.add_argument(
        "--fix-chr-query",
        dest="fixchr_query",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the query by chr*-type names",
    )

    parser.add_argument(
        "--fix-chr-truth",
        dest="fixchr_truth",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the truth by chr*-type names",
    )

    parser.add_argument(
        "--no-order-check",
        dest="disable_order_check",
        default=False,
        action="store_true",
        help="Disable checking the order of TP features (dev feature).",
    )

    parser.add_argument(
        "--roc",
        dest="roc",
        default=None,
        choices=ROC.list(),
        help="Create a ROC-style table. This is caller specific "
             "- this will override the --feature-table switch!",
    )

    parser.add_argument(
        "--logfile", dest="logfile", default=None,
        help="Write logging information into file rather than to stderr",
    )

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.",
    )

    verbosity_options.add_argument(
        "--quiet", dest="quiet", default=False, action="store_true",
        help="Set logging level to output errors only.",
    )

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile, format="%(asctime)s %(levelname)-8s %(message)s", level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)

    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()
            md = {}
            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth,
                ntpath,
                args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref,
            )
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query,
                nqpath,
                args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref,
            )
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all(
            [
                os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz")),
            ]
        )

        # check the tpfn_r outputs (query/truth swapped), not tpfn again
        tpfn_r_files = all(
            [
                os.path.exists(os.path.join(scratch, "tpfn_r", "0000.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn_r", "0001.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn_r", "0002.vcf.gz")),
            ]
        )

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4])

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP")

        has_fp = (fpclasses.count("FP") > 0) or (fpclasses.count("fp") > 0 and args.ambi_fp)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == "#":
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1

                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not has_fp:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        res = res[(res["total.truth"] > 0) | (res["total.query"] > 0)]

        # use this to use plain row counts rather than stratified bcftools counts
        # truthcounts = countVCFRows(ntpath)  # , "total.truth")
        # querycounts = countVCFRows(nqpath)  # , "total.query")
        #
        # tpcounts = countVCFRows(os.path.join(scratch, "tpfn", "0002.vcf.gz"))  # , "tp")
        # fncounts = countVCFRows(os.path.join(scratch, "tpfn", "0000.vcf.gz"))  # , "fn")
        # fpcounts = countVCFRows(fppath)  # , "fp")
        # ambicounts = countVCFRows(ambipath)  # , "ambi")
        # unkcounts = countVCFRows(unkpath)  # , "unk")
        #
        # res = pandas.DataFrame({
        #     "total.truth": [truthcounts],
        #     "total.query": [querycounts],
        #     "tp": [tpcounts],
        #     "fn": [fncounts],
        #     "fp": [fpcounts],
        #     "ambi": [ambicounts],
        #     "unk": [unkcounts]
        # })
        #
        # res["type"] = "records"

        # summary metrics
        res["recall"] = res["tp"] / (res["tp"] + res["fn"])
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"] = res["tp"] / (res["tp"] + res["fp"])
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        metrics_output = makeMetricsObject("som.py.comparison")
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))

        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "FP/ambiguity classes with info (multiple classes can "
                    "overlap):\n" + ambie.to_string(index=False)
                )
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "Reasons for defining as ambiguous (multiple reasons can overlap):\n"
                    + ambie.to_string(
                        formatters={"reason": "{{:<{}s}}".format(ambie["reason"].str.len().max()).format},
                        index=False,
                    )
                )
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={"reason": "{{:<{}s}}".format(ambie["reason"].str.len().max()).format},
                        index=False,
                    )
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering
            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)

            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]  # compare the two lists, not tps to itself

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception(
                                "Cannot merge TP features, inputs are out of order at %s / %s"
                                % (str(tps[x : x + 1]), str(tps2[x : x + 1]))
                            )

            logging.info("Merging...")

            cdata = {"CHROM": tps["CHROM"], "POS": tps["POS"], "tag": tps["tag"]}

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            # AMBI records live in ambipath (collecting them from fppath would
            # duplicate the FP records); UNK records are collected below.
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")
            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")
            if "ALT" in all_columns:
                first_columns.append("ALT")
            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]

            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format="%.8f")

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format="%.8f")
    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)
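
# --- illustrative sketch, not part of the original tool ----------------------
# The classification loop in main() above routes each query-only call into FP,
# AMBI or UNK based on the labels of the overlapping classification regions.
# The helper below isolates that decision logic; the function and parameter
# names are hypothetical, and `labels` stands in for the first value of each
# BedIntervalTree intersect() result.
def _classify_query_only_call(labels, ambi_fp, have_fp_regions):
    """Return "FP", "AMBI" or "UNK" for a call present in the query but
    absent from the truth set."""
    is_fp = False
    is_ambi = False
    for reason in labels:
        # lowercase "fp" regions only count as FP when --ambiguous-fp is set
        if reason == "FP" or (reason == "fp" and ambi_fp):
            is_fp = True
        else:
            is_ambi = True
    if is_fp:
        return "FP"
    if is_ambi:
        return "AMBI"
    # without FP / confident regions, calls outside any region count as FP
    return "UNK" if have_fp_regions else "FP"

# Example: _classify_query_only_call(["fp"], ambi_fp=False,
# have_fp_regions=True) would return "AMBI" ("fp" is demoted to "ambi-fp").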
def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not count UNK separately; count query-only calls outside the given "
                             "regions as FPs (default).")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous",
                        required=False, default=False, action="store_true",
                        help="Print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Keep scratch files instead of deleting them after the run.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False,
                        choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="Pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    # note: no default here -- argparse keeps the default registered first
    # (--fixchr-truth / --fixchr-query above), so the effective default stays True
    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth.")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Disable chr replacement for query.")

    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             "- this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available "
                             "for getting the AF both in truth and query variants.")

    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified "
                             "using a comma, e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")

    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")

    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use "
                             "the FP region bed file size when using --count-unk, and the size of all reference "
                             "contigs that overlap with the location specified in -l otherwise. This can be "
                             "overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths "
                             "of all contigs that have calls. The resulting value is used as fp.region.size.")

    parser.add_argument("--ci-level", dest="ci_level", default=0.95, type=float,
                        help="Confidence level for precision/recall confidence intervals (default: 0.95)")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    try:
        if type(args.af_strat_binsize) is str:
            args.af_strat_binsize = map(float, args.af_strat_binsize.split(","))
        else:
            args.af_strat_binsize = map(float, [args.af_strat_binsize])

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
    except:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warn("When creating ROCs without the -P switch, the ROC data points will only "
                         "include filtered variants (i.e. they will normally end at the caller's "
                         "quality threshold).")

    if not (args.ci_level > 0.0 and args.ci_level < 1.0):
        raise Exception("Confidence interval level must be > 0.0 and < 1.0.")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()
            md = {}
            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,                  # pass_only
                          args.fixchr_truth,     # chrprefix
                          args.normalize_truth,  # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,     # chrprefix
                          args.normalize_query,  # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        # check the tpfn_r outputs (query/truth swapped), not tpfn again
        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn_r", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn_r", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1

                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # unless --count-unk is given, query-only calls outside any
                # region become FPs
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" +
                             ambie.to_string(
                                 formatters={'reason': '{{:<{}s}}'.format(
                                     ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format},
                        index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering
            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)

            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]

            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
first_columns.append("ALT.truth") ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns]) featuretable = featuretable[ordered_columns] # make sure positions are integers featuretable["POS"] = featuretable["POS"].astype(int) logging.info("Saving feature table...") featuretable.to_csv(args.output + ".features.csv", float_format='%.8f') if args.roc is not None: roc_table = args.roc.from_table(featuretable) roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f') featuretable["FILTER"].fillna("", inplace=True) featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN", "REF.truth"] featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN", "ALT.truth"] af_t_feature = args.af_strat_truth af_q_feature = args.af_strat_query for vtype in ["records", "SNVs", "indels"]: if vtype == "SNVs": featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) & (featuretable["ALT"].str.len() == featuretable["REF"].str.len())] elif vtype == "indels": featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) | (featuretable["ALT"].str.len() != 1)] else: featuretable_this_type = featuretable if args.count_filtered_fn: res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0] res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0] res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0] res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0] if args.af_strat: start = 0.0 current_binsize = args.af_strat_binsize[0] next_binsize = 0 while start < 1.0: # include 1 in last interval end = min(1.000000001, start + current_binsize) n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") & (featuretable_this_type[af_t_feature] >= start) & (featuretable_this_type[af_t_feature] < end)] n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") & (featuretable_this_type[af_t_feature] >= start) & (featuretable_this_type[af_t_feature] < end)] n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") & (featuretable_this_type[af_q_feature] >= start) & (featuretable_this_type[af_q_feature] < end)] n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type[af_q_feature] >= start) & (featuretable_this_type[af_q_feature] < end)] n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") & (featuretable_this_type[af_q_feature] >= start) & (featuretable_this_type[af_q_feature] < end)] r = {"type": "%s.%f-%f" % (vtype, start, end), "total.truth": n_tp.shape[0] + n_fn.shape[0], "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0], "tp": n_tp.shape[0], "fp": n_fp.shape[0], "fn": n_fn.shape[0], "unk": n_unk.shape[0], "ambi": n_ambi.shape[0], } if args.count_filtered_fn: r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0] r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0] r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0] r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != 
""].shape[0] res = pandas.concat([res, pandas.DataFrame([r])]) if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0: roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn])) rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end) roc_table_strat.to_csv(rtname, float_format='%.8f') start += current_binsize next_binsize += 1 if next_binsize >= len(args.af_strat_binsize): next_binsize = 0 current_binsize = args.af_strat_binsize[next_binsize] # remove things where we haven't seen any variants in truth and query res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)] # summary metrics with confidence intervals ci_alpha = 1.0 - args.ci_level recall = binomialCI(res["tp"], res["tp"]+res["fn"], ci_alpha) precision = binomialCI(res["tp"], res["tp"]+res["fp"], ci_alpha) res["recall"], res["recall_lower"], res["recall_upper"] = recall res["recall2"] = res["tp"] / (res["total.truth"]) res["precision"], res["precision_lower"], res["precision_upper"] = precision res["na"] = res["unk"] / (res["total.query"]) res["ambiguous"] = res["ambi"] / res["total.query"] any_fp = fpclasses.countbases(label="FP") fp_region_count = 0 auto_size = True if args.fpr_size: try: fp_region_count = int(args.fpr_size) auto_size = False except: pass if auto_size: if any_fp: if args.location: chrom, _, rest = args.location.partition(":") if rest: start, _, end = rest.partition("_") if start: start = int(start) if end: end = int(end) else: fp_region_count += fpclasses.countbases(chrom, label="FP") else: fp_region_count = any_fp else: cs = fastaContigLengths(args.ref) if args.location: fp_region_count = calculateLength(cs, args.location) else: # use all locations we saw calls on h1 = Tools.vcfextract.extractHeadersJSON(ntpath) h1_chrs = h1["tabix"]["chromosomes"] if not h1_chrs: logging.warn("ntpath is empty") h1_chrs = [] h2 = Tools.vcfextract.extractHeadersJSON(nqpath) h2_chrs = h2["tabix"]["chromosomes"] if not h2_chrs: logging.warn("nqpath is empty") h2_chrs = [] combined_chrs = list(set(h1_chrs + h2_chrs)) if len(combined_chrs) > 0: qlocations = " ".join(combined_chrs) fp_region_count = calculateLength(cs, qlocations) else: fp_region_count = 0 res["fp.region.size"] = fp_region_count res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"] if args.count_filtered_fn: res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"]) res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] + res["fp"] - res["fp.filtered"]) res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"] res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"]) res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"] # HAP-162 remove inf values res.replace([np.inf, -np.inf], 0) metrics_output["metrics"].append(dataframeToMetricsTable("result", res)) vstring = "som.py-%s" % Tools.version logging.info("\n" + res.to_string()) # in default mode, print result summary to stdout if not args.quiet and not args.verbose: print "\n" + res.to_string() res["sompyversion"] = vstring vstring = " ".join(sys.argv) res["sompycmd"] = vstring res.to_csv(args.output + ".stats.csv") with open(args.output + ".metrics.json", "w") as fp: json.dump(metrics_output, fp) finally: if args.delete_scratch: shutil.rmtree(scratch) else: logging.info("Scratch kept at %s" % scratch)
def main(): parser = argparse.ArgumentParser("Haplotype Comparison") # input parser.add_argument( '--location', '-l', dest='locations', required=False, default=None, help= 'Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).' ) parser.add_argument("-v", "--version", dest="version", action="store_true", help="Show version number and exit.") parser.add_argument( "-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False, help="Use to include failing query variants in comparison.") parser.add_argument( "--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False, help="Include failing variants from the truth dataset.") parser.add_argument( "-R", "--restrict-regions", dest="regions_bedfile", default=None, type=str, help= "Restrict analysis to given (sparse) regions (using -R in bcftools).") parser.add_argument( "-T", "--target-regions", dest="targets_bedfile", default=None, type=str, help= "Restrict analysis to given (dense) regions (using -T in bcftools).") parser.add_argument( "-f", "--false-positives", dest="fp_bedfile", default=None, type=str, help="False positive / confident call regions (.bed or .bed.gz).") parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.") # output parser.add_argument("-o", "--report-prefix", dest="reports_prefix", default=None, help="Filename prefix for report output.") parser.add_argument("-V", "--write-vcf", dest="write_vcf", default=False, action="store_true", help="Write an annotated VCF.") parser.add_argument( "-B", "--write-bed", dest="write_bed", default=False, action="store_true", help="Write a bed file with the haplotype blocks that were used.") parser.add_argument("-X", "--write-counts", dest="write_counts", default=True, action="store_true", help="Write advanced counts and metrics.") parser.add_argument("--no-write-counts", dest="write_counts", default=True, action="store_false", help="Do not write advanced counts and metrics.") parser.add_argument( "--raw-counts", dest="raw_counts", default=False, action="store_true", help= "Count variants in unprocessed input VCFs and output as TOTAL.*.RAW.") parser.add_argument( "--roc", dest="roc", default=False, help="Select an INFO feature to produce a ROC on. This works best with " "--no-internal-preprocessing and --no-internal-leftshift since these " "flags preserve the most INFO flags from the input files.") parser.add_argument("--roc-filter", dest="roc_filter", default=False, help="Select a filter to ignore when making ROCs.") parser.add_argument( "--roc-reversed", dest="roc_reversed", default=False, help= "Change the meaning of the ROC feature to count the other way around (higher values=bad)." 
    )

    parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false",
                        help="Keep scratch directory and files after the run.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             "preprocessing to be switched on).")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Disable chr replacement for query (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Do not give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Enable xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Enable xcmp's internal VCF preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF preprocessing.")

    parser.add_argument("--match-raw", dest="int_match_raw", action="store_true", default=False,
                        help="Add a matching step in xcmp which also matches raw variant calls. This helps"
                             " when comparing files with very different representations.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. 
" "When used with -l, and when it is known that there are variants at all given locations " "this is not needed and can be switched off to save time.") parser.add_argument( "-w", "--window-size", dest="window", default=50, type=int, help= "Minimum distance between two variants such that they fall into different haplotype " "blocks") parser.add_argument( "--enumeration-threshold", dest="max_enum", default=16768, type=int, help= "Enumeration threshold / maximum number of sequences to enumerate per block." ) parser.add_argument( "-e", "--expand-hapblocks", dest="hb_expand", default=30, type=int, help="Expand haplotype blocks by this many basepairs left and right.") parser.add_argument("--threads", dest="threads", default=multiprocessing.cpu_count(), type=int, help="Number of threads to use.") parser.add_argument("--engine", dest="engine", default="xcmp", choices=["xcmp", "vcfeval"], help="Comparison engine to use.") parser.add_argument( "--engine-vcfeval-path", dest="engine_vcfeval", required=False, help="This parameter should give the path to the \"rtg\" executable.") parser.add_argument( "--engine-vcfeval-template", dest="engine_vcfeval_template", required=False, help= "Vcfeval needs the reference sequence formatted in its own file format " "(SDF -- run rtg format -o ref.SDF ref.fa).") if Tools.has_sge: parser.add_argument( "--force-interactive", dest="force_interactive", default=False, action="store_true", help= "Force running interactively (i.e. when JOB_ID is not in the environment)" ) parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*") parser.add_argument( "--logfile", dest="logfile", default=None, help="Write logging information into file rather than to stderr") verbosity_options = parser.add_mutually_exclusive_group(required=False) verbosity_options.add_argument( "--verbose", dest="verbose", default=False, action="store_true", help="Raise logging level from warning to info.") verbosity_options.add_argument( "--quiet", dest="quiet", default=False, action="store_true", help="Set logging level to output errors only.") args, unknown_args = parser.parse_known_args() if not Tools.has_sge: args.force_interactive = True if args.verbose: loglevel = logging.INFO elif args.quiet: loglevel = logging.ERROR else: loglevel = logging.WARNING # reinitialize logging for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig(filename=args.logfile, format='%(asctime)s %(levelname)-8s %(message)s', level=loglevel) # remove some safe unknown args unknown_args = [ x for x in unknown_args if x not in ["--force-interactive"] ] if len(sys.argv) < 2 or len(unknown_args) > 0: if unknown_args: logging.error("Unknown arguments specified : %s " % str(unknown_args)) parser.print_help() exit(0) if args.version: print "Hap.py %s" % Tools.version exit(0) if args.roc: args.write_vcf = True # disable all clever matching if args.unhappy: args.int_preprocessing = False args.int_preprocessing_ls = False args.no_hc = True # Counting with partial credit elif args.partial_credit: # partial_credit switch is overridden by --no-* switches args.int_preprocessing = True args.int_preprocessing_ls = True elif args.partial_credit is None: # in the default setting, we enable partial credit but only override the # preprocessing settings if they haven't been specified if args.int_preprocessing is None: args.int_preprocessing = True if args.int_preprocessing_ls is None: args.int_preprocessing_ls = True elif args.partial_credit is not None: # explicitly set to false 
args.int_preprocessing = False args.int_preprocessing_ls = True if args.int_preprocessing is None: args.int_preprocessing = False if args.int_preprocessing_ls is None: args.int_preprocessing_ls = False logging.info("Preprocessing settings: %s / %s / %s" % ("leftshift" if args.int_preprocessing_ls else "no-leftshift", "splitting" if args.int_preprocessing else "raw calls", "haplocompare" if not args.no_hc else "no-haplocompare")) # sanity-check regions bed file (HAP-57) if args.regions_bedfile: logging.info("Checking input regions.") if bedOverlapCheck(args.regions_bedfile): raise Exception( "The regions bed file (specified using -R) has overlaps, this will not work with xcmp." " You can either use -T, or run the file through bedtools merge" ) args.preprocessing_truth = True args.preprocessing = True if args.targets_bedfile or args.engine != "xcmp": args.preprocessing_truth = True args.preprocessing = True if args.fp_bedfile and not os.path.exists(args.fp_bedfile): raise Exception("FP/confident call region bed file does not exist.") tempfiles = [] try: if not args.force_interactive and "JOB_ID" not in os.environ: parser.print_help() raise Exception( "Please qsub me so I get approximately 1 GB of RAM per thread." ) if not args.ref: args.ref = Tools.defaultReference() if not os.path.exists(args.ref): raise Exception("Please specify a valid reference path using -r.") if not args.reports_prefix: raise Exception("Please specify an output prefix using -o ") if not os.path.exists( os.path.dirname(os.path.abspath(args.reports_prefix))): raise Exception( "The output path does not exist. Please specify a valid output path and prefix using -o" ) if os.path.basename(args.reports_prefix) == "" or os.path.isdir( args.reports_prefix): raise Exception( "The output path should specify a file name prefix. Please specify a valid output path " "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* ." ) # noinspection PyProtectedMember if not args._vcfs or len(args._vcfs) != 2: raise Exception("Please specify exactly two input VCFs.") # noinspection PyProtectedMember args.vcf1 = args._vcfs[0] # noinspection PyProtectedMember args.vcf2 = args._vcfs[1] if not os.path.exists(args.vcf1): raise Exception("Input file %s does not exist." % args.vcf1) if not os.path.exists(args.vcf2): raise Exception("Input file %s does not exist." % args.vcf2) logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2)) h1 = vcfextract.extractHeadersJSON(args.vcf1) if args.auto_index and not h1["tabix"]: logging.info( "Creating indexed version of %s -- consider creating an index beforehand to save time here." % args.vcf1) vtf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="truth.ix", suffix=".vcf.gz") vtf.close() tempfiles.append(vtf.name) tempfiles.append(vtf.name + ".tbi") args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name) h1 = vcfextract.extractHeadersJSON(args.vcf1) h2 = vcfextract.extractHeadersJSON(args.vcf2) if args.auto_index and not h2["tabix"]: logging.info( "Creating indexed version of %s -- consider creating an index beforehand to save time here." 
                % args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = True
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            refids_found = 0
            rids_vh = set()
            rids_v1 = set()
            rids_v2 = set()
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower():
                    rids_vh.add(refid)
                if refid in v1_ref.lower():
                    rids_v1.add(refid)
                if refid in v2_ref.lower():
                    rids_v2.add(refid)

            rids_v1 = sorted(list(rids_v1))
            rids_v2 = sorted(list(rids_v2))
            rids_vh = sorted(list(rids_vh))

            to_cmp = None
            if rids_v1:
                to_cmp = rids_v1
            if rids_v2:
                to_cmp = rids_v2
            if rids_vh:
                to_cmp = rids_vh
            if to_cmp and rids_v1 and rids_v1 != to_cmp:
                ref_check = False
            if to_cmp and rids_v2 and rids_v2 != to_cmp:
                ref_check = False
            if to_cmp and rids_vh and rids_vh != to_cmp:
                ref_check = False
        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         "accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes (the -l help promises chr1-22, chrX and chrY)
            args.locations = ["chr" + x for x in map(str, range(1, 23)) + ["X", "Y"]]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        # HAP-143 fix the case where no chromosomes are in truth or query
        try:
            if not h1["tabix"]["chromosomes"]:
                h1["tabix"]["chromosomes"] = []
        except:
            pass
        try:
            if not h2["tabix"]["chromosomes"]:
                h2["tabix"]["chromosomes"] = []
        except:
            pass

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            logging.info(str(h1["tabix"]))
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn(
                "Query file is not Tabix indexed. Switching on pre-processing + chr name conversion."
) # don't overwrite setting, but if it's None, replace with True to be sure if args.fixchr_query is None: args.fixchr_query = True elif args.fixchr_query is None: # autodetect chr naming count_with_fix = len([ __ for __ in h2["tabix"]["chromosomes"] if ("chr%s" % str(__)) in args.locations ]) count_no_fix = len([ __ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations ]) logging.info( "Query: Number of chromosome names matching with / without renaming : %i / %i " % (count_with_fix, count_no_fix)) if count_with_fix > count_no_fix: args.fixchr_query = True logging.info("Will fix chromosome names (query).") else: logging.info("Will not fix chromosome names (query).") args.fixchr_query = False if args.fixchr_truth or args.preprocessing_norm: args.preprocessing_truth = True if args.fixchr_query or args.preprocessing_norm: args.preprocessing = True if args.preprocessing_truth: vtf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="truth.pp", suffix=".vcf.gz") vtf.close() tempfiles.append(vtf.name) preprocessVCF( args.vcf1, vtf.name, ",".join(args.locations), not args.usefiltered_truth, # pass_only args.fixchr_truth, # chrprefix args.preprocessing_norm, # norm, args.regions_bedfile, args.targets_bedfile, args.ref) args.vcf1 = vtf.name # get headers again if we preprocessed h1 = vcfextract.extractHeadersJSON(args.vcf1) if args.preprocessing: vtf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="query.pp", suffix=".vcf.gz") vtf.close() tempfiles.append(vtf.name) preprocessVCF( args.vcf2, vtf.name, ",".join(args.locations), not args.usefiltered, # pass_only args.fixchr_query, # chrprefix args.preprocessing_norm, # norm, args.regions_bedfile, args.targets_bedfile, args.ref) args.vcf2 = vtf.name # get headers again if we preprocessed h2 = vcfextract.extractHeadersJSON(args.vcf2) if not h1["tabix"]: raise Exception("Truth file is not Tabix indexed.") if not h2["tabix"]: raise Exception("Query file is not Tabix indexed.") newlocations = [] if not h1["tabix"]["chromosomes"]: h1["tabix"]["chromosomes"] = [] if not h2["tabix"]["chromosomes"]: h2["tabix"]["chromosomes"] = [] for _xc in args.locations: xc = _xc.split(":")[0] if xc not in h1["tabix"]["chromosomes"]: logging.warn("No calls for location %s in truth!" % xc) if xc not in h2["tabix"]["chromosomes"]: logging.warn("No calls for location %s in query!" % xc) if (xc not in h1["tabix"]["chromosomes"]) and ( xc not in h2["tabix"]["chromosomes"]): logging.warn( "Removing location %s because neither input file has calls there." % xc) else: newlocations.append(_xc) if not newlocations: raise Exception( "Location list is empty: the input files do not appear to have variants on any of %s" % str(args.locations)) args.locations = newlocations if args.threads > 1: logging.info("Running using %i parallel processes." 
% args.threads) pool = multiprocessing.Pool(int(args.threads)) # find balanced pieces args.pieces = (args.threads + len(args.locations) - 1) / len( args.locations) res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args) if None in res: raise Exception("One of the blocksplit processes failed.") tempfiles += res args.locations = [] for f in res: with open(f) as fp: for l in fp: ll = l.strip().split("\t", 3) if len(ll) < 3: continue xchr = ll[0] start = int(ll[1]) + 1 end = int(ll[2]) args.locations.append("%s:%i-%i" % (xchr, start, end)) else: pool = None # count variants before normalisation if "samples" not in h1 or not h1["samples"]: raise Exception("Cannot read sample names from truth VCF file") if args.raw_counts: counts_truth = Haplo.quantify.run_quantify( args.vcf1, None, None, {"CONF": args.fp_bedfile} if args.fp_bedfile else None, args.ref, h1["samples"][0], locations=args.locations) else: counts_truth = None if "samples" not in h2 or not h2["samples"]: raise Exception("Cannot read sample names from query VCF file") if args.raw_counts: counts_query = Haplo.quantify.run_quantify( args.vcf2, None, None, {"CONF": args.fp_bedfile} if args.fp_bedfile else None, args.ref, h2["samples"][0], locations=args.locations) else: counts_query = None tf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="hap.py.result.", suffix=".vcf.gz") tf.close() tempfiles.append(tf.name) output_name = tf.name if args.engine == "xcmp": # do xcmp logging.info("Using xcmp for comparison") res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args) tempfiles += [x[0] for x in res if x is not None] # VCFs tempfiles += [ x[1] for x in res if x is not None and x[1] is not None ] # beds (if any) if None in res: raise Exception("One of the xcmp jobs failed.") if len(res) == 0: raise Exception( "Input files/regions do not contain variants (0 haplotype blocks were processed)." ) # concatenate + index bedfiles = [ x[1] for x in res if x is not None and x[1] is not None ] if args.write_bed and bedfiles: runme = " ".join(["cat"] + bedfiles + [ ">", args.reports_prefix.replace(" ", "\\ ") + ".blocks.bed" ]) logging.info("Concatenating block files: %s..." 
% runme) subprocess.check_call(runme, shell=True) logging.info("Concatenating variants...") runme_list = [x[0] for x in res if x is not None] if len(runme_list) == 0: raise Exception("No outputs to concatenate!") fo = Tools.BGZipFile(output_name, True) for i, x in enumerate(runme_list): f = gzip.GzipFile(x) for l in f: if i == 0 or not l[0] == "#": fo.write(l) fo.close() logging.info("Indexing...") to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ") logging.info("Running '%s'" % to_run) subprocess.check_call(to_run, shell=True) elif args.engine == "vcfeval": tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args) else: raise Exception("Unknown comparison engine: %s" % args.engine) if args.write_counts: json_name = args.reports_prefix + ".counts.json" else: tf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="counts.", suffix=".json") tf.close() json_name = tf.name logging.info("Counting variants...") counts = Haplo.quantify.run_quantify( output_name, json_name, args.reports_prefix + ".vcf.gz" if args.write_vcf else False, {"CONF": args.fp_bedfile} if args.fp_bedfile else None, args.ref) df = pandas.DataFrame(counts) if args.write_counts: df.to_csv(args.reports_prefix + ".counts.csv") metrics_output = makeMetricsObject("hap.py.comparison") if args.write_counts: metrics_output["metrics"].append( dataframeToMetricsTable("raw.counts", df)) # calculate precision / recall count_types = [] if args.raw_counts: simplified_truth_counts = Haplo.quantify.simplify_counts( counts_truth, h1["samples"][0:1]) simplified_query_counts = Haplo.quantify.simplify_counts( counts_query, h2["samples"][0:1]) count_types += simplified_truth_counts.keys() count_types += simplified_query_counts.keys() else: simplified_truth_counts = None simplified_query_counts = None simplified_numbers = Haplo.quantify.simplify_counts(counts) count_types += simplified_numbers.keys() count_types = sorted(list(set(count_types))) for vtype in count_types: if vtype not in simplified_numbers: simplified_numbers[vtype] = {} simplified_numbers[vtype]["METRIC.Recall"] = 0 simplified_numbers[vtype]["METRIC.Recall2"] = 0 simplified_numbers[vtype]["METRIC.Precision"] = 0 simplified_numbers[vtype]["METRIC.Frac_NA"] = 0 try: simplified_numbers[vtype]["METRIC.Recall"] = \ float(simplified_numbers[vtype]["TRUTH.TP"]) / \ float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"]) except: pass try: simplified_numbers[vtype]["METRIC.Recall2"] = \ float(simplified_numbers[vtype]["TRUTH.TP"]) / \ float(simplified_numbers[vtype]["TRUTH.TOTAL"]) except: pass try: simplified_numbers[vtype]["METRIC.Precision"] = \ float(simplified_numbers[vtype]["QUERY.TP"]) / \ float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"]) except: pass try: simplified_numbers[vtype]["METRIC.Frac_NA"] = \ float(simplified_numbers[vtype]["QUERY.UNK"]) / \ float(simplified_numbers[vtype]["QUERY.TOTAL"]) except: pass try: simplified_numbers[vtype][ "TRUTH.TOTAL.RAW"] = simplified_truth_counts[vtype][ h1["samples"][0] + ".TOTAL"] except: pass try: simplified_numbers[vtype][ "QUERY.TOTAL.RAW"] = simplified_query_counts[vtype][ h2["samples"][0] + ".TOTAL"] except: pass pandas.set_option("display.width", 120) pandas.set_option("display.max_columns", 1000) df = pandas.DataFrame(simplified_numbers).transpose() vstring = "hap.py-%s" % Tools.version vstring += " ".join(sys.argv) df.loc[vstring] = 0 # for x in df: # # everything not a metric is a count # if not 
x.startswith("METRIC"): # df[x] = df[x].astype("int64") df[[ "TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA" ]].to_csv(args.reports_prefix + ".summary.csv") metrics_output["metrics"].append( dataframeToMetricsTable( "summary.metrics", df[[ "TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA" ]])) if args.write_counts: df.to_csv(args.reports_prefix + ".extended.csv") metrics_output["metrics"].append( dataframeToMetricsTable("all.metrics", df)) essential_numbers = df[[ "TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA" ]] pandas.set_option('display.max_columns', 500) pandas.set_option('display.width', 1000) essential_numbers = essential_numbers[essential_numbers.index.isin( ["Locations.SNP", "Locations.INDEL"])] logging.info("\n" + str(essential_numbers)) # in default mode, print result summary to stdout if not args.quiet and not args.verbose: print "Benchmarking Summary:" print str(essential_numbers) if args.roc: vcf = args.reports_prefix + ".vcf.gz" res = Haplo.happyroc.roc(vcf, args.roc, args.roc_filter, args.reports_prefix + ".roc", args.roc_reversed) for t in res.iterkeys(): rocdf = pandas.read_table(res[t]) metrics_output["metrics"].append( dataframeToMetricsTable("roc." + t, rocdf)) with open(args.reports_prefix + ".metrics.json", "w") as fp: json.dump(metrics_output, fp) finally: if args.delete_scratch: for x in tempfiles: try: os.remove(x) except: pass else: logging.info("Scratch files kept : %s" % (str(tempfiles)))
def quantify(args): """ Run quantify and write tables """ vcf_name = args.in_vcf[0] if not vcf_name or not os.path.exists(vcf_name): raise Exception("Cannot read input VCF.") json_name = args.reports_prefix + ".counts.json" logging.info("Counting variants...") output_vcf = args.reports_prefix + ".vcf.gz" roc_table = None if args.roc: roc_table = args.reports_prefix + ".roc.tsv" if args.verbose: # verbose writes internal summary file # this will be what we migrate to in 0.3.0 sum_file = args.reports_prefix + ".internal.summary.tsv" else: sum_file = None counts = Haplo.quantify.run_quantify(vcf_name, json_name, output_vcf if args.write_vcf else False, {"CONF": args.fp_bedfile} if args.fp_bedfile else None, args.ref, threads=args.threads, output_vtc=args.output_vtc, qtype=args.type, roc_val=args.roc, roc_file=roc_table, summary_file=sum_file, roc_filter=args.roc_filter, roc_delta=args.roc_delta, output_filter_rocs=args.output_filter_rocs, clean_info=not args.preserve_info) df = pandas.DataFrame(counts) metrics_output = makeMetricsObject("%s.comparison" % args.runner) if args.write_counts: df.to_csv(args.reports_prefix + ".counts.csv") metrics_output["metrics"].append(dataframeToMetricsTable("raw.counts", df)) # calculate precision / recall count_types = [] simplified_numbers = Haplo.quantify.simplify_counts(counts) count_types += simplified_numbers.keys() count_types = sorted(list(set(count_types))) for vtype in count_types: if vtype not in simplified_numbers: simplified_numbers[vtype] = {} simplified_numbers[vtype]["METRIC.Recall"] = 0 simplified_numbers[vtype]["METRIC.Recall2"] = 0 simplified_numbers[vtype]["METRIC.Precision"] = 0 simplified_numbers[vtype]["METRIC.Frac_NA"] = 0 try: simplified_numbers[vtype]["METRIC.Recall"] = \ float(simplified_numbers[vtype]["TRUTH.TP"]) / \ float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"]) except: pass try: simplified_numbers[vtype]["METRIC.Recall2"] = \ float(simplified_numbers[vtype]["TRUTH.TP"]) / \ float(simplified_numbers[vtype]["TRUTH.TOTAL"]) except: pass try: simplified_numbers[vtype]["METRIC.Precision"] = \ float(simplified_numbers[vtype]["QUERY.TP"]) / \ float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"]) except: pass try: simplified_numbers[vtype]["METRIC.Frac_NA"] = \ float(simplified_numbers[vtype]["QUERY.UNK"]) / \ float(simplified_numbers[vtype]["QUERY.TOTAL"]) except: pass pandas.set_option("display.width", 120) pandas.set_option("display.max_columns", 1000) df = pandas.DataFrame(simplified_numbers).transpose() vstring = "%s-%s" % (args.runner, Tools.version) vstring += " ".join(sys.argv) df.loc[vstring] = 0 summary_columns = ["TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA"] for additional_column in ["TRUTH.TOTAL.TiTv_ratio", "QUERY.TOTAL.TiTv_ratio", "TRUTH.TOTAL.het_hom_ratio", "QUERY.TOTAL.het_hom_ratio"]: if additional_column in df.columns: summary_columns.append(additional_column) df[summary_columns].to_csv(args.reports_prefix + ".summary.csv") metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics", df[summary_columns])) if args.write_counts: df.to_csv(args.reports_prefix + ".extended.csv") metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df)) essential_numbers = df[summary_columns] pandas.set_option('display.max_columns', 500) pandas.set_option('display.width', 1000) essential_numbers = essential_numbers[essential_numbers.index.isin( ["Locations.SNP", "Locations.INDEL"])] 
logging.info("\n" + str(essential_numbers)) # in default mode, print result summary to stdout if not args.quiet and not args.verbose: print "Benchmarking Summary:" print str(essential_numbers) if args.roc: res = Haplo.happyroc.roc(roc_table, args.reports_prefix + ".roc") # keep this for verbose output if not args.verbose: try: os.unlink(roc_table) except: pass for t in res.iterkeys(): metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, res[t])) with open(args.reports_prefix + ".metrics.json", "w") as fp: json.dump(metrics_output, fp)
def main(): parser = argparse.ArgumentParser("Somatic Comparison") parser.add_argument("truth", help="Truth VCF file") parser.add_argument("query", help="Query VCF file") parser.add_argument("-o", "--output", dest="output", required=True, help="Output file prefix for statistics and feature table (when selected)") parser.add_argument("-l", "--location", dest="location", default="", help="Location for bcftools view (e.g. chr1)") parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile", default=None, type=str, help="Restrict analysis to given (sparse) regions (using -R in bcftools).") parser.add_argument("-T", "--target-regions", dest="targets_bedfile", default=None, type=str, help="Restrict analysis to given (dense) regions (using -T in bcftools).") parser.add_argument("-f", "--false-positives", dest="FP", help="False-positive region bed file to distinguish UNK from FP") parser.add_argument("-a", "--ambiguous", dest="ambi", action='append', help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed " "in some replicates)") parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False, help="Use FP calls from ambiguous region files also.") parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false', help="Do not use FP calls from ambiguous region files also.") parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False, help="Assume the truth set covers the whole genome and only count FPs in regions " "specified by the truth VCF or ambiguous/false-positive regions.") parser.add_argument("--no-count-unk", dest="count_unk", action='store_false', help="Do not use FP calls from ambiguous region files also.") parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False, default=False, action="store_true", help="print a table giving the number of ambiguous events per category") parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(), help="Specify a reference file.") parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None, help="Filename prefix for scratch report output.") parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false", help="Filename prefix for scratch report output.") parser.add_argument("--continue", dest="cont", default=False, action="store_true", help="Continue from scratch space (i.e. 
use VCFs in there if they already exist).") parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False, help="Use to include failing variants in comparison.") parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(), help="Select a feature table to output.") parser.add_argument("--bam", dest="bams", default=[], action="append", help="pass one or more BAM files for feature table extraction") parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true", help="Enable running of bcftools norm on the truth file.") parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true", help="Enable running of bcftools norm on the query file.") parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true", help="Enable running of bcftools norm on both truth and query file.") parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True, help="Add chr prefix to truth file (default: true).") parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True, help="Add chr prefix to query file (default: true).") parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None, help="Same as --fixchr-truth.") parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None, help="Same as --fixchr-query.") parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False, help="Disable chr replacement for truth (default: false).") parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False, help="Add chr prefix to query file (default: false).") parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true", help="Disable checking the order of TP features (dev feature).") parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(), help="Create a ROC-style table. This is caller specific " " - this will override the --feature-table switch!") parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true", help="Stratify into different AF buckets. This needs to have features available" "for getting the AF both in truth and query variants.") parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2, help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, " "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.") parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE", help="Feature name to use for retrieving AF for truth variants (TP and FN)") parser.add_argument("--af-query", dest="af_strat_query", default="T_AF", help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)") parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true", help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all " "variants) and either the --feature-table or --roc switch.") parser.add_argument("--fp-region-size", dest="fpr_size", help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using" " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise." 
" This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls." " The resulting value is used as fp.region.size.") parser.add_argument("--logfile", dest="logfile", default=None, help="Write logging information into file rather than to stderr") verbosity_options = parser.add_mutually_exclusive_group(required=False) verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true", help="Raise logging level from warning to info.") verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true", help="Set logging level to output errors only.") args = parser.parse_args() if args.verbose: loglevel = logging.INFO elif args.quiet: loglevel = logging.ERROR else: loglevel = logging.WARNING try: if type(args.af_strat_binsize) is str: args.af_strat_binsize = map(float, args.af_strat_binsize.split(",")) else: args.af_strat_binsize = map(float, [args.af_strat_binsize]) if not args.af_strat_binsize: raise Exception("Bin size list is empty") except: logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize)) exit(1) # reinitialize logging for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig(filename=args.logfile, format='%(asctime)s %(levelname)-8s %(message)s', level=loglevel) if args.normalize_all: args.normalize_truth = True args.normalize_query = True if args.roc: args.roc = ROC.make(args.roc) args.features = args.roc.ftname if not args.inc_nonpass: logging.warn("When creating ROCs without the -P switch, the ROC data points will only " "include filtered variants (i.e. they will normally end at the caller's " "quality threshold).") if args.af_strat and not args.features: raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together " "with --feature-table or --roc") if args.count_filtered_fn and (not args.inc_nonpass or not args.features): raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, " "and when using unfiltered variants. 
    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)

    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()
            md = {}
            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn_r", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn_r", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")
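
        # bcftools isec -p writes one VCF per input into the target directory:
        # 0000.vcf.gz = records private to the first input, 0001.vcf.gz =
        # records private to the second, 0002/0003.vcf.gz = shared records as
        # seen in the first / second input. For tpfn (truth vs. query) this
        # means 0000 = FN, 0001 = query-only (FP/UNK/AMBI candidates) and
        # 0002 = TP.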
        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1

                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)
        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to
        # bgzipped vcf and create an index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")
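
        # res now has one row per variant class reported by bcftools stats
        # (after relabelling: "SNVs", "indels", "records", ...), with count
        # columns total.truth, total.query, tp, fp, fn, unk and ambi; the
        # summary metrics further down are ratios of these counts.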
        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort_values(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort_values(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" +
                             ambie.to_string(formatters={'reason': '{{:<{}s}}'.format(
                                 ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + \
                          ambie.to_string(formatters={'reason': '{{:<{}s}}'.format(
                              ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering
            logging.info("Sorting...")
            tps.sort_values(["CHROM", "POS"], inplace=True)
            tps2.sort_values(["CHROM", "POS"], inplace=True)

            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths: %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" %
                                            (str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)
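
            # After the merge above, each shared feature appears twice in tpc:
            # the query-side value under its plain name and the truth-side
            # value under "<name>.truth". The FN columns were renamed the same
            # way so that truth-derived features line up across TP and FN
            # records.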
            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")
            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")
            if "ALT" in all_columns:
                first_columns.append("ALT")
            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]

            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = \
                featuretable.ix[featuretable["tag"] == "FN", "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = \
                featuretable.ix[featuretable["tag"] == "FN", "ALT.truth"]

            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") &
                        (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") &
                        (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") &
                        (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") &
                        (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)

                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0]}

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]
""].shape[0] res = pandas.concat([res, pandas.DataFrame([r])]) if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0: roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn])) rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end) roc_table_strat.to_csv(rtname, float_format='%.8f') start += current_binsize next_binsize += 1 if next_binsize >= len(args.af_strat_binsize): next_binsize = 0 current_binsize = args.af_strat_binsize[next_binsize] # remove things where we haven't seen any variants in truth and query res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)] # summary metrics res["recall"] = res["tp"] / (res["tp"] + res["fn"]) res["recall2"] = res["tp"] / (res["total.truth"]) res["precision"] = res["tp"] / (res["tp"] + res["fp"]) res["na"] = res["unk"] / (res["total.query"]) res["ambiguous"] = res["ambi"] / res["total.query"] any_fp = fpclasses.countbases(label="FP") fp_region_count = 0 auto_size = True if args.fpr_size: try: fp_region_count = int(args.fpr_size) auto_size = False except: pass if auto_size: if any_fp: if args.location: chrom, _, rest = args.location.partition(":") if rest: start, _, end = rest.partition("_") if start: start = int(start) if end: end = int(end) else: fp_region_count += fpclasses.countbases(chrom, label="FP") else: fp_region_count = any_fp else: cs = fastaContigLengths(args.ref) if args.location: fp_region_count = calculateLength(cs, args.location) else: # use all locations we saw calls on h1 = Tools.vcfextract.extractHeadersJSON(ntpath) h1_chrs = h1["tabix"]["chromosomes"] if not h1_chrs: logging.warn("ntpath is empty") h1_chrs = [] h2 = Tools.vcfextract.extractHeadersJSON(nqpath) h2_chrs = h2["tabix"]["chromosomes"] if not h2_chrs: logging.warn("nqpath is empty") h2_chrs = [] combined_chrs = list(set(h1_chrs + h2_chrs)) if len(combined_chrs) > 0: qlocations = " ".join(combined_chrs) fp_region_count = calculateLength(cs, qlocations) else: fp_region_count = 0 res["fp.region.size"] = fp_region_count res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"] if args.count_filtered_fn: res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"]) res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] + res["fp"] - res["fp.filtered"]) res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"] res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"]) res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"] # HAP-162 remove inf values res.replace([np.inf, -np.inf], 0) metrics_output["metrics"].append(dataframeToMetricsTable("result", res)) vstring = "som.py-%s" % Tools.version logging.info("\n" + res.to_string()) # in default mode, print result summary to stdout if not args.quiet and not args.verbose: print "\n" + res.to_string() res["sompyversion"] = vstring vstring = " ".join(sys.argv) res["sompycmd"] = vstring res.to_csv(args.output + ".stats.csv") with open(args.output + ".metrics.json", "w") as fp: json.dump(metrics_output, fp) finally: if args.delete_scratch: shutil.rmtree(scratch) else: logging.info("Scratch kept at %s" % scratch)

def main():
    args = parse_args()

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()
            md = {}
            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")
        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth, ntpath, args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")
        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query, nqpath, args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        tpfn_r_files = all([
            os.path.exists(os.path.join(scratch, "tpfn_r", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn_r", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"))
        ])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
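
        # Assumption based on the comments below: ambiguity BED files carry a
        # class label in the fifth column (0-based index 4), i.e.
        # chrom <tab> start <tab> end <tab> ... <tab> label; older files
        # without that column will produce odd labels here.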
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1

                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)
        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to
        # bgzipped vcf and create an index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")
        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort_values(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort_values(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" +
                             ambie.to_string(formatters={'reason': '{{:<{}s}}'.format(
                                 ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + \
                          ambie.to_string(formatters={'reason': '{{:<{}s}}'.format(
                              ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering
            logging.info("Sorting...")
            tps.sort_values(["CHROM", "POS"], inplace=True)
            tps2.sort_values(["CHROM", "POS"], inplace=True)

            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths: %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" %
                                            (str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)
"ALT.truth" in all_columns: first_columns.append("ALT.truth") ordered_columns = first_columns + sorted( [x for x in all_columns if x not in first_columns]) featuretable = featuretable[ordered_columns] # make sure positions are integers featuretable["POS"] = featuretable["POS"].astype(int) logging.info("Saving feature table...") featuretable.to_csv(args.output + ".features.csv", float_format='%.8f') if args.roc is not None: roc_table = args.roc.from_table(featuretable) roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f') featuretable["FILTER"].fillna("", inplace=True) featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN", "REF.truth"] featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN", "ALT.truth"] af_t_feature = args.af_strat_truth af_q_feature = args.af_strat_query for vtype in ["records", "SNVs", "indels"]: featuretable["vtype"] = resolve_vtype(args) featuretable_this_type = featuretable if args.count_filtered_fn: res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "" )].shape[0] res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "" )].shape[0] res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "" )].shape[0] res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[ (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "" )].shape[0] if args.af_strat: start = 0.0 end = 1.0 current_binsize = args.af_strat_binsize[0] next_binsize = 0 while start < 1.0: # include 1 in last interval end = start + current_binsize if end >= 1: end = 1.00000001 if start >= end: break n_tp = featuretable_this_type[ (featuretable_this_type["tag"] == "TP") & (featuretable_this_type[af_t_feature] >= start) & (featuretable_this_type[af_t_feature] < end)] n_fn = featuretable_this_type[ (featuretable_this_type["tag"] == "FN") & (featuretable_this_type[af_t_feature] >= start) & (featuretable_this_type[af_t_feature] < end)] n_fp = featuretable_this_type[ (featuretable_this_type["tag"] == "FP") & (featuretable_this_type[af_q_feature] >= start) & (featuretable_this_type[af_q_feature] < end)] n_ambi = featuretable_this_type[ (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type[af_q_feature] >= start) & (featuretable_this_type[af_q_feature] < end)] n_unk = featuretable_this_type[ (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type[af_q_feature] >= start) & (featuretable_this_type[af_q_feature] < end)] r = { "type": "%s.%f-%f" % (vtype, start, end), "total.truth": n_tp.shape[0] + n_fn.shape[0], "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0], "tp": n_tp.shape[0], "fp": n_fp.shape[0], "fn": n_fn.shape[0], "unk": n_unk.shape[0], "ambi": n_ambi.shape[0] } if args.count_filtered_fn: r["fp.filtered"] = n_fp[ n_fp["FILTER"] != ""].shape[0] r["tp.filtered"] = n_tp[ n_tp["FILTER"] != ""].shape[0] r["unk.filtered"] = n_unk[ n_unk["FILTER"] != ""].shape[0] r["ambi.filtered"] = n_ambi[ n_ambi["FILTER"] != ""].shape[0] res = pandas.concat([res, pandas.DataFrame([r])]) if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0: roc_table_strat = 
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')

                        start = end
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        if not args.af_strat:
            res = res[(res["total.truth"] > 0)]

        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level

        recall = binomialCI(res["tp"], res["tp"] + res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"] + res["fp"], ci_alpha)

        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res["precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warn("No contigs in truth file")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warn("No contigs in query file")
                        h2_chrs = []

                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])
            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / \
                                        (res["tp"] - res["tp.filtered"] + res["fp"] - res["fp.filtered"])
            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]
            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res = res.replace([np.inf, -np.inf], 0)

        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        # save results
        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

        if args.happy_stats:
            # parse the saved feature table, as the one in memory has been updated
            featuretable = pandas.read_csv(args.output + ".features.csv",
                                           low_memory=False,
                                           dtype={"FILTER": str})

            # hap.py-style summary.csv
            summary = summary_from_featuretable(featuretable, args)
            summary.to_csv(args.output + ".summary.csv")

            # hap.py-style extended.csv
            if args.af_strat:
                extended = extended_from_featuretable(featuretable, args)
                extended.to_csv(args.output + ".extended.csv", index=False, na_rep="NA")
    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)
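

# standard script entry point (assumed here; not shown in the original excerpt)
if __name__ == '__main__':
    main()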