def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type """
    records = []

    if not avg_depth:
        logging.warn("No average depths available, normalized depth features cannot be calculated")

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    n_sample = "NORMAL"
    t_sample = "TUMOR"

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                tsn, t_sample))

    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.SSC", "I.GPV", "I.SPV",
                n_sample + "GT", t_sample + "GT",      # Genotype
                n_sample + "GQ", t_sample + "GQ",      # Genotype quality
                n_sample + "DP", t_sample + "DP",      # Read depth
                n_sample + "RD", t_sample + "RD",      # Reference depth
                n_sample + "AD", t_sample + "AD",      # Alternative depth
                n_sample + "FREQ", t_sample + "FREQ"]  # Alt. frequency (FA in MuTect)

    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        for q in [n_sample + "GT", t_sample + "GT"]:
            if q not in rec or rec[q] is None:
                rec[q] = "."
                if ("feat:" + q) not in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        # fix missing features
        for q in [n_sample + "GT", t_sample + "GT",
                  n_sample + "GQ", t_sample + "GQ",
                  n_sample + "DP", t_sample + "DP",
                  n_sample + "AD", t_sample + "AD",
                  n_sample + "RD", t_sample + "RD",
                  n_sample + "FREQ", t_sample + "FREQ"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True
            else:
                if q.endswith("FREQ"):
                    try:
                        rec[q] = float(rec[q])
                    except ValueError:
                        rec[q] = float("NaN")
                else:
                    try:
                        rec[q] = int(rec[q])
                    except ValueError:
                        rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warn("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        n_allele_ref_count = rec[n_sample + "RD"]
        alleles_alt = rec["ALT"]

        if alleles_alt == ['.']:
            n_allele_alt_count = 0
        else:
            n_allele_alt_count = rec[n_sample + "AD"]

        if n_allele_alt_count + n_allele_ref_count == 0:
            n_allele_rate = 0
        else:
            n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count)

        t_allele_ref_count = rec[t_sample + "RD"]
        alleles_alt = rec["ALT"]

        if alleles_alt == ['.']:
            t_allele_alt_count = 0
        else:
            t_allele_alt_count = rec[t_sample + "AD"]

        if t_allele_alt_count + t_allele_ref_count == 0:
            t_allele_rate = 0
        else:
            t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count)

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "SSC": rec["I.SSC"],
            "GPV": rec["I.GPV"],
            "SPV": rec["I.SPV"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_GQ": rec[n_sample + "GQ"],
            "T_GQ": rec[t_sample + "GQ"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_FA": rec[n_sample + "FREQ"],
            "T_FA": rec[t_sample + "FREQ"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag
        }
        records.append(qrec)

    cols = ["CHROM", "POS", "REF", "ALT", "FILTER",
            "SSC", "GPV", "SPV",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "N_GT", "T_GT", "N_GQ", "T_GQ",
            "N_AD", "T_AD", "N_FA", "T_FA",
            "N_ALT_RATE", "T_ALT_RATE", "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile", default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile", default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="fp_bedfile", default=None, type=str,
                        help="False positive / confident call regions (.bed or .bed.gz).")

    parser.add_argument("-r", "--reference", dest="ref", default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix", default=None,
                        help="Filename prefix for report output.")

    parser.add_argument("-V", "--write-vcf", dest="write_vcf", default=False, action="store_true",
                        help="Write an annotated VCF.")

    parser.add_argument("-B", "--write-bed", dest="write_bed", default=False, action="store_true",
                        help="Write a bed file with the haplotype blocks that were used.")

    parser.add_argument("-X", "--write-counts", dest="write_counts", default=True, action="store_true",
                        help="Write advanced counts and metrics.")

    parser.add_argument("--no-write-counts", dest="write_counts", default=True, action="store_false",
                        help="Do not write advanced counts and metrics.")

    parser.add_argument("--raw-counts", dest="raw_counts", default=False, action="store_true",
                        help="Count variants in unprocessed input VCFs and output as TOTAL.*.RAW.")

    parser.add_argument("--roc", dest="roc", default=False,
                        help="Select an INFO feature to produce a ROC on. This works best with "
                             "--no-internal-preprocessing and --no-internal-leftshift since these "
                             "flags preserve the most INFO flags from the input files.")

    parser.add_argument("--roc-filter", dest="roc_filter", default=False,
                        help="Select a filter to ignore when making ROCs.")

    parser.add_argument("--roc-reversed", dest="roc_reversed", default=False,
                        help="Change the meaning of the ROC feature to count the other way around (higher values=bad).")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false",
                        help="Keep scratch files instead of deleting them when done (for debugging).")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             "preprocessing to be switched on).")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Disable chr replacement for query (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Do not give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Switch on xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Switch on xcmp's internal VCF preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF preprocessing.")

    parser.add_argument("--match-raw", dest="int_match_raw", action="store_true", default=False,
                        help="Add a matching step in xcmp which also matches raw variant calls. This helps "
                             "when comparing files with very different representations.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given locations, "
                             "this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window", default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different haplotype "
                             "blocks.")

    parser.add_argument("--enumeration-threshold", dest="max_enum", default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand", default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")

    parser.add_argument("--threads", dest="threads", default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine", default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")

    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment).")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr.")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]

    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True
    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" %
                 ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                  "splitting" if args.int_preprocessing else "raw calls",
                  "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge.")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o.")

        if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o.")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here."
                         % args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here."
                         % args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = False
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            refids_found = 0
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower() and refid in v1_ref.lower() and refid in v2_ref.lower():
                    if args.verbose:
                        logging.info("Reference matches pattern: %s" % refid)
                    refids_found += 1
            if refids_found == 1:
                ref_check = True
        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         "accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes, as documented in the --location help
            args.locations = ["chr" + x for x in map(str, range(1, 23)) + ["X", "Y"]]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,  # pass_only
                          args.fixchr_truth,           # chrprefix
                          args.preprocessing_norm,     # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          not args.usefiltered,        # pass_only
                          args.fixchr_query,           # chrprefix
                          args.preprocessing_norm,     # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Query file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s"
                            % str(args.locations))

        args.locations = newlocations

        if args.threads > 1:
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth input file")

        if args.raw_counts:
            counts_truth = Haplo.quantify.run_quantify(args.vcf1,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h1["samples"][0],
                                                       locations=args.locations)
        else:
            counts_truth = None

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query input file")

        if args.raw_counts:
            counts_query = Haplo.quantify.run_quantify(args.vcf2,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h2["samples"][0],
                                                       locations=args.locations)
        else:
            counts_query = None

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]                       # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]  # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            bedfiles = [x[1] for x in res if x is not None and x[1] is not None]
            if args.write_bed and bedfiles:
                runme = " ".join(["cat"] +
                                 bedfiles +
                                 [">", args.reports_prefix.replace(" ", "\\ ") + ".blocks.bed"])
                logging.info("Concatenating block files: %s..." % runme)
                subprocess.check_call(runme, shell=True)

            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.write_counts:
            json_name = args.reports_prefix + ".counts.json"
        else:
            tf = tempfile.NamedTemporaryFile(delete=False,
                                             dir=args.scratch_prefix,
                                             prefix="counts.",
                                             suffix=".json")
            tf.close()
            json_name = tf.name

        logging.info("Counting variants...")

        counts = Haplo.quantify.run_quantify(output_name,
                                             json_name,
                                             args.reports_prefix + ".vcf.gz" if args.write_vcf else False,
                                             {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                             args.ref)

        df = pandas.DataFrame(counts)
        if args.write_counts:
            df.to_csv(args.reports_prefix + ".counts.csv")

        metrics_output = makeMetricsObject("hap.py.comparison")
        if args.write_counts:
            metrics_output["metrics"].append(dataframeToMetricsTable("raw.counts", df))

        # calculate precision / recall
        count_types = []
        if args.raw_counts:
            simplified_truth_counts = Haplo.quantify.simplify_counts(counts_truth, h1["samples"][0:1])
            simplified_query_counts = Haplo.quantify.simplify_counts(counts_query, h2["samples"][0:1])

            count_types += simplified_truth_counts.keys()
            count_types += simplified_query_counts.keys()
        else:
            simplified_truth_counts = None
            simplified_query_counts = None

        simplified_numbers = Haplo.quantify.simplify_counts(counts)

        count_types += simplified_numbers.keys()
        count_types = sorted(list(set(count_types)))

        for vtype in count_types:
            if vtype not in simplified_numbers:
                simplified_numbers[vtype] = {}

            simplified_numbers[vtype]["METRIC.Recall"] = 0
            simplified_numbers[vtype]["METRIC.Recall2"] = 0
            simplified_numbers[vtype]["METRIC.Precision"] = 0
            simplified_numbers[vtype]["METRIC.Frac_NA"] = 0

            try:
                simplified_numbers[vtype]["METRIC.Recall"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Recall2"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Precision"] = \
                    float(simplified_numbers[vtype]["QUERY.TP"]) / \
                    float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Frac_NA"] = \
                    float(simplified_numbers[vtype]["QUERY.UNK"]) / \
                    float(simplified_numbers[vtype]["QUERY.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["TRUTH.TOTAL.RAW"] = \
                    simplified_truth_counts[vtype][h1["samples"][0] + ".TOTAL"]
            except:
                pass

            try:
                simplified_numbers[vtype]["QUERY.TOTAL.RAW"] = \
                    simplified_query_counts[vtype][h2["samples"][0] + ".TOTAL"]
            except:
                pass

        pandas.set_option("display.width", 120)
        pandas.set_option("display.max_columns", 1000)
        df = pandas.DataFrame(simplified_numbers).transpose()

        vstring = "hap.py-%s" % Tools.version
        vstring += " " + " ".join(sys.argv)
        df.loc[vstring] = 0

        # for x in df:
        #     # everything not a metric is a count
        #     if not x.startswith("METRIC"):
        #         df[x] = df[x].astype("int64")

        df[["TRUTH.TOTAL", "QUERY.TOTAL",
            "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA"]].to_csv(args.reports_prefix + ".summary.csv")

        metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                                                 df[["TRUTH.TOTAL", "QUERY.TOTAL",
                                                                     "METRIC.Recall", "METRIC.Precision",
                                                                     "METRIC.Frac_NA"]]))

        if args.write_counts:
            df.to_csv(args.reports_prefix + ".extended.csv")
            metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

        essential_numbers = df[["TRUTH.TOTAL", "QUERY.TOTAL",
                                "METRIC.Recall", "METRIC.Precision", "METRIC.Frac_NA"]]

        pandas.set_option('display.max_columns', 500)
        pandas.set_option('display.width', 1000)

        essential_numbers = essential_numbers[essential_numbers.index.isin(
            ["Locations.SNP", "Locations.INDEL"])]

        logging.info("\n" + str(essential_numbers))

        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "Benchmarking Summary:"
            print str(essential_numbers)

        if args.roc:
            vcf = args.reports_prefix + ".vcf.gz"
            res = Haplo.happyroc.roc(vcf, args.roc, args.roc_filter, args.reports_prefix + ".roc", args.roc_reversed)

            for t in res.iterkeys():
                rocdf = pandas.read_table(res[t])
                metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, rocdf))

        with open(args.reports_prefix + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)
    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile", default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile", default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-r", "--reference", dest="ref", default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix", default=None,
                        help="Filename prefix for report output.")

    # DEPRECATED: we don't write bed files after 0.2.9
    parser.add_argument("-B", "--write-bed", dest="write_bed", default=False, action="store_true",
                        help="This option is deprecated. BED files will not be written anymore.")

    # add quantification args
    qfy.updateArgs(parser)

    parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false",
                        help="Keep scratch files instead of deleting them when done (for debugging).")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             "preprocessing to be switched on).")

    parser.add_argument("-N", "--numeric-chromosomes", dest="numeric_chrs", action="store_true", default=None,
                        help="Use numeric chromosome names for truth and query. This is a shortcut for "
                             "-l 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y "
                             "--no-fixchr-truth --no-fixchr-query")

    parser.add_argument("-C", "--no-numeric-chromosomes", dest="numeric_chrs", action="store_false",
                        help="Use chr-prefixed chromosome names for truth and query. This is a shortcut for "
                             "-l chr1,...,chrY "
                             "--fixchr-truth --fixchr-query")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Disable chr replacement for query (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Do not give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Switch on xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Switch on xcmp's internal VCF preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF preprocessing.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given locations, "
                             "this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window", default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different haplotype "
                             "blocks.")

    parser.add_argument("--enumeration-threshold", dest="max_enum", default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand", default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")

    parser.add_argument("--threads", dest="threads", default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine", default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")

    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment).")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr.")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]

    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.write_bed:
        logging.warn("The -B / --write-bed switches are deprecated in versions 0.2.9+. "
                     "BED files will not be written anymore.")

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True
    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" %
                 ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                  "splitting" if args.int_preprocessing else "raw calls",
                  "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge.")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o.")

        if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o.")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        # detect numeric chromosome names
        if args.numeric_chrs is None:
            cts = fastaContigLengths(args.ref)
            cts = set(cts.keys())
            numeric_names = set(map(str, range(1, 23)) + ["X", "Y", "M"])
            non_numeric_names = set(["chr" + x for x in numeric_names])
            numeric_names &= cts
            non_numeric_names &= cts
            numeric_names = len(list(numeric_names))
            non_numeric_names = len(list(non_numeric_names))
            if numeric_names != 0 and non_numeric_names == 0:
                args.numeric_chrs = True
                logging.info("Auto-detected numeric chromosome names")
            elif numeric_names == 0 and non_numeric_names != 0:
                args.numeric_chrs = False
                logging.info("Auto-detected chr-prefixed chromosome names")

        if args.numeric_chrs:
            args.fixchr_truth = False
            args.fixchr_query = False
        elif args.numeric_chrs is not None:
            args.fixchr_truth = True
            args.fixchr_query = True

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here."
                         % args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here."
                         % args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = True
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            rids_vh = set()
            rids_v1 = set()
            rids_v2 = set()
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower():
                    rids_vh.add(refid)
                if refid in v1_ref.lower():
                    rids_v1.add(refid)
                if refid in v2_ref.lower():
                    rids_v2.add(refid)

            rids_v1 = sorted(list(rids_v1))
            rids_v2 = sorted(list(rids_v2))
            rids_vh = sorted(list(rids_vh))

            to_cmp = None
            if rids_v1:
                to_cmp = rids_v1
            if rids_v2:
                to_cmp = rids_v2
            if rids_vh:
                to_cmp = rids_vh
            if to_cmp and rids_v1 and rids_v1 != to_cmp:
                ref_check = False
            if to_cmp and rids_v2 and rids_v2 != to_cmp:
                ref_check = False
            if to_cmp and rids_vh and rids_vh != to_cmp:
                ref_check = False
        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         "accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes, as documented in the --location help
            if args.numeric_chrs:
                args.locations = map(str, range(1, 23)) + ["X", "Y"]
            else:
                args.locations = ["chr" + x for x in map(str, range(1, 23)) + ["X", "Y"]]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        # HAP-143 fix the case where no chromosomes are in truth or query
        try:
            if not h1["tabix"]["chromosomes"]:
                h1["tabix"]["chromosomes"] = []
        except:
            pass
        try:
            if not h2["tabix"]["chromosomes"]:
                h2["tabix"]["chromosomes"] = []
        except:
            pass

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            logging.info(str(h1["tabix"]))
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])

            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))

            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,  # pass_only
                          args.fixchr_truth,           # chrprefix
                          args.preprocessing_norm,     # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          False,                       # query filters are handled further down in matching
                          args.fixchr_query,           # chrprefix
                          args.preprocessing_norm,     # norm
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Query file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s"
                            % str(args.locations))

        args.locations = newlocations

        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]                       # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]  # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)

            # passed to quantify
            args.type = "xcmp"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        logging.info("Counting variants...")

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)
    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
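
# Example for the main() variant above (a sketch; file names are hypothetical):
# with a GRCh37-style reference that uses numeric contig names, -N skips the
# chromosome-name autodetection and disables chr-prefix fixing for both inputs:
#
#   hap.py truth.vcf.gz query.vcf.gz -r GRCh37.fa -o /tmp/test -N
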
def main(): parser = argparse.ArgumentParser("Haplotype Comparison") # input parser.add_argument("-v", "--version", dest="version", action="store_true", help="Show version number and exit.") parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.") # output parser.add_argument("-o", "--report-prefix", dest="reports_prefix", default=None, help="Filename prefix for report output.") parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None, help="Directory for scratch files.") parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false", help="Filename prefix for scratch report output.") # add quantification args qfy.updateArgs(parser) # control preprocessing pre.updateArgs(parser) parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False, help="Preprocess truth file with same settings as query (default is to accept truth in original format).") parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False, help="Preprocess truth file with same settings as query (default is to accept truth in original format).") parser.add_argument("--preprocessing-window-size", dest="preprocess_window", default=10000, type=int, help="Preprocessing window size (variants further apart than that size are not expected to interfere).") # detailed control of comparison parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False, help="Disable haplotype comparison (only count direct GT matches as TP).") parser.add_argument("-w", "--window-size", dest="window", default=50, type=int, help="Minimum distance between variants such that they fall into the same superlocus.") # xcmp-specific stuff parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum", default=16768, type=int, help="Enumeration threshold / maximum number of sequences to enumerate per block.") parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand", default=30, type=int, help="Expand haplotype blocks by this many basepairs left and right.") parser.add_argument("--threads", dest="threads", default=multiprocessing.cpu_count(), type=int, help="Number of threads to use.") parser.add_argument("--engine", dest="engine", default="xcmp", choices=["xcmp", "vcfeval"], help="Comparison engine to use.") parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False, default=Haplo.vcfeval.findVCFEval(), help="This parameter should give the path to the \"rtg\" executable. " "The default is %s" % Haplo.vcfeval.findVCFEval()) parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False, help="Vcfeval needs the reference sequence formatted in its own file format " "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here " "to save time when running hap.py with vcfeval. If no SDF folder is " "specified, hap.py will create a temporary one.") if Tools.has_sge: parser.add_argument("--force-interactive", dest="force_interactive", default=False, action="store_true", help="Force running interactively (i.e. 
when JOB_ID is not in the environment)") parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*") parser.add_argument("--logfile", dest="logfile", default=None, help="Write logging information into file rather than to stderr") verbosity_options = parser.add_mutually_exclusive_group(required=False) verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true", help="Raise logging level from warning to info.") verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true", help="Set logging level to output errors only.") args, unknown_args = parser.parse_known_args() if not Tools.has_sge: args.force_interactive = True if args.verbose: loglevel = logging.INFO elif args.quiet: loglevel = logging.ERROR else: loglevel = logging.WARNING # reinitialize logging for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig(filename=args.logfile, format='%(asctime)s %(levelname)-8s %(message)s', level=loglevel) # remove some safe unknown args unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]] if len(sys.argv) < 2 or len(unknown_args) > 0: if unknown_args: logging.error("Unknown arguments specified : %s " % str(unknown_args)) parser.print_help() exit(1) if args.version: print "Hap.py %s" % Tools.version exit(0) if args.roc: args.write_vcf = True # sanity-check regions bed file (HAP-57) if args.regions_bedfile: logging.info("Checking input regions.") if bedOverlapCheck(args.regions_bedfile): raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp." " You can either use -T, or run the file through bedtools merge") if args.fp_bedfile and not os.path.exists(args.fp_bedfile): raise Exception("FP/confident call region bed file does not exist.") if not args.force_interactive and "JOB_ID" not in os.environ: parser.print_help() raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.") if not args.ref: args.ref = Tools.defaultReference() if not os.path.exists(args.ref): raise Exception("Please specify a valid reference path using -r.") if not args.reports_prefix: raise Exception("Please specify an output prefix using -o ") if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))): raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o") if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix): raise Exception("The output path should specify a file name prefix. Please specify a valid output path " "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .") # noinspection PyProtectedMember if not args._vcfs or len(args._vcfs) != 2: raise Exception("Please specify exactly two input VCFs.") # noinspection PyProtectedMember args.vcf1 = args._vcfs[0] # noinspection PyProtectedMember args.vcf2 = args._vcfs[1] if not os.path.exists(args.vcf1): raise Exception("Input file %s does not exist." % args.vcf1) if not os.path.exists(args.vcf2): raise Exception("Input file %s does not exist." 
% args.vcf2) tempfiles = [] # xcmp supports bcf; others don't if args.engine == "xcmp" and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))): internal_format_suffix = ".bcf" else: internal_format_suffix = ".vcf.gz" try: logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2)) logging.info("Preprocessing truth: %s" % args.vcf1) starttime = time.time() ttf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="truth.pp", suffix=internal_format_suffix) ttf.close() tempfiles.append(ttf.name) tempfiles.append(ttf.name + ".csi") tempfiles.append(ttf.name + ".tbi") pre.preprocess(args.vcf1, ttf.name, args.ref, args.locations, None if args.usefiltered_truth else "*", # filters args.fixchr, args.regions_bedfile, args.targets_bedfile, args.preprocessing_leftshift if args.preprocessing_truth else False, args.preprocessing_decompose if args.preprocessing_truth else False, args.preprocessing_norm if args.preprocessing_truth else False, args.preprocess_window, args.threads) args.vcf1 = ttf.name h1 = vcfextract.extractHeadersJSON(args.vcf1) elapsed = time.time() - starttime logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed)) # once we have preprocessed the truth file we can resolve the locations # doing this here improves the time for query preprocessing below reference_contigs = set(fastaContigLengths(args.ref).keys()) if not args.locations: # default set of locations is the overlap between truth and reference args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"])) if not args.locations: raise Exception("Truth and reference have no chromosomes in common!") elif type(args.locations) is not list: args.locations = [args.locations] args.locations = sorted(args.locations) logging.info("Preprocessing query: %s" % args.vcf2) starttime = time.time() if args.pass_only: filtering = "*" else: filtering = args.filters_only qtf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="query.pp", suffix=internal_format_suffix) qtf.close() tempfiles.append(qtf.name) tempfiles.append(qtf.name + ".csi") tempfiles.append(qtf.name + ".tbi") pre.preprocess(args.vcf2, qtf.name, args.ref, str(",".join(args.locations)), filtering, args.fixchr, args.regions_bedfile, args.targets_bedfile, args.preprocessing_leftshift, args.preprocessing_decompose, args.preprocessing_norm, args.preprocess_window, args.threads) args.vcf2 = qtf.name h2 = vcfextract.extractHeadersJSON(args.vcf2) elapsed = time.time() - starttime logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed)) if not h1["tabix"]: raise Exception("Truth file is not indexed after preprocessing.") if not h2["tabix"]: raise Exception("Query file is not indexed after preprocessing.") for _xc in args.locations: if _xc not in h2["tabix"]["chromosomes"]: logging.warn("No calls for location %s in query!" % _xc) pool = getPool(args.threads) if args.threads > 1 and args.engine == "xcmp": logging.info("Running using %i parallel processes." 
% args.threads) # find balanced pieces # cap parallelism at 64 since otherwise bcftools concat below might run out # of file handles args.pieces = min(args.threads, 64) res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args) if None in res: raise Exception("One of the blocksplit processes failed.") tempfiles += res args.locations = [] for f in res: with open(f) as fp: for l in fp: ll = l.strip().split("\t", 3) if len(ll) < 3: continue xchr = ll[0] start = int(ll[1]) + 1 end = int(ll[2]) args.locations.append("%s:%i-%i" % (xchr, start, end)) # count variants before normalisation if "samples" not in h1 or not h1["samples"]: raise Exception("Cannot read sample names from truth VCF file") if "samples" not in h2 or not h2["samples"]: raise Exception("Cannot read sample names from query VCF file") tf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="hap.py.result.", suffix=internal_format_suffix) tf.close() tempfiles.append(tf.name) tempfiles.append(tf.name + ".tbi") tempfiles.append(tf.name + ".csi") output_name = tf.name if args.engine == "xcmp": # do xcmp logging.info("Using xcmp for comparison") res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args) tempfiles += [x for x in res if x is not None] # VCFs if None in res: raise Exception("One of the xcmp jobs failed.") if len(res) == 0: raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).") # concatenate + index logging.info("Concatenating variants...") runme_list = [x for x in res if x is not None] if len(runme_list) == 0: raise Exception("No outputs to concatenate!") logging.info("Concatenating...") bcftools.concatenateParts(output_name, *runme_list) logging.info("Indexing...") bcftools.runBcftools("index", output_name) # passed to quantify args.type = "xcmp" # xcmp extracts whichever field we're using into the QQ info field args.roc = "IQQ" elif args.engine == "vcfeval": tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args) # passed to quantify args.type = "ga4gh" else: raise Exception("Unknown comparison engine: %s" % args.engine) args.in_vcf = [output_name] args.runner = "hap.py" qfy.quantify(args) finally: if args.delete_scratch: for x in tempfiles: try: os.remove(x) except: pass else: logging.info("Scratch files kept : %s" % (str(tempfiles)))
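# The loop above converts blocksplit's BED output (0-based, half-open intervals) into the
# 1-based, inclusive "chrom:start-end" region strings that xcmp consumes. A self-contained
# sketch of that conversion; bed_to_regions is a hypothetical helper, not part of hap.py.
def bed_to_regions(bed_path):
    """Yield 1-based inclusive region strings from a BED file."""
    with open(bed_path) as fp:
        for line in fp:
            fields = line.strip().split("\t", 3)
            if len(fields) < 3:  # skip malformed or empty lines, as main() does
                continue
            yield "%s:%i-%i" % (fields[0], int(fields[1]) + 1, int(fields[2]))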
def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type """ records = [] if not avg_depth: logging.warn( "No average depths available, normalized depth features cannot be calculated" ) hdrs = extractHeadersJSON(vcfname) # TODO could figure this out automatically nsn = "NORMAL" tsn = "TUMOR" n_sample = "S.1." t_sample = "S.2." logging.info( "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample, tsn, t_sample)) features = [ "CHROM", "POS", "REF", "ALT", "FILTER", "I.SSC", "I.GPV", "I.SPV", n_sample + "GT", t_sample + "GT", # Genotype n_sample + "GQ", t_sample + "GQ", # Genotype quality n_sample + "DP", t_sample + "DP", # Read depth n_sample + "RD", t_sample + "RD", # Reference depth n_sample + "AD", t_sample + "AD", # Alternative depth n_sample + "FREQ", t_sample + "FREQ" # Alt. frequency (FA in MuTect) ] has_warned = {} for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] for q in [n_sample + "GT", t_sample + "GT"]: if not q in rec or rec[q] is None: rec[q] = "." if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True # fix missing features for q in [ n_sample + "GT", t_sample + "GT", n_sample + "GQ", t_sample + "GQ", n_sample + "DP", t_sample + "DP", n_sample + "AD", t_sample + "AD", n_sample + "RD", t_sample + "RD", n_sample + "FREQ", t_sample + "FREQ" ]: if not q in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True else: if q.endswith("FREQ"): try: rec[q] = float(rec[q]) except ValueError: rec[q] = float("NaN") else: try: rec[q] = int(rec[q]) except ValueError: rec[q] = -1 rec["tag"] = tag n_DP = float(rec[n_sample + "DP"]) t_DP = float(rec[t_sample + "DP"]) n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif not "DPnorm" in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True n_allele_ref_count = rec[n_sample + "RD"] alleles_alt = rec["ALT"] if alleles_alt == ['.']: n_allele_alt_count = 0 else: n_allele_alt_count = rec[n_sample + "AD"] if n_allele_alt_count + n_allele_ref_count == 0: n_allele_rate = 0 else: n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count) t_allele_ref_count = rec[t_sample + "RD"] alleles_alt = rec["ALT"] if alleles_alt == ['.']: t_allele_alt_count = 0 else: t_allele_alt_count = rec[t_sample + "AD"] if t_allele_alt_count + t_allele_ref_count == 0: t_allele_rate = 0 else: t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count) # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "FILTER": ",".join(rec["FILTER"]), "SSC": rec["I.SSC"], "GPV": rec["I.GPV"], "SPV": rec["I.SPV"], "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE": n_DP_ratio, "T_DP_RATE": t_DP_ratio, "N_GT": rec[n_sample + "GT"], "T_GT": rec[t_sample + "GT"], "N_GQ": rec[n_sample + "GQ"], "T_GQ": rec[t_sample + "GQ"], "N_AD": rec[n_sample + "AD"], "T_AD": rec[t_sample + "AD"], "N_FA": rec[n_sample + "FREQ"], "T_FA": rec[t_sample + "FREQ"], "N_ALT_RATE": n_allele_rate, 
"T_ALT_RATE": t_allele_rate, "tag": tag } records.append(qrec) cols = [ "CHROM", "POS", "REF", "ALT", "FILTER", "SSC", "GPV", "SPV", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_GQ", "T_GQ", "N_AD", "T_AD", "N_FA", "T_FA", "N_ALT_RATE", "T_ALT_RATE", "tag" ] if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
def partialCredit(vcfname, outputname, reference, locations, threads=1, window=10000, leftshift=True, decompose=True): """ Partial-credit-process a VCF file according to our args """ pool = getPool(int(threads)) if threads > 1: logging.info("Partial credit processing uses %i parallel processes." % threads) if not locations: h = extractHeadersJSON(vcfname) if not h["tabix"]["chromosomes"]: logging.warn("Empty input or not tabix indexed") if outputname.endswith(".bcf"): runBcftools("view", "-O", "b", "-o", outputname, vcfname) runBcftools("index", outputname) else: runBcftools("view", "-O", "z", "-o", outputname, vcfname) runBcftools("index", "-t", outputname) # just return the same file return locations = h["tabix"]["chromosomes"] elif type(locations) is str or type(locations) is unicode: locations = locations.split(",") # use blocksplit to subdivide input res = runParallel( pool, blocksplitWrapper, locations, {"vcf": vcfname, "dist": window, "pieces": min(40, threads * 4)} ) if None in res: raise Exception("One of the blocksplit processes failed.") locations = list(itertools.chain.from_iterable(res)) if not len(locations): logging.warn("Blocksplit returned no blocks. This can happen when " "an input contains no valid variants.") locations = [""] else: locations = [""] res = [] try: res = runParallel( pool, preprocessWrapper, itertools.izip(itertools.repeat(vcfname), locations), { "reference": reference, "decompose": decompose, "leftshift": leftshift, "bcf": outputname.endswith(".bcf"), }, ) if None in res: raise Exception("One of the preprocess jobs failed") if not res: raise Exception("No blocks were processed. List of locations: %s" % str(list(locations))) concatenateParts(outputname, *res) if outputname.endswith(".vcf.gz"): runBcftools("index", "-t", outputname) else: # use bcf runBcftools("index", outputname) finally: for r in res: try: os.unlink(r) except: pass try: os.unlink(r + ".tbi") except: pass try: os.unlink(r + ".csi") except: pass
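# A minimal sketch of calling partialCredit directly; the file names are hypothetical.
# locations=None lets the function read the block list from the tabix index, and the
# output format follows the output extension (.bcf vs .vcf.gz).
def _example_partial_credit():
    """Illustrative only -- left-shift and decompose a query VCF with 4 workers."""
    partialCredit("query.vcf.gz", "normalized.vcf.gz", "genome.fa", None,
                  threads=4, window=10000, leftshift=True, decompose=True)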
def extractMutectSNVFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type """ records = [] if not avg_depth: logging.warn("No average depths available, normalized depth features cannot be calculated") hdrs = extractHeadersJSON(vcfname) tsn = "" nsn = "" t_sample = "S.1." n_sample = "S.2." try: samples = hdrs["samples"] for f in hdrs["fields"]: if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect": clopts = f["values"]["CommandLineOptions"] # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal m = re.search("tumor_sample_name=([^\s]+)", clopts) if m: tsn = m.group(1) for i, x in enumerate(samples): if x == tsn: t_sample = "S.%i." % (i+1) break m = re.search("normal_sample_name=([^\s]+)", clopts) if m: nsn = m.group(1) for i, x in enumerate(samples): if x == nsn: n_sample = "S.%i." % (i+1) break except: logging.warn("Unable to detect tumour / normal sample order from VCF header") logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample, tsn, t_sample)) features = ["CHROM", "POS", "REF", "ALT", "FILTER", "I.DB", "I.TLOD", "I.NLOD", "I.ECNT", "I.HCNT", "I.MAX_ED", "I.MIN_ED", n_sample + "GT", t_sample + "GT", n_sample + "DP", t_sample + "DP", n_sample + "QSS", t_sample + "QSS", n_sample + "AD", t_sample + "AD"] has_warned = {} for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] for q in [n_sample + "GT", t_sample + "GT"]: if not q in rec or rec[q] is None: rec[q] = "." if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True # fix missing features for q in ["I.DB", "I.TLOD", "I.NLOD", "I.ECNT", "I.HCNT", "I.MAX_ED", "I.MIN_ED", n_sample + "GT", t_sample + "GT", n_sample + "DP", t_sample + "DP", n_sample + "QSS", t_sample + "QSS", n_sample + "AD", t_sample + "AD"]: if not q in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True else: # list features if q.endswith("AD") or q.endswith("QSS"): if type(rec[q]) is not list: if not (q + "_PARSE_FAIL") in has_warned: logging.warn("Cannot parse %s: %s" % (q, str(rec[q]))) has_warned[q + "_PARSE_FAIL"] = True rec[q] = [0] * (1 + len(rec["ALT"])) for xx in range(0, 1 + len(rec["ALT"])): if len(rec[q]) <= xx: rec[q].append(0) else: try: rec[q][xx] = float(rec[q][xx]) except ValueError: rec[q][xx] = 0 else: try: rec[q] = int(rec[q]) except ValueError: rec[q] = -1 rec["tag"] = tag TLOD = float(rec["I.TLOD"]) NLOD = float(rec["I.NLOD"]) n_DP = float(rec[n_sample + "DP"]) t_DP = float(rec[t_sample + "DP"]) n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP/float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP/float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif not "DPnorm" in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True n_allele_ref_count = rec[n_sample + "AD"][0] alleles_alt = rec["ALT"] if alleles_alt == ['.']: n_allele_alt_count = 0 else: n_allele_alt_count = 0 for a in xrange(0, len(alleles_alt)): n_allele_alt_count += float(rec[n_sample + "AD"][a + 1]) if n_allele_alt_count + n_allele_ref_count == 0: n_allele_rate = 0 else: n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count) t_allele_ref_count = 
rec[t_sample + "AD"][0] alleles_alt = rec["ALT"] if alleles_alt == ['.']: t_allele_alt_count = 0 else: t_allele_alt_count = 0 for a in xrange(0, len(alleles_alt)): t_allele_alt_count += float(rec[t_sample + "AD"][a + 1]) if t_allele_alt_count + t_allele_ref_count == 0: t_allele_rate = 0 else: t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count) # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "FILTER": ",".join(rec["FILTER"]), "DBSNP": rec["I.DB"], "TLOD": TLOD, "NLOD": NLOD, "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE" : n_DP_ratio, "T_DP_RATE" : t_DP_ratio, "N_GT": rec[n_sample + "GT"], "T_GT": rec[t_sample + "GT"], "N_AD": rec[n_sample + "AD"], "T_AD": rec[t_sample + "AD"], "N_QSS": rec[n_sample + "QSS"], "T_QSS": rec[t_sample + "QSS"], "N_AF": n_allele_rate, "T_AF": t_allele_rate, "ECNT": rec["I.ECNT"], "HCNT": rec["I.HCNT"], "MAX_ED": rec["I.MAX_ED"], "MIN_ED": rec["I.MIN_ED"], "tag" : tag } records.append(qrec) cols = ["CHROM", "POS", "REF", "ALT", "FILTER", "TLOD", "NLOD", "DBSNP", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_AD", "T_AD", "N_QSS", "T_QSS", "N_AF", "T_AF", "tag"] if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
def main(): parser = argparse.ArgumentParser("Haplotype Comparison") # input parser.add_argument("-v", "--version", dest="version", action="store_true", help="Show version number and exit.") parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.") # output parser.add_argument("-o", "--report-prefix", dest="reports_prefix", default=None, help="Filename prefix for report output.") parser.add_argument("--scratch-prefix", dest="scratch_prefix", default=None, help="Directory for scratch files.") parser.add_argument("--keep-scratch", dest="delete_scratch", default=True, action="store_false", help="Keep scratch files (do not delete them when the run finishes).") # add quantification args qfy.updateArgs(parser) # control preprocessing pre.updateArgs(parser) parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False, help="Preprocess truth file with same settings as query (default is to accept truth in original format).") parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False, help="Use filtered variant calls in truth file (by default, only PASS calls in the truth file are used)") parser.add_argument("--preprocessing-window-size", dest="preprocess_window", default=10000, type=int, help="Preprocessing window size (variants further apart than that size are not expected to interfere).") parser.add_argument("--adjust-conf-regions", dest="preprocessing_truth_confregions", action="store_true", default=True, help="Adjust confident regions to include variant locations.") parser.add_argument("--no-adjust-conf-regions", dest="preprocessing_truth_confregions", action="store_false", help="Do not adjust confident regions to include variant locations.") # detailed control of comparison parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False, help="Disable haplotype comparison (only count direct GT matches as TP).") parser.add_argument("-w", "--window-size", dest="window", default=50, type=int, help="Minimum distance between variants such that they fall into the same superlocus.") # xcmp-specific stuff parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum", default=16768, type=int, help="Enumeration threshold / maximum number of sequences to enumerate per block.") parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand", default=30, type=int, help="Expand haplotype blocks by this many basepairs left and right.") parser.add_argument("--threads", dest="threads", default=multiprocessing.cpu_count(), type=int, help="Number of threads to use.") parser.add_argument("--engine", dest="engine", default="xcmp", choices=["xcmp", "vcfeval", "scmp-somatic"], help="Comparison engine to use.") parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False, default=Haplo.vcfeval.findVCFEval(), help="This parameter should give the path to the \"rtg\" executable. " "The default is %s" % Haplo.vcfeval.findVCFEval()) parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False, help="Vcfeval needs the reference sequence formatted in its own file format " "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here " "to save time when running hap.py with vcfeval. 
If no SDF folder is " "specified, hap.py will create a temporary one.") if Tools.has_sge: parser.add_argument("--force-interactive", dest="force_interactive", default=False, action="store_true", help="Force running interactively (i.e. when JOB_ID is not in the environment)") parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*") parser.add_argument("--logfile", dest="logfile", default=None, help="Write logging information into file rather than to stderr") verbosity_options = parser.add_mutually_exclusive_group(required=False) verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true", help="Raise logging level from warning to info.") verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true", help="Set logging level to output errors only.") args, unknown_args = parser.parse_known_args() if not Tools.has_sge: args.force_interactive = True if args.verbose: loglevel = logging.INFO elif args.quiet: loglevel = logging.ERROR else: loglevel = logging.WARNING # reinitialize logging for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig(filename=args.logfile, format='%(asctime)s %(levelname)-8s %(message)s', level=loglevel) # remove some safe unknown args unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]] if len(sys.argv) < 2 or len(unknown_args) > 0: if unknown_args: logging.error("Unknown arguments specified : %s " % str(unknown_args)) parser.print_help() exit(1) if args.version: print "Hap.py %s" % Tools.version exit(0) if args.roc: args.write_vcf = True # sanity-check regions bed file (HAP-57) if args.regions_bedfile: logging.info("Checking input regions.") if bedOverlapCheck(args.regions_bedfile): raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp." " You can either use -T, or run the file through bedtools merge") if args.fp_bedfile and not os.path.exists(args.fp_bedfile): raise Exception("FP/confident call region bed file does not exist.") if not args.force_interactive and "JOB_ID" not in os.environ: parser.print_help() raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.") if not args.ref: args.ref = Tools.defaultReference() if not os.path.exists(args.ref): raise Exception("Please specify a valid reference path using -r.") if not args.reports_prefix: raise Exception("Please specify an output prefix using -o ") if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))): raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o") if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix): raise Exception("The output path should specify a file name prefix. Please specify a valid output path " "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .") # noinspection PyProtectedMember if not args._vcfs or len(args._vcfs) != 2: raise Exception("Please specify exactly two input VCFs.") # noinspection PyProtectedMember args.vcf1 = args._vcfs[0] # noinspection PyProtectedMember args.vcf2 = args._vcfs[1] if not os.path.exists(args.vcf1): raise Exception("Input file %s does not exist." % args.vcf1) if not os.path.exists(args.vcf2): raise Exception("Input file %s does not exist." 
% args.vcf2) tempfiles = [] # turn on allele conversion if args.engine == "scmp-somatic" and args.somatic_allele_conversion == False: args.somatic_allele_conversion = True # somatic allele conversion should also switch off decomposition if args.somatic_allele_conversion == True and "--decompose" not in sys.argv: args.preprocessing_decompose = False # xcmp/scmp support bcf; others don't if args.engine in ["xcmp", "scmp-somatic"] and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))): internal_format_suffix = ".bcf" else: internal_format_suffix = ".vcf.gz" try: logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2)) logging.info("Preprocessing truth: %s" % args.vcf1) starttime = time.time() ttf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="truth.pp", suffix=internal_format_suffix) ttf.close() if args.engine.endswith("somatic") and \ args.preprocessing_truth and \ (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose): args.preprocessing_truth = False logging.info("Turning off pre.py preprocessing for somatic comparisons") tempfiles.append(ttf.name) tempfiles.append(ttf.name + ".csi") tempfiles.append(ttf.name + ".tbi") args.gender = pre.preprocess(args.vcf1, ttf.name, args.ref, args.locations, None if args.usefiltered_truth else "*", # filters args.fixchr, args.regions_bedfile, args.targets_bedfile, args.preprocessing_leftshift if args.preprocessing_truth else False, args.preprocessing_decompose if args.preprocessing_truth else False, args.preprocessing_norm if args.preprocessing_truth else False, args.preprocess_window, args.threads, args.gender, args.somatic_allele_conversion) args.vcf1 = ttf.name if args.fp_bedfile and args.preprocessing_truth_confregions: conf_temp = Haplo.gvcf2bed.gvcf2bed(args.vcf1, args.ref, args.fp_bedfile, args.scratch_prefix) tempfiles.append(conf_temp) args.strat_regions.append("CONF_VARS:" + conf_temp) h1 = vcfextract.extractHeadersJSON(args.vcf1) elapsed = time.time() - starttime logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed)) # once we have preprocessed the truth file we can resolve the locations # doing this here improves the time for query preprocessing below reference_contigs = set(fastaContigLengths(args.ref).keys()) if not args.locations: # default set of locations is the overlap between truth and reference args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"])) if not args.locations: raise Exception("Truth and reference have no chromosomes in common!") elif type(args.locations) is not list: args.locations = args.locations.split(",") args.locations = sorted(args.locations) logging.info("Preprocessing query: %s" % args.vcf2) starttime = time.time() if args.pass_only: filtering = "*" else: filtering = args.filters_only qtf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="query.pp", suffix=internal_format_suffix) qtf.close() tempfiles.append(qtf.name) tempfiles.append(qtf.name + ".csi") tempfiles.append(qtf.name + ".tbi") if args.engine.endswith("somatic") and \ (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose): args.preprocessing_leftshift = False args.preprocessing_norm = False args.preprocessing_decompose = False logging.info("Turning off pre.py preprocessing (query) for somatic comparisons") pre.preprocess(args.vcf2, qtf.name, args.ref, str(",".join(args.locations)), filtering, args.fixchr, args.regions_bedfile, args.targets_bedfile, 
args.preprocessing_leftshift, args.preprocessing_decompose, args.preprocessing_norm, args.preprocess_window, args.threads, args.gender, args.somatic_allele_conversion) # same gender as truth above args.vcf2 = qtf.name h2 = vcfextract.extractHeadersJSON(args.vcf2) elapsed = time.time() - starttime logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed)) if not h1["tabix"]: raise Exception("Truth file is not indexed after preprocessing.") if not h2["tabix"]: raise Exception("Query file is not indexed after preprocessing.") for _xc in args.locations: if _xc not in h2["tabix"]["chromosomes"]: logging.warn("No calls for location %s in query!" % _xc) pool = getPool(args.threads) if args.threads > 1 and args.engine == "xcmp": logging.info("Running using %i parallel processes." % args.threads) # find balanced pieces # cap parallelism at 64 since otherwise bcftools concat below might run out # of file handles args.pieces = min(args.threads, 64) res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args) if None in res: raise Exception("One of the blocksplit processes failed.") tempfiles += res args.locations = [] for f in res: with open(f) as fp: for l in fp: ll = l.strip().split("\t", 3) if len(ll) < 3: continue xchr = ll[0] start = int(ll[1]) + 1 end = int(ll[2]) args.locations.append("%s:%i-%i" % (xchr, start, end)) # count variants before normalisation if "samples" not in h1 or not h1["samples"]: raise Exception("Cannot read sample names from truth VCF file") if "samples" not in h2 or not h2["samples"]: raise Exception("Cannot read sample names from query VCF file") tf = tempfile.NamedTemporaryFile(delete=False, dir=args.scratch_prefix, prefix="hap.py.result.", suffix=internal_format_suffix) tf.close() tempfiles.append(tf.name) tempfiles.append(tf.name + ".tbi") tempfiles.append(tf.name + ".csi") output_name = tf.name if args.engine == "xcmp": # do xcmp logging.info("Using xcmp for comparison") res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args) tempfiles += [x for x in res if x is not None] # VCFs if None in res: raise Exception("One of the xcmp jobs failed.") if len(res) == 0: raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).") # concatenate + index logging.info("Concatenating variants...") runme_list = [x for x in res if x is not None] if len(runme_list) == 0: raise Exception("No outputs to concatenate!") logging.info("Concatenating...") bcftools.concatenateParts(output_name, *runme_list) logging.info("Indexing...") bcftools.runBcftools("index", output_name) # passed to quantify args.type = "xcmp" # xcmp extracts whichever field we're using into the QQ info field args.roc_header = args.roc args.roc = "IQQ" elif args.engine == "vcfeval": tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args) # passed to quantify args.type = "ga4gh" elif args.engine == "scmp-somatic": tempfiles += Haplo.scmp.runSCmp(args.vcf1, args.vcf2, output_name, args) # passed to quantify args.type = "ga4gh" else: raise Exception("Unknown comparison engine: %s" % args.engine) args.in_vcf = [output_name] args.runner = "hap.py" qfy.quantify(args) finally: if args.delete_scratch: for x in tempfiles: try: os.remove(x) except: pass else: logging.info("Scratch files kept : %s" % (str(tempfiles)))
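# For somatic comparisons, main() above switches off pre.py's left-shifting, normalisation
# and decomposition, because scmp-somatic compares converted alleles rather than rewritten
# genotypes. A condensed, illustrative view of that decision; the function name is
# hypothetical and the flags mirror the argparse destinations used above.
def _somatic_preprocess_flags(engine, leftshift, norm, decompose):
    """Return (leftshift, norm, decompose) as adjusted for somatic engines."""
    if engine.endswith("somatic") and (leftshift or norm or decompose):
        return False, False, False
    return leftshift, norm, decompose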
def extractMutectIndelFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type """ records = [] if not avg_depth: logging.warn("No average depths available, normalized depth features cannot be calculated") hdrs = extractHeadersJSON(vcfname) tsn = "" nsn = "" t_sample = "S.1." n_sample = "S.2." try: samples = hdrs["samples"] for f in hdrs["fields"]: if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect": clopts = f["values"]["CommandLineOptions"] # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal m = re.search("tumor_sample_name=([^\s]+)", clopts) if m: tsn = m.group(1) for i, x in enumerate(samples): if x == tsn: t_sample = "S.%i." % (i+1) break m = re.search("normal_sample_name=([^\s]+)", clopts) if m: nsn = m.group(1) for i, x in enumerate(samples): if x == nsn: n_sample = "S.%i." % (i+1) break except: logging.warn("Unable to detect tumour / normal sample order from VCF header") logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample, tsn, t_sample)) has_warned = {} ##FORMAT=<ID=MM,Number=2,Type=Float,Description="Average # of mismatches per ref-/consensus indel-supporting read"> ##FORMAT=<ID=MQS,Number=2,Type=Float,Description="Average mapping qualities of ref-/consensus indel-supporting reads"> ##FORMAT=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases in ref-/consensus indel-supporting reads"> ##FORMAT=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads"> ##FORMAT=<ID=REnd,Number=2,Type=Integer,Description="Median/mad of indel offsets from the ends of the reads"> ##FORMAT=<ID=RStart,Number=2,Type=Integer,Description="Median/mad of indel offsets from the starts of the reads"> ##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)"> features = ["CHROM", "POS", "REF", "ALT", "FILTER", n_sample + "GT", t_sample + "GT", n_sample + "DP", t_sample + "DP", n_sample + "AD", t_sample + "AD", n_sample + "MM", t_sample + "MM", n_sample + "MQS", t_sample + "MQS", n_sample + "NQSBQ", t_sample + "NQSBQ", n_sample + "NQSMM", t_sample + "NQSMM", n_sample + "RStart", t_sample + "RStart", n_sample + "REnd", t_sample + "REnd", n_sample + "SC", t_sample + "SC"] for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] for q in [n_sample + "GT", t_sample + "GT"]: if not q in rec or rec[q] is None: rec[q] = "." 
if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True # fix missing features for q in [n_sample + "GT", t_sample + "GT", n_sample + "DP", t_sample + "DP", n_sample + "AD", t_sample + "AD", n_sample + "MM", t_sample + "MM", n_sample + "MQS", t_sample + "MQS", n_sample + "NQSBQ", t_sample + "NQSBQ", n_sample + "NQSMM", t_sample + "NQSMM", n_sample + "RStart", t_sample + "RStart", n_sample + "REnd", t_sample + "REnd", n_sample + "SC", t_sample + "SC"]: if not q in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True else: if q.endswith("AD") or q.endswith("MM") or q.endswith("MQS") or \ q.endswith("NQSBQ") or q.endswith("NQSMM") or \ q.endswith("REnd") or q.endswith("RStart"): if type(rec[q]) is not list: if not has_warned[q + "_PARSE_FAIL"]: logging.warn("Cannot parse %s: %s" % (q, str(rec[q]))) has_warned[q + "_PARSE_FAIL"] = True rec[q] = [-1, -1] for xx in range(2): if len(rec[q]) <= xx: rec[q].append(-1) else: try: rec[q][xx] = float(rec[q][xx]) except ValueError: rec[q][xx] = -1 elif q.endswith("SC"): if type(rec[q]) is not list: if not has_warned[q + "_PARSE_FAIL"]: logging.warn("Cannot parse %s: %s" % (q, str(rec[q]))) has_warned[q + "_PARSE_FAIL"] = True rec[q] = [-1, -1, -1, -1] else: for xx in range(4): if len(rec[q]) <= xx: rec[q].append(-1) else: try: rec[q][xx] = float(rec[q][xx]) except ValueError: rec[q][xx] = -1 else: try: rec[q] = int(rec[q]) except ValueError: rec[q] = -1 rec["tag"] = tag n_DP = float(rec[n_sample + "DP"]) t_DP = float(rec[t_sample + "DP"]) n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP/float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP/float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif not "DPnorm" in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True n_allele_ref_count = rec[n_sample + "AD"][0] alleles_alt = rec["ALT"] if alleles_alt == ['.']: n_allele_alt_count = 0 else: n_allele_alt_count = 0 for a in xrange(1, len(rec[n_sample + "AD"])): n_allele_alt_count += float(rec[n_sample + "AD"][a]) if n_allele_alt_count + n_allele_ref_count == 0: n_allele_rate = 0 else: n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count) t_allele_ref_count = rec[t_sample + "AD"][0] alleles_alt = rec["ALT"] if alleles_alt == ['.']: t_allele_alt_count = 0 else: t_allele_alt_count = 0 for a in xrange(1, len(rec[t_sample + "AD"])): t_allele_alt_count += float(rec[t_sample + "AD"][a]) if t_allele_alt_count + t_allele_ref_count == 0: t_allele_rate = 0 else: t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count) # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "FILTER": ",".join(rec["FILTER"]), "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE" : n_DP_ratio, "T_DP_RATE" : t_DP_ratio, "N_GT": rec[n_sample + "GT"], "T_GT": rec[t_sample + "GT"], "N_AD": rec[n_sample + "AD"], "T_AD": rec[t_sample + "AD"], "N_ALT_RATE": n_allele_rate, "T_ALT_RATE": t_allele_rate, "N_MM": n_sample + "MM", "T_MM": t_sample + "MM", "N_MQS": n_sample + "MQS", "T_MQS": t_sample + "MQS", "N_NQSBQ": n_sample + "NQSBQ", "T_NQSBQ": t_sample + "NQSBQ", "N_NQSMM": n_sample + "NQSMM", "T_NQSMM": t_sample + "NQSMM", "N_RStart": n_sample + "RStart", 
"T_RStart": t_sample + "RStart", "N_REnd": n_sample + "REnd", "T_REnd": t_sample + "REnd", "N_SC": n_sample + "SC", "T_SC": t_sample + "SC", "tag" : tag } records.append(qrec) cols = [ "CHROM", "POS", "REF", "ALT", "FILTER", "DBSNP", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_AD", "T_AD", "N_ALT_RATE", "T_ALT_RATE", "N_MM", "T_MM", "N_MQS", "T_MQS", "N_NQSBQ", "T_NQSBQ", "N_NQSMM", "T_NQSMM", "N_RStart", "T_RStart", "N_REnd", "T_REnd", "N_SC", "T_SC", "tag"] if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
def preprocess(vcf_input, vcf_output, reference, locations=None, filters=None, fixchr=None, regions=None, targets=None, leftshift=True, decompose=True, bcftools_norm=False, windowsize=10000, threads=1, gender=None, somatic_allele_conversion=False, sample="SAMPLE", filter_nonref=True, convert_gvcf_to_vcf=False): """ Preprocess a single VCF file :param vcf_input: input file name :param vcf_output: output file name :param reference: reference fasta name :param locations: list of locations or None :param filters: list of filters to apply ("*" to only allow PASS) :param fixchr: None for auto, or True/False -- fix chr prefix to match reference :param regions: regions bed file :param targets: targets bed file :param leftshift: left-shift variants :param decompose: decompose variants :param bcftools_norm: use bcftools_norm :param windowsize: normalisation window size :param threads: number of threads to use for preprocessing :param gender: the sex of the sample ("male" / "female" / "auto" / None) :param somatic_allele_conversion: convert somatic alleles -- False / half / het / hemi / hom :param sample: when using somatic_allele_conversion, name of the output sample :param filter_nonref: remove any variants genotyped as <NON_REF> :param convert_gvcf_to_vcf: convert GVCF input to VCF before preprocessing :return: the sex if auto-determined (otherwise the same value as the gender parameter) """ tempfiles = [] try: # If the input is in BCF format, we can continue to # process it in bcf # if it is in .vcf.gz, don't try to convert it to # bcf because there are a range of things that can # go wrong there (e.g. undefined contigs and bcftools # segfaults) if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"): int_suffix = ".bcf" int_format = "b" if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"): logging.warn("Turning vcf into bcf can cause problems when headers are not consistent with all " "records in the file. I will run vcfcheck to see if we will run into trouble. " "To save time in the future, consider converting your files into bcf using bcftools before" " running pre.py.") else: int_suffix = ".vcf.gz" int_format = "z" # HAP-317 always check for BCF errors since preprocessing tools now require valid headers mf = subprocess.check_output("vcfcheck %s --check-bcf-errors 1" % pipes.quote(vcf_input), shell=True) if gender == "auto": logging.info(mf) if "female" in mf: gender = "female" else: gender = "male" h = vcfextract.extractHeadersJSON(vcf_input) reference_contigs = set(fastaContigLengths(reference).keys()) reference_has_chr_prefix = hasChrPrefix(reference_contigs) allfilters = [] for f in h["fields"]: try: if f["key"] == "FILTER": allfilters.append(f["values"]["ID"]) except: logging.warn("ignoring header: %s" % str(f)) required_filters = None if filters: fts = filters.split(",") required_filters = ",".join(list(set(["PASS", "."] + [x for x in allfilters if x not in fts]))) if fixchr is None: try: if not h["tabix"]: logging.warn("input file is not tabix indexed, consider doing this in advance for performance reasons") vtf = tempfile.NamedTemporaryFile(delete=False, suffix=int_suffix) vtf.close() tempfiles.append(vtf.name) runBcftools("view", "-o", vtf.name, "-O", int_format, vcf_input) runBcftools("index", vtf.name) h2 = vcfextract.extractHeadersJSON(vtf.name) chrlist = h2["tabix"]["chromosomes"] else: chrlist = h["tabix"]["chromosomes"] vcf_has_chr_prefix = hasChrPrefix(chrlist) if reference_has_chr_prefix and not vcf_has_chr_prefix: fixchr = True except: logging.warn("Guessing the chr prefix in %s has failed." 
% vcf_input) if leftshift or decompose: # all these require preprocessing vtf = tempfile.NamedTemporaryFile(delete=False, suffix=int_suffix) vtf.close() tempfiles.append(vtf.name) vtf = vtf.name else: vtf = vcf_output preprocessVCF(vcf_input, vtf, locations, filters == "*", fixchr, bcftools_norm, regions, targets, reference, required_filters, somatic_allele_conversion=somatic_allele_conversion, sample=sample, filter_nonref=filter_nonref, convert_gvcf=convert_gvcf_to_vcf, num_threads=threads) if leftshift or decompose or gender == "male": Haplo.partialcredit.partialCredit(vtf, vcf_output, reference, locations, threads=threads, window=windowsize, leftshift=leftshift, decompose=decompose, haploid_x=gender == "male") finally: for t in tempfiles: try: os.unlink(t) except: pass return gender
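# A minimal usage sketch for the extended preprocess() above; the paths are hypothetical.
# gender="auto" infers the sex from the vcfcheck output and returns it;
# somatic_allele_conversion rewrites somatic calls into a single output sample named via
# the sample argument.
def _example_somatic_preprocess():
    """Illustrative only -- preprocess a somatic query VCF."""
    return preprocess("somatic.vcf.gz", "somatic.pp.vcf.gz", "genome.fa",
                      filters="*",  # "*": keep only PASS / "." records
                      leftshift=False, decompose=False,
                      gender="auto", somatic_allele_conversion="het",
                      sample="TUMOR")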
def preprocess(vcf_input, vcf_output, reference, locations=None, filters=None, fixchr=None, regions=None, targets=None, leftshift=True, decompose=True, bcftools_norm=False, windowsize=10000, threads=1): """ Preprocess a single VCF file :param vcf_input: input file name :param vcf_output: output file name :param reference: reference fasta name :param locations: list of locations or None :param filters: list of filters to apply ("*" to only allow PASS) :param fixchr: None for auto, or True/False -- fix chr prefix to match reference :param regions: regions bed file :param targets: targets bed file :param leftshift: left-shift variants :param decompose: decompose variants :param bcftools_norm: use bcftools_norm :param windowsize: normalisation window size :param threads: number of threads to use for preprocessing """ tempfiles = [] try: # If the input is in BCF format, we can continue to # process it in bcf # if it is in .vcf.gz, don't try to convert it to # bcf because there are a range of things that can # go wrong there (e.g. undefined contigs and bcftools # segfaults) if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"): int_suffix = ".bcf" int_format = "b" if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"): logging.warn("Turning vcf into bcf can cause problems when headers aren't consistent with all " "records in the file. I will run vcfcheck to see if we will run into trouble. " "To save time in the future, consider converting your files into bcf using bcftools before" " running pre.py.") subprocess.check_call("vcfcheck %s" % pipes.quote(vcf_input), shell=True) else: int_suffix = ".vcf.gz" int_format = "z" h = vcfextract.extractHeadersJSON(vcf_input) reference_contigs = set(fastaContigLengths(reference).keys()) reference_has_chr_prefix = hasChrPrefix(reference_contigs) allfilters = [] for f in h["fields"]: try: if f["key"] == "FILTER": allfilters.append(f["values"]["ID"]) except: logging.warn("ignoring header: %s" % str(f)) required_filters = None if filters: fts = filters.split(",") required_filters = ",".join(list(set(["PASS", "."] + [x for x in allfilters if x not in fts]))) if fixchr is None: try: if not h["tabix"]: logging.warn("input file is not tabix indexed, consider doing this in advance for performance reasons") vtf = tempfile.NamedTemporaryFile(delete=False, suffix=int_suffix) vtf.close() tempfiles.append(vtf.name) runBcftools("view", "-o", vtf.name, "-O", int_format, vcf_input) runBcftools("index", vtf.name) h2 = vcfextract.extractHeadersJSON(vtf.name) chrlist = h2["tabix"]["chromosomes"] else: chrlist = h["tabix"]["chromosomes"] vcf_has_chr_prefix = hasChrPrefix(chrlist) if reference_has_chr_prefix and not vcf_has_chr_prefix: fixchr = True except: logging.warn("Guessing the chr prefix in %s has failed." % vcf_input) # all these require preprocessing if leftshift or decompose: vtf = tempfile.NamedTemporaryFile(delete=False, suffix=int_suffix) vtf.close() tempfiles.append(vtf.name) vtf = vtf.name else: vtf = vcf_output preprocessVCF(vcf_input, vtf, locations, filters == "*", fixchr, bcftools_norm, regions, targets, reference, required_filters) if leftshift or decompose: Haplo.partialcredit.partialCredit(vtf, vcf_output, reference, locations, threads=threads, window=windowsize, leftshift=leftshift, decompose=decompose) finally: for t in tempfiles: try: os.unlink(t) except: pass
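# The required_filters logic in both preprocess() versions inverts the requested filter
# list: every FILTER declared in the header that is not being applied is passed through
# to preprocessVCF (together with PASS and "."), so only the named filters actually drop
# records. A self-contained illustration with made-up filter names; sorted() is used here
# for a stable result, while the original keeps set order.
def _example_required_filters(allfilters=("LowQual", "StrandBias", "q10"), filters="q10"):
    """Illustrative only -- compute the pass-through filter list."""
    fts = filters.split(",")
    return ",".join(sorted(set(["PASS", "."] + [x for x in allfilters if x not in fts])))
# e.g. -> ".,LowQual,PASS,StrandBias"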