def OutputDiffRefBias(diffs_from_ref, reflens, fname, xlim=(0,100),
                      mingts=100, metric="mean", binsize=5):
    r"""Plot binned reference length vs. summarized difference from reference

    Calls are grouped by reference allele length into bins of ``binsize`` bp.
    For each bin the mean (or median) difference from the reference allele is
    drawn on the left axis, and the cumulative fraction of the plotted calls
    is drawn on a second (right) axis.

    Parameters
    ----------
    diffs_from_ref : list of int
        Difference of each allele call from the ref allele (in bp)
    reflens : list of int
        Reference allele length for each call (in bp). Same order as
        diffs_from_ref
    fname : str
        Filename of the output plot
    xlim : tuple of int, optional
        Minimum and maximum binned reference length to plot (in bp),
        both inclusive
    mingts : int, optional
        Bins supported by fewer than this many calls are not plotted
    metric : str, optional
        Summary plotted on the y-axis. Must be "mean" or "median";
        any other value emits a warning and no plot is made
    binsize : int, optional
        Size (in bp) of bins on the x-axis
    """
    def _bin_start(length):
        # Truncate to the start of the bin that contains this length
        return int(length/binsize)*binsize

    calls = pd.DataFrame({
        "diff": diffs_from_ref,
        "ref": reflens,
        "count": [1]*len(reflens)
    })
    calls["ref"] = calls["ref"].apply(_bin_start)

    if metric not in ("mean", "median"):
        common.WARNING("Invalid metric ({}) specified. Skipping reference bias plot".format(metric))
        return
    summarize = np.mean if metric == "mean" else np.median
    metric = metric.capitalize()  # reused below as the y-axis label prefix

    # One row per bin: summarized diff plus the number of supporting calls
    binned = calls.groupby("ref", as_index=False).agg({"diff": summarize, "count": len})
    binned = binned.sort_values("ref")
    binned = binned[binned["count"] >= mingts]  # exclude small counts
    in_range = (binned["ref"] >= xlim[0]) & (binned["ref"] <= xlim[1])
    binned = binned[in_range]  # filter by x range
    if binned.shape[0] == 0:
        common.WARNING("No points left to plot in reference bias plot after "
                       "filtering. Skipping")
        return

    common.MSG("Plotting ref bias plot with the following data:")
    common.MSG(binned)

    running_total = np.cumsum(binned["count"])
    running_fraction = running_total/np.sum(binned["count"])

    fig = plt.figure()
    bias_axis = fig.add_subplot(111)
    bias_axis.plot(binned["ref"], binned["diff"], marker="o", color="darkblue")
    bias_axis.axhline(y=0, linestyle="dashed", color="gray")
    bias_axis.set_xlabel("Reference length (bp)", size=15)
    bias_axis.set_ylabel("{} diff from ref (bp)".format(metric), size=15)

    # Second y-axis sharing the same x-axis for the cumulative curve
    fraction_axis = bias_axis.twinx()
    fraction_axis.plot(binned["ref"], running_fraction, color="darkred")
    fraction_axis.set_ylabel("Cumulative fraction of alleles", size=15)

    fig.tight_layout()
    fig.savefig(fname)
    plt.close()
def MakeWriter(outfile, invcf, command):
    r"""Open a VCF writer whose header records the dumpSTR command

    The command string is stored in the metadata of ``invcf`` under the
    key ``command-DumpSTR`` before the writer is created, so ``invcf``
    itself is modified by this call.

    Parameters
    ----------
    outfile : str
        Name of the output file
    invcf : vcf.Reader object
        Input VCF. Used as the header template for the writer
    command : str
        String command used to run dumpSTR

    Returns
    -------
    writer : vcf.Writer object
        VCF writer initialized with the header of the input VCF.
        None if an OSError occurred while setting up the output file
    """
    invcf.metadata["command-DumpSTR"] = [command]
    try:
        return vcf.Writer(open(outfile, "w"), invcf)
    except OSError as err:
        # Report the problem and let the caller decide how to bail out
        common.WARNING(str(err))
        return None
def GetSamples(readers, usefilenames=False):
    r"""Get list of samples used in all files being merged

    Parameters
    ----------
    readers : list of vcf.Reader objects
        Readers of the files being merged. Each must expose ``samples``
        and, when usefilenames is True, ``filename``
    usefilenames : bool, optional
        If True, prefix each sample name with "<file label>:", where the
        file label is the reader's filename with a trailing ".vcf.gz" or
        ".vcf.bgz" removed. Useful if sample names overlap across files

    Returns
    -------
    samples : list of str
        List of samples in merged list, in reader order.
        Empty list (after a warning) if any sample name occurs twice
    """
    def _file_label(filename):
        # str.strip(".vcf.gz") would remove any of the characters
        # '.', 'v', 'c', 'f', 'g', 'z' from BOTH ends of the name
        # (e.g. "cohort.vcf.gz" -> "ohort"), so drop the suffix explicitly.
        for suffix in (".vcf.gz", ".vcf.bgz"):
            if filename.endswith(suffix):
                return filename[:-len(suffix)]
        return filename

    samples = []
    for r in readers:
        if usefilenames:
            label = _file_label(r.filename)
            samples.extend(label + ":" + s for s in r.samples)
        else:
            samples.extend(r.samples)
    if len(set(samples)) != len(samples):
        common.WARNING("Duplicate samples found.")
        return []
    return samples
def LoadRegions(self, filename):
    r"""Load a BED file of regions into ``self.regions``

    The file must be named ``*.bed.gz`` or ``*.bed.bgz``, must exist, and
    must have a ``.tbi`` file next to it. If any of these checks fails,
    a warning is emitted, ``self.regions`` is set to None and
    ``self.pass_checks`` is set to False. Otherwise ``self.regions`` is
    set to a BedTool built from the file.

    Parameters
    ----------
    filename : str
        Path to the regions BED file
    """
    # The first failing check decides which warning is reported
    problem = None
    if not filename.endswith((".bed.gz", ".bed.bgz")):
        problem = "Make sure %s is bgzipped and indexed" % filename
    elif not os.path.isfile(filename):
        problem = "Could not find regions BED file %s" % filename
    elif not os.path.isfile(filename + ".tbi"):
        problem = "Could not find tabix index %s.tbi" % filename

    if problem is not None:
        self.regions = None
        common.WARNING(problem)
        self.pass_checks = False
        return
    self.regions = BedTool(filename)
def CheckHipSTRFilters(invcf, args):
    r"""Validate the HipSTR call-level filter options

    Options left as None are skipped. For every option that is set, its
    value must be in range, and the FORMAT fields the filter relies on
    are asserted to be declared in the VCF header.

    Parameters
    ----------
    invcf : vcf.Reader object
        Input VCF. Only its ``formats`` are inspected
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False (after a warning) if a filter value is invalid
    """
    def _not_fraction(value):
        return value < 0 or value > 1

    def _negative(value):
        return value < 0

    def _passes(option, is_invalid, complaint, needed_formats):
        # A single option: skip if unset, warn if out of range,
        # otherwise insist that the header declares the needed fields
        value = getattr(args, option)
        if value is None:
            return True
        if is_invalid(value):
            common.WARNING(complaint)
            return False
        assert all(key in invcf.formats for key in needed_formats)  # should always be true
        return True

    # Checked in this order; the first failure decides the warning shown
    leading_checks = (
        ("hipstr_max_call_flank_indel", _not_fraction,
         "--hipstr-max-call-flank-indel must be between 0 and 1",
         ("DP", "DFLANKINDEL")),
        ("hipstr_max_call_stutter", _not_fraction,
         "--hipstr-max-call-stutter must be between 0 and 1",
         ("DP", "DSTUTTER")),
        ("hipstr_min_supp_reads", _negative,
         "--hipstr-min-supp-reads must be >= 0",
         ("ALLREADS", "GB")),
        ("hipstr_min_call_DP", _negative,
         "--hipstr-min-call-DP must be >= 0",
         ("DP",)),
        ("hipstr_max_call_DP", _negative,
         "--hipstr-max-call-DP must be >= 0",
         ("DP",)),
    )
    for spec in leading_checks:
        if not _passes(*spec):
            return False

    # The depth bounds must not contradict each other
    lowest = args.hipstr_min_call_DP
    highest = args.hipstr_max_call_DP
    if lowest is not None and highest is not None and highest < lowest:
        common.WARNING(
            "--hipstr-max-call-DP must be >= --hipstr-min-call-DP")
        return False

    return _passes("hipstr_min_call_Q", _not_fraction,
                   "--hipstr-min-call-Q must be between 0 and 1", ("Q",))
def LoadSingleReader(vcffile, checkgz=True, region=None):
    r"""Open a VCF file and return a reader for it

    Files named ``*.vcf.gz`` or ``*.vcf.bgz`` are opened in binary mode,
    anything else in text mode.

    Parameters
    ----------
    vcffile : str
        Path of the VCF file to read
    checkgz : boolean, optional
        If True, require a ``.vcf.gz``/``.vcf.bgz`` name and a ``.tbi``
        file next to the VCF
    region : str, optional
        Chrom:start-end to restrict to

    Returns
    -------
    reader : vcf.Reader
        VCF reader, or the result of ``reader.fetch(region)`` when a
        region is given. None (after a warning) if the file is missing or
        fails the checkgz requirements
    """
    if not os.path.isfile(vcffile):
        common.WARNING("Could not find VCF file %s" % vcffile)
        return None

    compressed = vcffile.endswith((".vcf.gz", ".vcf.bgz"))
    if checkgz:
        if not compressed:
            common.WARNING("Make sure %s is bgzipped and indexed" % vcffile)
            return None
        if not os.path.isfile(vcffile + ".tbi"):
            common.WARNING("Could not find VCF index %s.tbi" % vcffile)
            return None

    handle = open(vcffile, "rb") if compressed else open(vcffile)
    reader = vcf.Reader(handle)
    return reader if region is None else reader.fetch(region)
def CheckLocusFilters(args, vcftype):
    r"""Perform checks on user inputs for locus-level filters

    Range problems are reported with a warning and make the check fail.
    Options that are merely irrelevant for the given VCF type
    (``--use-length``, ``--filter-hrun`` on non-HipSTR input) only
    produce a warning.

    Parameters
    ----------
    args : argparse namespace
        Contains user arguments
    vcftype : enum.
        Specifies which tool this VCF came from.
        Must be included in trh.VCFTYPES

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.min_locus_hwep is not None:
        if args.min_locus_hwep < 0 or args.min_locus_hwep > 1:
            common.WARNING("Invalid --min-locus-hwep. Must be between 0 and 1")
            return False
    if args.min_locus_het is not None:
        if args.min_locus_het < 0 or args.min_locus_het > 1:
            common.WARNING("Invalid --min-locus-het. Must be between 0 and 1")
            return False
    if args.max_locus_het is not None:
        if args.max_locus_het < 0 or args.max_locus_het > 1:
            common.WARNING("Invalid --max-locus-het. Must be between 0 and 1")
            return False
    if args.min_locus_het is not None and args.max_locus_het is not None:
        if args.max_locus_het < args.min_locus_het:
            common.WARNING(
                "Cannot have --max-locus-het less than --min-locus-het")
            return False
    # The next two are advisory only: warn, but do not fail the check
    if args.use_length and vcftype not in [trh.VcfTypes["hipstr"]]:
        common.WARNING(
            "--use-length is only meaningful for HipSTR, which reports sequence level differences."
        )
    if args.filter_hrun and vcftype not in [trh.VcfTypes["hipstr"]]:
        # The option is filter_hrun, so the flag to name is --filter-hrun
        common.WARNING(
            "--filter-hrun only relevant to HipSTR files. This filter will have no effect."
        )
    if args.filter_regions is not None:
        if args.filter_regions_names is not None:
            # Both are comma-separated lists and must pair up one-to-one
            filter_region_files = args.filter_regions.split(",")
            filter_region_names = args.filter_regions_names.split(",")
            if len(filter_region_names) != len(filter_region_files):
                common.WARNING(
                    "Length of --filter-regions-names must match --filter-regions."
                )
                return False
    return True
def CheckPopSTRFilters(invcf, args):
    r"""Validate the PopSTR call-level filter options

    Options left as None are skipped. Every option that is set must be
    non-negative, and the FORMAT field it relies on is asserted to be
    declared in the VCF header.

    Parameters
    ----------
    invcf : vcf.Reader object
        Input VCF. Only its ``formats`` are inspected
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False (after a warning) if a filter value is invalid
    """
    min_depth = args.popstr_min_call_DP
    if min_depth is not None:
        if min_depth < 0:
            common.WARNING("--popstr-min-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats

    max_depth = args.popstr_max_call_DP
    if max_depth is not None:
        if max_depth < 0:
            common.WARNING("--popstr-max-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats

    # The depth bounds must not contradict each other
    if min_depth is not None and max_depth is not None and max_depth < min_depth:
        common.WARNING(
            "--popstr-max-call-DP must be >= --popstr-min-call-DP")
        return False

    support = args.popstr_require_support
    if support is not None:
        if support < 0:
            common.WARNING("--popstr-require-support must be >= 0")
            return False
        assert "AD" in invcf.formats
    return True
def CheckAdVNTRFilters(invcf, args):
    r"""Validate the adVNTR call-level filter options

    Options left as None are skipped. Every option that is set must be
    non-negative, and the FORMAT field it relies on is asserted to be
    declared in the VCF header.

    Parameters
    ----------
    invcf : vcf.Reader object
        Input VCF. Only its ``formats`` are inspected
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False (after a warning) if a filter value is invalid
    """
    def _nonnegative(option, complaint, format_key):
        # A single option: skip if unset, warn if negative,
        # otherwise insist that the header declares its FORMAT field
        value = getattr(args, option)
        if value is None:
            return True
        if value < 0:
            common.WARNING(complaint)
            return False
        assert format_key in invcf.formats
        return True

    depth_checks = (
        ("advntr_min_call_DP", "--advntr-min-call-DP must be >= 0", "DP"),
        ("advntr_max_call_DP", "--advntr-max-call-DP must be >= 0", "DP"),
    )
    for spec in depth_checks:
        if not _nonnegative(*spec):
            return False

    # The depth bounds must not contradict each other
    lowest = args.advntr_min_call_DP
    highest = args.advntr_max_call_DP
    if lowest is not None and highest is not None and highest < lowest:
        common.WARNING(
            "--advntr-max-call-DP must be >= --advntr-min-call-DP")
        return False

    remaining_checks = (
        ("advntr_min_spanning", "--advntr-min-spanning must be >=0", "SR"),
        ("advntr_min_flanking", "--advntr-min-flanking must be >=0", "FR"),
        ("advntr_min_ML", "--advntr-min-ML must be >= 0", "ML"),
    )
    for spec in remaining_checks:
        if not _nonnegative(*spec):
            return False
    return True
def CheckEHFilters(invcf, args):  # pragma: no cover
    r"""Validate the ExpansionHunter call-level filter options

    Options left as None are skipped. Every option that is set must be
    non-negative, and the FORMAT field it relies on is asserted to be
    declared in the VCF header.

    Parameters
    ----------
    invcf : vcf.Reader object
        Input VCF. Only its ``formats`` are inspected
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False (after a warning) if a filter value is invalid
    """
    # (argparse attribute, warning if negative, FORMAT field that must exist)
    ordered_checks = (
        ("eh_min_ADFL", "--eh-min-ADFL must be >= 0", "ADFL"),
        ("eh_min_ADIR", "--eh-min-ADIR must be >= 0", "ADIR"),
        ("eh_min_ADSP", "--eh-min-ADSP must be >= 0", "ADSP"),
        ("eh_min_call_LC", "--eh-min-call-LC must be >= 0", "LC"),
        ("eh_max_call_LC", "--eh-max-call-LC must be >= 0", "LC"),
    )
    for option, complaint, format_key in ordered_checks:
        value = getattr(args, option)
        if value is None:
            continue
        if value < 0:
            common.WARNING(complaint)
            return False
        assert format_key in invcf.formats

    # The LC bounds must not contradict each other
    lowest = args.eh_min_call_LC
    highest = args.eh_max_call_LC
    if lowest is not None and highest is not None and highest < lowest:
        common.WARNING("--eh-max-call-LC must be >= --eh-min-call-LC")
        return False
    return True
def GetInfoItem(current_records, mergelist, info_field, fail=True):
    """Get INFO item for a group of records

    With fail=True the value of ``info_field`` must be present in every
    record being merged and identical across them. With fail=False
    nothing is produced.

    Parameters
    ----------
    current_records : list of vcf.Record
       List of records being merged
    mergelist : list of bool
       List of indicators of whether to merge each record
    info_field : str
       INFO field being merged
    fail : bool
       If False, return None without inspecting any record

    Returns
    -------
    infostring : str
       INFO string to add (key=value).
       None if fail is False, or (after a warning) if the merged records
       do not share exactly one value

    Raises
    ------
    ValueError
       If a record being merged lacks ``info_field``
    """
    if not fail:
        return None  # TODO in future implement smart merging of select fields

    observed = set()
    for idx, use_record in enumerate(mergelist):
        if not use_record:
            continue
        info = current_records[idx].INFO
        if info_field not in info:
            raise ValueError("Missing info field %s" % info_field)
        observed.add(info[info_field])

    if len(observed) != 1:
        common.WARNING("Incompatible info field value %s" % info_field)
        return None
    return "%s=%s" % (info_field, observed.pop())
def CheckFilters(invcf, args, vcftype):
    r"""Perform checks on user input for filters

    Locus-level filters are checked first. Then, for each genotyper, if
    any of its options was given, the VCF must come from that genotyper
    and the genotyper-specific option check must succeed.

    Parameters
    ----------
    invcf : vcf.Reader object
        Input VCF, passed on to the genotyper-specific checks
    args : argparse namespace
        Contains user arguments
    vcftype : enum.
        Specifies which tool this VCF came from.
        Must be included in trh.VCFTYPES

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    def _any_given(*options):
        # True as soon as one of the named options is not None
        return any(getattr(args, option) is not None for option in options)

    if not CheckLocusFilters(args, vcftype):
        return False

    # Check HipSTR specific filters
    if _any_given("hipstr_max_call_flank_indel", "hipstr_max_call_stutter",
                  "hipstr_min_supp_reads", "hipstr_min_call_DP",
                  "hipstr_max_call_DP", "hipstr_min_call_Q"):
        if vcftype != trh.VcfTypes["hipstr"]:
            common.WARNING("HipSTR options can only be applied to HipSTR VCFs")
            return False
        if not CheckHipSTRFilters(invcf, args):
            return False

    # Check GangSTR specific filters
    # (the three filter_* options are flags, so they are tested for truth)
    gangstr_requested = (
        _any_given("gangstr_min_call_DP", "gangstr_max_call_DP",
                   "gangstr_min_call_Q", "gangstr_expansion_prob_het",
                   "gangstr_expansion_prob_hom",
                   "gangstr_expansion_prob_total")
        or args.gangstr_filter_span_only
        or args.gangstr_filter_spanbound_only
        or args.gangstr_filter_badCI
        or _any_given("gangstr_require_support", "gangstr_readlen")
    )
    if gangstr_requested:
        if vcftype != trh.VcfTypes["gangstr"]:
            common.WARNING(
                "GangSTR options can only be applied to GangSTR VCFs")
            return False
        if not CheckGangSTRFilters(invcf, args):
            return False

    # Check adVNTR specific filters
    if _any_given("advntr_min_call_DP", "advntr_max_call_DP",
                  "advntr_min_spanning", "advntr_min_flanking",
                  "advntr_min_ML"):
        if vcftype != trh.VcfTypes["advntr"]:
            common.WARNING("adVNTR options can only be applied to adVNTR VCFs")
            return False
        if not CheckAdVNTRFilters(invcf, args):
            return False

    # Check EH specific filters
    if _any_given("eh_min_ADFL", "eh_min_ADIR", "eh_min_ADSP",
                  "eh_min_call_LC", "eh_max_call_LC"):
        if vcftype != trh.VcfTypes["eh"]:
            common.WARNING(
                "ExpansionHunter options can only be applied to ExpansionHunter VCFs"
            )
            return False
        if not CheckEHFilters(invcf, args):  # pragma: no cover
            return False  # pragma: no cover

    # Check popSTR specific filters
    if _any_given("popstr_min_call_DP", "popstr_max_call_DP",
                  "popstr_require_support"):
        if vcftype != trh.VcfTypes["popstr"]:
            common.WARNING("popSTR options can only be applied to popSTR VCFs")
            return False
        if not CheckPopSTRFilters(invcf, args):
            return False
    return True
def getargs():  # pragma: no cover
    r"""Parse the statSTR command line

    Builds the argument parser, parses ``sys.argv`` and verifies that at
    least one flag of the "Stats group" was given.

    Returns
    -------
    args : argparse namespace
        Parsed user arguments. None (after a warning) if no flag of the
        Stats group was set
    """
    # The module docstring is the tool description. Passing it
    # positionally would bind it to ``prog`` (the first parameter of
    # ArgumentParser) and garble the usage line, so pass it by keyword.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=utils.ArgumentDefaultsHelpFormatter)

    inout_group = parser.add_argument_group("Input/output")
    inout_group.add_argument("--vcf",
                             help="Input STR VCF file",
                             type=str,
                             required=True)
    inout_group.add_argument(
        "--out",
        help="Output file prefix. Use stdout to print file to standard output.",
        type=str,
        required=True)
    inout_group.add_argument("--vcftype",
                             help="Options=%s" %
                             [str(item) for item in trh.VcfTypes.__members__],
                             type=str,
                             default="auto")

    filter_group = parser.add_argument_group("Filtering group")
    filter_group.add_argument(
        "--samples",
        help=
        "File containing list of samples to include. Or a comma-separated list of files to compute stats separate for each group of samples",
        type=str)
    filter_group.add_argument(
        "--sample-prefixes",
        help=
        "Prefixes to name output for each samples group. By default uses 1,2,3 etc.",
        type=str)
    filter_group.add_argument("--region",
                              help="Restrict to this region chrom:start-end",
                              type=str)

    stat_group_name = "Stats group"
    stat_group = parser.add_argument_group(stat_group_name)
    # Every option in this group is a plain on/off switch
    stat_flags = (
        ("--thresh",
         "Output threshold field (max allele size, used for GangSTR strinfo)."),
        ("--afreq", "Output allele frequencies"),
        ("--acount", "Output allele counts"),
        ("--hwep", "Output HWE p-values per loci."),
        ("--het", "Output heterozygosity of each locus."),
        ("--mean", "Output mean of allele frequencies."),
        ("--mode", "Output mode of allele frequencies."),
        ("--var", "Output variance of allele frequencies."),
        ("--numcalled", "Output number of samples called."),
        ("--use-length",
         "Calculate per-locus stats (het, HWE) collapsing alleles by length"),
    )
    for flag, description in stat_flags:
        stat_group.add_argument(flag, help=description, action="store_true")

    plot_group = parser.add_argument_group("Plotting group")
    plot_group.add_argument(
        "--plot-afreq",
        help=
        "Output allele frequency plot. Will only do for a maximum of 10 TRs.",
        action="store_true")

    ver_group = parser.add_argument_group("Version")
    ver_group.add_argument("--version",
                           action="version",
                           version='{version}'.format(version=__version__))

    args = parser.parse_args()

    # If no stat selected, print an error message and terminate
    stat_dict = {}
    for grp in parser._action_groups:
        if grp.title == stat_group_name:
            stat_dict = {
                a.dest: getattr(args, a.dest, None)
                for a in grp._group_actions
            }
    if not any(stat_dict.values()):
        common.WARNING(
            "Error: Please use at least one of the flags in the Stats group. See statSTR --help for options."
        )
        return None
    return args
def CheckGangSTRFilters(invcf, args):
    r"""Validate the GangSTR call-level filter options

    Options left as None are skipped. For every option that is set, its
    value must be in range, and the FORMAT fields the filter relies on
    are asserted to be declared in the VCF header. The read length is
    only examined when ``--gangstr-require-support`` is set.

    Parameters
    ----------
    invcf : vcf.Reader object
        Input VCF. Only its ``formats`` are inspected
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False (after a warning) if a filter value is invalid
    """
    min_depth = args.gangstr_min_call_DP
    if min_depth is not None:
        if min_depth < 0:
            common.WARNING("--gangstr-min-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats

    max_depth = args.gangstr_max_call_DP
    if max_depth is not None:
        if max_depth < 0:
            common.WARNING("--gangstr-max-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats

    # The depth bounds must not contradict each other
    if min_depth is not None and max_depth is not None and max_depth < min_depth:
        common.WARNING(
            "--gangstr-max-call-DP must be >= --gangstr-min-call-DP")
        return False

    min_quality = args.gangstr_min_call_Q
    if min_quality is not None:
        if min_quality < 0 or min_quality > 1:
            common.WARNING("--gangstr-min-call-Q must be between 0 and 1")
            return False
        assert "Q" in invcf.formats

    # The three expansion probabilities share the same rule and FORMAT field
    expansion_checks = (
        ("gangstr_expansion_prob_het",
         "--gangstr-expansion-prob-het must be between 0 and 1"),
        ("gangstr_expansion_prob_hom",
         "--gangstr-expansion-prob-hom must be between 0 and 1"),
        ("gangstr_expansion_prob_total",
         "--gangstr-expansion-prob-total must be between 0 and 1"),
    )
    for option, complaint in expansion_checks:
        probability = getattr(args, option)
        if probability is None:
            continue
        if probability < 0 or probability > 1:
            common.WARNING(complaint)
            return False
        assert "QEXP" in invcf.formats

    support = args.gangstr_require_support
    if support is not None:
        if support < 0:
            common.WARNING("--gangstr-require-support must be >= 0")
            return False
        read_length = args.gangstr_readlen
        if support > 0 and read_length is None:
            common.WARNING(
                "Using --gangstr-require-support requires setting --gangstr-readlen"
            )
            return False
        if read_length is not None and read_length < 20:
            common.WARNING("--gangstr-readlen must be an integer value >= 20")
            return False
        assert "ENCLREADS" in invcf.formats and "FLNKREADS" in invcf.formats and "RC" in invcf.formats
    return True
def main(args):
    r"""Run statSTR: compute per-locus statistics for a TR VCF

    Writes one tab-delimited row per record (chrom, start, end, followed
    by the requested statistics) to ``<args.out>.tab``, or to standard
    output when ``args.out`` is "stdout". Optionally plots allele
    frequencies for the first loci.

    Parameters
    ----------
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    retcode : int
        0 on success, 1 if the inputs or options are unusable
    """
    if not os.path.exists(args.vcf):
        common.WARNING("Error: %s does not exist" % args.vcf)
        return 1
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1
    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Load samples
    sample_lists = []
    sample_prefixes = []
    if args.samples:
        sfiles = args.samples.split(",")
        if args.sample_prefixes:
            sample_prefixes = args.sample_prefixes.split(",")
        else:
            sample_prefixes = [str(item) for item in range(1, len(sfiles) + 1)]
        if len(sfiles) != len(sample_prefixes):
            common.MSG("--sample-prefixes must be same length as --samples")
            return 1
        for sf in sfiles:
            # Close each sample file as soon as it has been read
            with open(sf, "r") as sample_file:
                sample_lists.append([item.strip() for item in sample_file])

    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1
    if args.vcftype != 'auto':
        vcftype = trh.VcfTypes[args.vcftype]
    else:
        vcftype = trh.InferVCFType(invcf)

    # Header columns follow the same order as the per-record columns below
    header = ["chrom", "start", "end"]
    for stat in ("thresh", "afreq", "acount", "hwep", "het", "mean", "mode",
                 "var", "numcalled"):
        if getattr(args, stat):
            header.extend(GetHeader(stat, sample_prefixes))

    # Validate everything that can fail BEFORE opening the output, so a
    # failed run leaves neither an open handle nor a header-only file
    to_stdout = args.out == "stdout"
    if to_stdout and args.plot_afreq:
        common.MSG("Cannot use --out stdout when generating plots")
        return 1
    if args.region and not os.path.isfile(args.vcf + ".tbi"):
        common.MSG("Make sure %s is bgzipped and indexed" % args.vcf)
        return 1

    outf = sys.stdout if to_stdout else open(args.out + ".tab", "w")
    try:
        outf.write("\t".join(header) + "\n")

        regions = invcf.fetch(args.region) if args.region else invcf
        num_plotted = 0
        for record in regions:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Strictly less than: MAXPLOTS is the number of loci to plot,
            # "<=" would plot one extra
            if args.plot_afreq and num_plotted < MAXPLOTS:
                PlotAlleleFreqs(trrecord,
                                args.out,
                                samplelists=sample_lists,
                                sampleprefixes=sample_prefixes)
                num_plotted += 1
            items = [
                record.CHROM, record.POS,
                record.POS + len(trrecord.ref_allele)
            ]
            if args.thresh:
                items.extend(GetThresh(trrecord, samplelists=sample_lists))
            if args.afreq:
                items.extend(
                    GetAFreq(trrecord,
                             samplelists=sample_lists,
                             uselength=args.use_length))
            if args.acount:
                items.extend(
                    GetAFreq(trrecord,
                             samplelists=sample_lists,
                             uselength=args.use_length,
                             count=True))
            if args.hwep:
                items.extend(
                    GetHWEP(trrecord,
                            samplelists=sample_lists,
                            uselength=args.use_length))
            if args.het:
                items.extend(
                    GetHet(trrecord,
                           samplelists=sample_lists,
                           uselength=args.use_length))
            if args.mean:
                items.extend(GetMean(trrecord, samplelists=sample_lists))
            if args.mode:
                items.extend(GetMode(trrecord, samplelists=sample_lists))
            if args.var:
                items.extend(GetVariance(trrecord, samplelists=sample_lists))
            if args.numcalled:
                items.extend(GetNumSamples(trrecord,
                                           samplelists=sample_lists))
            outf.write("\t".join([str(item) for item in items]) + "\n")
    finally:
        if to_stdout:
            # sys.stdout belongs to the process: flush it, never close it
            outf.flush()
        else:
            outf.close()
    return 0
def main(args):
    r"""Run dumpSTR: apply call- and locus-level filters to a TR VCF

    Writes the filtered VCF to ``<args.out>.vcf`` and per-sample and
    per-locus filter summaries to ``<args.out>.samplog.tab`` and
    ``<args.out>.loclog.tab``.

    Parameters
    ----------
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    retcode : int
        0 on success, 1 if the inputs or options are unusable, or if a
        record could not be parsed and ``args.die_on_warning`` is set
    """
    # Load VCF file
    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1
    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf, args.vcftype)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1

    # Set up locus-level filter list (built once; a ValueError means the
    # filter options could not be turned into filters)
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC",
                              -1,
                              "Integer",
                              "Alternate allele counts",
                              source=None,
                              version=None)
    invcf.infos["REFAC"] = _Info("REFAC",
                                 1,
                                 "Integer",
                                 "Reference allele count",
                                 source=None,
                                 version=None)
    invcf.infos["HET"] = _Info("HET",
                               1,
                               "Float",
                               "Heterozygosity",
                               source=None,
                               version=None)
    invcf.infos["HWEP"] = _Info("HWEP",
                                1,
                                "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None,
                                version=None)
    invcf.infos["HRUN"] = _Info("HRUN",
                                1,
                                "Integer",
                                "Length of longest homopolymer run",
                                source=None,
                                version=None)

    # Set up output files (the output directory was validated above)
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    try:
        while True:
            try:
                record = next(invcf)
            except IndexError:
                common.WARNING(
                    "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
                )
                if args.die_on_warning:
                    return 1
                # Really skip it: without this the previous record would
                # be processed again (or `record` would be unbound)
                continue
            except StopIteration:
                break
            if args.verbose:
                common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
            record_counter += 1
            if args.num_records is not None and record_counter > args.num_records:
                break

            # Call-level filters
            record = ApplyCallFilters(record, invcf, call_filters,
                                      sample_info)

            # Locus-level filters
            record.FILTER = None
            output_record = True
            for filt in filter_list:
                if filt(record) is None:
                    continue
                if args.drop_filtered:
                    output_record = False
                    break
                record.add_filter(filt.filter_name())
                loc_info[filt.filter_name()] += 1
            if args.drop_filtered:
                if record.call_rate == 0:
                    output_record = False
            if output_record:
                trrecord = trh.HarmonizeRecord(vcftype, record)
                # Recalculate locus-level INFO fields
                record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
                if record.num_called > 0:
                    allele_freqs = trrecord.GetAlleleFreqs(
                        uselength=args.use_length)
                    genotype_counts = trrecord.GetGenotypeCounts(
                        uselength=args.use_length)
                    record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                    record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                        allele_freqs, genotype_counts)
                    # Counts are frequency times the number of called
                    # alleles, taken as 2 per called sample as for REFAC.
                    # round() guards against float error such as
                    # 28.999999 being truncated to 28.
                    num_alleles = 2 * record.num_called
                    record.INFO["AC"] = [
                        int(round(item * num_alleles)) for item in record.aaf
                    ]
                    record.INFO["REFAC"] = int(
                        round((1 - sum(record.aaf)) * num_alleles))
                else:
                    record.INFO["HET"] = -1
                    record.INFO["HWEP"] = -1
                    record.INFO["AC"] = [0] * len(record.ALT)
                    record.INFO["REFAC"] = 0
                # Recalc filter
                if record.FILTER is None and not args.drop_filtered:
                    record.FILTER = "PASS"
                    loc_info["PASS"] += 1
                    loc_info["totalcalls"] += record.num_called
                # Output the record
                outvcf.write_record(record)
    finally:
        # Flush and release the output VCF on every exit path
        outvcf.close()

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")
    return 0
def main(args):
    r"""Validate options, scan the input VCF, and write the QC plots.

    Parameters
    ----------
    args : argparse namespace
        Contains user options

    Returns
    -------
    retcode : int
        0 on success, 1 if an option is invalid or the VCF could not be
        loaded
    """
    if not os.path.exists(args.vcf):
        common.WARNING("The input vcf location %s does not exist" % args.vcf)
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Error: The directory which contains the output location {} does"
                       " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up reader and harmonizer
    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1

    if args.vcftype != 'auto':
        harmonizer = trh.TRRecordHarmonizer(invcf, args.vcftype)
    else:
        harmonizer = trh.TRRecordHarmonizer(invcf)

    if len(args.quality) > 0 and not harmonizer.HasQualityScore():
        common.WARNING("Requested a quality plot, but the input vcf doesn't have "
                       "quality scores!")
        return 1

    # Check refbias options
    if args.refbias_binsize < 1:
        common.WARNING("--refbias-binsize must be >=1")
        return 1
    if args.refbias_mingts < 0:  # allow for 0 mingts as a synonym for 1
        common.WARNING("--refbias-mingts must be >=1")
        return 1
    if args.refbias_xrange_min >= args.refbias_xrange_max:
        common.WARNING("--refbias-xrange-min ({}) cannot be >= --refbias-xrange-max ({})".format(
            args.refbias_xrange_min, args.refbias_xrange_max))
        return 1

    # Load samples
    if args.samples:
        # Use a context manager so the sample file handle is always closed
        with open(args.samples, "r") as samples_file:
            requested = [line.strip() for line in samples_file]
        samplelist = [name for name in requested if name in invcf.samples]
    else:
        samplelist = invcf.samples
    # Set used only for the per-call membership test in the record loop;
    # the list is kept because its order is passed on to the helpers.
    sampleset = set(samplelist)

    # Figure out which quality plot to produce by default
    default_quality = False
    if len(args.quality) == 0 and harmonizer.HasQualityScore():
        default_quality = True
        if len(samplelist) <= 5:
            args.quality = [_QualityTypes.sample_stratified.value]
        else:
            args.quality = [_QualityTypes.per_locus.value]

    # args.quality is final from here on, so evaluate each request once
    # instead of re-testing it for every call of every record.
    any_quality = len(args.quality) > 0
    want_per_locus = _QualityTypes.per_locus.value in args.quality
    want_per_sample = _QualityTypes.per_sample.value in args.quality
    want_per_call = _QualityTypes.per_call.value in args.quality
    want_sample_strat = _QualityTypes.sample_stratified.value in args.quality
    want_locus_strat = _QualityTypes.locus_stratified.value in args.quality

    # Set up data to keep track of
    sample_calls = {sample: 0 for sample in samplelist}  # sample->numcalls
    contigs = invcf.contigs
    if len(contigs) == 0:
        common.WARNING("Warning: no contigs found in VCF file.")
    chrom_calls = {chrom: 0 for chrom in contigs}  # chrom->numcalls
    diffs_from_ref = []  # for each allele call, keep track of diff (bp) from ref
    diffs_from_ref_unit = []  # for each allele call, keep track of diff (units) from ref
    reflens = []  # for each allele call, keep track of reference length (bp)
    if want_per_locus:
        per_locus_data = []
    if want_per_sample:
        per_sample_data = {sample: [] for sample in samplelist}
    if want_per_call:
        per_call_data = []
    if want_sample_strat:
        sample_strat_data = {sample: [] for sample in samplelist}
    if want_locus_strat:
        locus_strat_data = {}

    # read the vcf
    numrecords = 0
    for trrecord in harmonizer:
        if args.numrecords is not None and numrecords >= args.numrecords:
            break
        if args.period is not None and len(trrecord.motif) != args.period:
            continue

        record = trrecord.vcfrecord

        # Extract stats
        chrom = record.CHROM
        rl = len(trrecord.ref_allele)
        allele_counts = trrecord.GetAlleleCounts(uselength=False,
                                                 samplelist=samplelist)

        # Update data
        num_calls = 0
        if want_per_locus:
            per_locus_data.append([])
        if want_locus_strat:
            locus_strat_data[trrecord.record_id] = []

        # loop over sample data
        for call in record:
            s = call.sample
            if s not in sampleset:
                continue
            if call.called:
                sample_calls[s] += 1
                num_calls += 1

            if not any_quality:
                continue

            # set non-calls to zero quality
            if call.called:
                quality_score = trrecord.GetQualityScore(call)
            elif args.quality_ignore_no_call:
                continue
            else:
                quality_score = 0

            if want_per_sample:
                per_sample_data[s].append(quality_score)
            if want_sample_strat:
                sample_strat_data[s].append(quality_score)
            if want_per_locus:
                per_locus_data[-1].append(quality_score)
            if want_locus_strat:
                locus_strat_data[trrecord.record_id].append(quality_score)
            if want_per_call:
                per_call_data.append(quality_score)

        # .get() covers chromosomes that are missing from the contig header
        chrom_calls[chrom] = chrom_calls.get(chrom, 0) + num_calls
        motif_len = len(trrecord.motif)
        for allele, count in allele_counts.items():
            allelediff = len(allele) - rl
            reflens.extend([rl] * count)
            diffs_from_ref.extend([allelediff] * count)
            diffs_from_ref_unit.extend([allelediff / motif_len] * count)

        numrecords += 1

    print("Producing " + args.out + "-diffref-bias.pdf ... ", end='', flush=True)
    OutputDiffRefBias(diffs_from_ref, reflens, args.out + "-diffref-bias.pdf",
                      xlim=(args.refbias_xrange_min, args.refbias_xrange_max),
                      mingts=args.refbias_mingts, metric=args.refbias_metric,
                      binsize=args.refbias_binsize)
    if len(samplelist) > 1:
        print("Done.\nProducing " + args.out + "-sample-callnum.pdf ... ", end='', flush=True)
        OutputSampleCallrate(sample_calls, args.out + "-sample-callnum.pdf")
        print("Done.")
    else:
        print("Done.\nOnly one sample, so skipping " + args.out + "-sample-callnum.pdf ...")
    # Only plot per-chromosome counts when more than one chromosome has calls
    if sum(1 for value in chrom_calls.values() if value > 0) > 1:
        print("Producing " + args.out + "-chrom-callnum.pdf ... ", end='', flush=True)
        OutputChromCallrate(chrom_calls, args.out + "-chrom-callnum.pdf")
        print("Done.\n", end='')
    else:
        print("Only one chromosome, so skipping " + args.out + "-chrom-callnum.pdf ...")
    print("Producing " + args.out + "-diffref-histogram.pdf ... ", end='', flush=True)
    OutputDiffRefHistogram(diffs_from_ref_unit, args.out + "-diffref-histogram.pdf")
    print("Done.")

    # The default quality plot gets a generic file name; explicitly
    # requested plots are named after their quality type.
    if default_quality:
        def quality_output_loc(quality_value):
            return args.out + "-quality.pdf"
    else:
        def quality_output_loc(quality_value):
            return args.out + "-quality-{}.pdf".format(quality_value)

    def announce(path, finish_previous):
        # Close the previous "Producing ..." progress line, then open a new one
        if finish_previous:
            print("Done.")
        print("Producing " + path + " ... ", end='', flush=True)

    prior_qual_plot = False
    if want_per_sample:
        outloc = quality_output_loc(_QualityTypes.per_sample.value)
        announce(outloc, prior_qual_plot)
        new_per_sample_data = [stat.mean(sample_data)
                               for sample_data in per_sample_data.values()]
        OutputQualityPerSample(new_per_sample_data, outloc)
        prior_qual_plot = True

    if want_sample_strat:
        outloc = quality_output_loc(_QualityTypes.sample_stratified.value)
        announce(outloc, prior_qual_plot)
        OutputQualitySampleStrat(sample_strat_data, outloc)
        prior_qual_plot = True

    if want_per_locus:
        outloc = quality_output_loc(_QualityTypes.per_locus.value)
        announce(outloc, prior_qual_plot)
        new_per_locus_data = [stat.mean(locus_data)
                              for locus_data in per_locus_data]
        OutputQualityPerLocus(new_per_locus_data, outloc)
        prior_qual_plot = True

    if want_locus_strat:
        outloc = quality_output_loc(_QualityTypes.locus_stratified.value)
        announce(outloc, prior_qual_plot)
        OutputQualityLocusStrat(locus_strat_data, outloc)
        prior_qual_plot = True

    if want_per_call:
        outloc = quality_output_loc(_QualityTypes.per_call.value)
        announce(outloc, prior_qual_plot)
        OutputQualityPerCall(per_call_data, outloc)

    if not any_quality:
        print("This vcf does not have quality scores, so skipping all "
              "quality plots.")
    print("Done.")

    return 0
def WriteMergedHeader(vcfw, args, readers, cmd, vcftype):
    r"""Write merged header for VCFs in args.vcfs

    Also do some checks on the VCFs to make sure merging
    is appropriate.
    Return info and format fields to use

    Parameters
    ----------
    vcfw : file object
       Writer to write the merged VCF
    args : argparse namespace
       Contains user options
    readers : list of vcf.Reader
       List of readers to merge
    cmd : str
       Command used to call this program
    vcftype : str
       Type of VCF files being merged

    Returns
    -------
    useinfo : list of (str, bool)
       List of (info field, required) to use downstream
    useformat: list of str
       List of format field strings to use downstream

    Raises
    ------
    ValueError
       If the readers do not all declare the same set of contigs

    Notes
    -----
    Returns (None, None) when no samples are found. The header lines
    above the sample line have already been written at that point.
    """

    def unique_in_order(items):
        # De-duplicate while keeping first-seen order so the header that
        # is written does not depend on set iteration order.
        seen = set()
        ordered = []
        for item in items:
            if item not in seen:
                seen.add(item)
                ordered.append(item)
        return ordered

    def get_contigs(reader):
        return set(reader.contigs.values())

    def get_alts(reader):
        return list(reader.alts.values())

    def get_sources(reader):
        # Read from the reader passed in, not from a loop variable of the
        # enclosing function.
        if "source" in reader.metadata:
            return list(reader.metadata["source"])
        return []

    # Check contigs the same for all readers
    contigs = get_contigs(readers[0])
    for other in readers[1:]:
        if get_contigs(other) != contigs:
            raise ValueError(
                "Different contigs found across VCF files. Make sure all "
                "files used the same reference. Consider using this "
                "command:\n\t"
                "bcftools reheader -f ref.fa.fai file.vcf.gz -o file_rh.vcf.gz"
            )

    # Write VCF format, commands, and contigs
    vcfw.write("##fileformat=VCFv4.1\n")

    # Update commands
    for r in readers:
        if "command" in r.metadata:
            for prior_cmd in r.metadata["command"]:
                vcfw.write("##command=" + prior_cmd + "\n")
    vcfw.write("##command=" + cmd + "\n")

    # Update sources
    sources = unique_in_order(
        src for reader in readers for src in get_sources(reader))
    for src in sources:
        vcfw.write("##source=" + src + "\n")

    # All readers declare the same contigs (checked above), so write them
    # in the order the first reader lists them.
    for contig in unique_in_order(readers[0].contigs.values()):
        # contigs in VCFs can contain more info than just ID and length
        # (such as URL)
        # even though pyvcf ignores all other fields.
        # in the future (e.g. when swapping to cyvcf2),
        # write out the entire contig not just those two fields
        vcfw.write("##contig=<ID=%s,length=%s>\n" % (contig.id, contig.length))

    # Write ALT fields if present
    alts = unique_in_order(
        alt for reader in readers for alt in get_alts(reader))
    for alt in alts:
        vcfw.write("##ALT=<ID=%s,Description=\"%s\">\n" % (alt.id, alt.desc))

    # Write INFO fields, different for each tool
    useinfo = []
    for (field, reqd) in INFOFIELDS[vcftype]:
        if field not in readers[0].infos:
            common.WARNING("Expected info field %s not found. Skipping" % field)
        else:
            vcfw.write(GetInfoString(readers[0].infos[field]) + "\n")
            useinfo.append((field, reqd))

    # Write GT header
    vcfw.write(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")

    # Write FORMAT fields, different for each tool
    useformat = []
    for field in FORMATFIELDS[vcftype]:
        if field not in readers[0].formats:
            common.WARNING("Expected format field %s not found. Skipping" % field)
        else:
            vcfw.write(GetFormatString(readers[0].formats[field]) + "\n")
            useformat.append(field)

    # Write sample list
    samples = mergeutils.GetSamples(readers,
                                    usefilenames=args.update_sample_from_file)
    if len(samples) == 0:
        return None, None
    header_fields = [
        "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"
    ]
    vcfw.write("#" + "\t".join(header_fields + samples) + "\n")
    return useinfo, useformat
def main(args):
    r"""Merge the VCF files listed in args.vcfs into args.out + ".vcf".

    Parameters
    ----------
    args : argparse namespace
        Contains user options

    Returns
    -------
    retcode : int
        0 on success, 1 on any validation, loading or merge error
    """
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    ### Check and Load VCF files ###
    vcfreaders = utils.LoadReaders(args.vcfs.split(","), checkgz=True)
    if vcfreaders is None:
        return 1
    if len(vcfreaders) == 0:
        return 1

    contigs = vcfreaders[0].contigs
    # WriteMergedHeader will confirm that the list of contigs is the same for
    # each vcf, so just pulling it from one here is fine
    chroms = list(contigs)

    ### Check inferred type of each is the same
    vcftype = mergeutils.GetAndCheckVCFType(vcfreaders, args.vcftype)

    ### Set up VCF writer ###
    # The context manager closes (and flushes) the output on every exit
    # path, including the early error returns and exceptions below.
    with open(args.out + ".vcf", "w") as vcfw:
        useinfo, useformat = WriteMergedHeader(vcfw, args, vcfreaders,
                                               " ".join(sys.argv), vcftype)

        if useinfo is None or useformat is None:
            common.WARNING("Error writing merged header. Quitting")
            return 1

        ### Walk through sorted readers, merging records as we go ###
        # A reader with no records yields None rather than raising
        # StopIteration; the loop below already skips None records.
        current_records = [next(reader, None) for reader in vcfreaders]
        # Check if contig ID is set in VCF header for all records
        done = mergeutils.DoneReading(current_records)
        while not done:
            for r, reader in zip(current_records, vcfreaders):
                if r is None:
                    continue
                if r.CHROM not in chroms:
                    common.WARNING(
                        ("Error: found a record in file {} with "
                         "chromosome '{}' which was not found in the contig list "
                         "({})").format(reader.filename, r.CHROM, ", ".join(chroms)))
                    common.WARNING(
                        "VCF files must contain a ##contig header line for each chromosome."
                    )
                    common.WARNING(
                        "If this is only a technical issue and all the vcf "
                        "files were truly built against against the "
                        "same reference, use bcftools "
                        "(https://github.com/samtools/bcftools) to fix the contigs"
                        ", e.g.: bcftools reheader -f hg19.fa.fai -o myvcf-readher.vcf.gz myvcf.vcf.gz"
                    )
                    return 1
            is_min = mergeutils.GetMinRecords(current_records, chroms)
            if args.verbose:
                mergeutils.DebugPrintRecordLocations(current_records, is_min)
            if mergeutils.CheckMin(is_min):
                return 1
            MergeRecords(vcfreaders, current_records, is_min, vcfw, args,
                         useinfo, useformat)
            current_records = mergeutils.GetNextRecords(vcfreaders,
                                                        current_records, is_min)
            done = mergeutils.DoneReading(current_records)
    return 0
def MergeRecords(readers, current_records, mergelist, vcfw, args, useinfo,
                 useformat):
    r"""Merge records from different files

    Combine every record flagged True in mergelist into a single
    output line and write it to vcfw. Nothing is written when no record
    is flagged or when the flagged records disagree on the ref allele.

    Parameters
    ----------
    readers : list of vcf.Reader
       List of readers being merged
    current_records : list of vcf.Record
       List of current records for each reader
    mergelist : list of bool
       Indicates whether to include each reader in merge
    vcfw : file
       File to write output to
    args : argparse namespace
       Contains user options
    useinfo : list of (str, bool)
       List of (info field, required) to use downstream
    useformat: list of str
       List of format field strings to use downstream
    """
    selected = [idx for idx, flag in enumerate(mergelist) if flag]
    if not selected:
        return

    # The first flagged record supplies CHROM, POS and ID
    lead = current_records[selected[0]]
    chrom = lead.CHROM
    pos = lead.POS

    alt_alleles = GetAltAlleles(current_records, mergelist)
    ref_allele = GetRefAllele(current_records, mergelist)
    if ref_allele is None:
        common.WARNING("Conflicting refs found at {}:{}. Skipping.".format(
            chrom, pos))
        return

    alt_field = "." if len(alt_alleles) == 0 else ",".join(alt_alleles)
    columns = [
        chrom,  # CHROM
        str(pos),  # POS
        GetID(lead.ID),  # ID
        ref_allele,  # REF
        alt_field,  # ALT
        ".",  # QUAL
        ".",  # FILTER
    ]

    # INFO: keep only the fields that produced a value
    info_values = [
        GetInfoItem(current_records, mergelist, field, fail=reqd)
        for (field, reqd) in useinfo
    ]
    columns.append(";".join(val for val in info_values if val is not None))

    # FORMAT - GT always comes first
    columns.append(":".join(["GT"] + useformat))

    # Sample columns: real calls for merged readers, no-calls otherwise
    alleles = [ref_allele] + alt_alleles
    for idx, flag in enumerate(mergelist):
        if flag:
            columns.extend(
                GetSampleInfo(current_records[idx], alleles, useformat))
        else:
            columns.extend([NOCALLSTRING] * len(readers[idx].samples))

    vcfw.write("\t".join(columns) + "\n")
def main(args):
    r"""Compare the calls in two VCF files and write summary tables/plots.

    Parameters
    ----------
    args : argparse namespace
        Contains user options

    Returns
    -------
    retcode : int
        0 on success, 1 on any validation or loading error
    """
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    ### Check and load VCF files ###
    vcfreaders = utils.LoadReaders([args.vcf1, args.vcf2], checkgz=True,
                                   region=args.region)
    if vcfreaders is None or len(vcfreaders) != 2:
        return 1
    contigs = vcfreaders[0].contigs
    chroms = list(contigs)

    ### Load shared samples ###
    samples = mergeutils.GetSharedSamples(vcfreaders)
    if len(samples) == 0:
        common.WARNING("No shared samples found between vcf readers")
        return 1
    if args.samples:
        # Use a context manager so the sample file handle is always closed
        with open(args.samples, "r") as samples_file:
            usesamples = {line.strip() for line in samples_file}
        samples = list(set(samples).intersection(usesamples))
    if len(samples) == 0:
        common.WARNING("No shared samples found between files")
        return 1

    ### Determine FORMAT fields we should look for ###
    if args.stratify_file is not None and args.stratify_file not in [0, 1, 2]:
        common.MSG("--stratify-file must be 0,1, or 2")
        return 1
    format_fields, format_binsizes = GetFormatFields(args.stratify_fields,
                                                     args.stratify_binsizes,
                                                     args.stratify_file,
                                                     vcfreaders)

    ### Keep track of data to summarize at the end ###
    results_dir = {
        "chrom": [],
        "start": [],
        "period": [],
        "sample": [],
        "gtstring1": [],
        "gtstring2": [],
        "gtsum1": [],
        "gtsum2": [],
        "metric-conc-seq": [],
        "metric-conc-len": [],
    }
    for ff in format_fields:
        results_dir[ff + "1"] = []
        results_dir[ff + "2"] = []

    vcftype1 = trh.GetVCFType(vcfreaders[0], args.vcftype1)
    vcftype2 = trh.GetVCFType(vcfreaders[1], args.vcftype2)

    ### Walk through sorted readers, merging records as we go ###
    # A reader with no records (e.g. an empty region) yields None instead
    # of raising StopIteration; the loop below stops on any None record.
    current_records = [next(reader, None) for reader in vcfreaders]
    is_min = mergeutils.GetMinRecords(current_records, chroms)

    done = mergeutils.DoneReading(current_records)
    num_records = 0
    while not done:
        if any(item is None for item in current_records):
            break
        if args.numrecords is not None and num_records >= args.numrecords:
            break
        if args.verbose:
            mergeutils.DebugPrintRecordLocations(current_records, is_min)
        if mergeutils.CheckMin(is_min):
            return 1
        # Only compare when both readers sit on the current minimum record.
        # all(is_min) tests each flag; all([is_min]) was true for any
        # non-empty list.
        if all(is_min):
            if (current_records[0].CHROM == current_records[1].CHROM and
                    current_records[0].POS == current_records[1].POS):
                UpdateComparisonResults(
                    trh.HarmonizeRecord(vcftype1, current_records[0]),
                    trh.HarmonizeRecord(vcftype2, current_records[1]),
                    format_fields, samples, results_dir)
        current_records = mergeutils.GetNextRecords(vcfreaders,
                                                    current_records, is_min)
        is_min = mergeutils.GetMinRecords(current_records, chroms)
        done = mergeutils.DoneReading(current_records)
        num_records += 1

    ### Load all results to a dataframe and output full results ###
    data = pd.DataFrame(results_dir)
    data.to_csv(args.out + "-callcompare.tab", sep="\t", index=False)

    ### Overall metrics ###
    OutputOverallMetrics(data, format_fields, format_binsizes,
                         args.stratify_file, args.period, args.out)
    if not args.noplot:
        OutputBubblePlot(data, args.period, args.out,
                         minval=args.bubble_min, maxval=args.bubble_max)

    ### Per-locus metrics ###
    OutputLocusMetrics(data, args.out, args.noplot)

    ### Per-sample metrics ###
    OutputSampleMetrics(data, args.out, args.noplot)

    return 0
def test_WARNING():
    r"""Smoke test: call common.WARNING twice with the same message."""
    message = "Writing a test warning"
    common.WARNING(message)
    common.WARNING(message)