def main(ARGS=None):
    """Screen X-chromosome variants of a VCF for newly hemizygous calls.

    Only trios whose proband is male are considered.  For every variant in
    ``args.x_chrom_interval`` that survives the variant-level filters, the
    trio genotypes are checked in one of two ways:

    * if proband, father and mother cnds files are all provided, a proband
      is a carrier when he is het or hom-alt (gt 1 or 2), his father is
      hom-ref (gt 0), his mother is het (gt 1), and each sample passes its
      own cnds file;
    * otherwise the decision is delegated to ``screens.hemi_screen`` with
      the coverage / allele-balance / phred thresholds from the command line.

    Every variant with at least one carrier is appended to ``args.out_tsv``
    and counted under ``"newlyhemi"`` for each carrier.

    Args:
        ARGS: list of command-line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        None.  Exits with status 1 if the cohort has no male-proband trio.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # Convert certain comma-delimited string args to lists.
    # NOTE(review): these four calls use the bare name `str_none_split`
    # while the loop below uses `misc.str_none_split`; both spellings are
    # kept as found -- confirm which one this module actually imports.
    args.qual_impacts = str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = str_none_split(args.min_csq_scores, ",")

    # Convert the remaining comma-delimited args to lists, or leave as None.
    arg_values = vars(args)
    for arg in ("filter_include", "af_max_fields", "vcf_info_flags_exclude"):
        arg_values[arg] = misc.str_none_split(arg_values[arg], ",")

    # Read samples from the fam file and send basic stats to stdout.
    samples_i = samples.Samples(args.in_fam)
    samples_i.print_stats()

    # Keep only trios whose proband is male.
    male_trios = {
        iid: samples_i.samples[iid]
        for iid in samples_i.trios
        if samples_i.samples[iid].gender == "M"
    }
    if not male_trios:
        print("ERROR: Can't do hemizygous screen if no "
              "trios with male proband in cohort.")
        sys.exit(1)

    # Read cnds files (each one is optional).
    var_cnds = None
    iid_cnds = None
    pid_cnds = None
    mid_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)
    if args.iid_cnds is not None:
        iid_cnds = VcfCnds(args.iid_cnds)
    if args.pid_cnds is not None:
        pid_cnds = VcfCnds(args.pid_cnds)
    if args.mid_cnds is not None:
        mid_cnds = VcfCnds(args.mid_cnds)
    use_trio_cnds = (iid_cnds is not None
                     and pid_cnds is not None
                     and mid_cnds is not None)

    # Init cyvcf2 VCF obj, get info subfields and the header for the output.
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # Create the sample -> VCF column index, then collect control indexes
    # (needed for the internal control-only allele-frequency filter).
    samples_i.get_vcf_idx(vcf.samples)
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    # Init output file.
    init_out_file(args.out_tsv,
                  force_overwrite=args.force_overwrite,
                  init_line=vcf_header_str + "\n")

    # Helper results below are compared with `== False` / `== True` on
    # purpose: the helpers are defined elsewhere, so nothing is assumed here
    # about how they signal "no decision" (e.g. None).

    # Only parse the X chromosome, since this is the only place where
    # hemizygous variants can happen.
    for vcf_variant in cyvcf2_vcf.cyvcf2_vcf(args.x_chrom_interval):

        # Assume a single allele per site; exclude sites called as '*'.
        if vcf_variant.ALT[0] == '*':
            continue

        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        # If no qualifying impact string is found in CSQ, skip.
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False:
                continue

        # If desired, derive max-impact annotations from the variant, along
        # with other user-defined max or min scores in CSQ.
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            (csqs_maximpact_list,
             max_csq_scores,
             min_csq_scores) = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")

        # Filter on internal control-only MAF; this cannot be expressed in
        # the variant cnds file.
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        if var_cnds is not None:
            # Variant cnds file provided: filter exclusively on that.
            if var_cnds.test_variant(vcf_variant) == False:
                continue
        else:
            # Otherwise, filter on user args.
            var_pass = screens.variant_screen(
                vcf_variant,
                min_qual=args.min_qual,
                filter_include=args.filter_include,
                vcf_info_flags_exclude=args.vcf_info_flags_exclude,
                internal_af_max=args.internal_af_max,
                af_max=args.af_max,
                af_max_fields=args.af_max_fields)
            if var_pass == False:
                continue

        hemi_carriers = set()
        if use_trio_cnds:
            # Fetch the genotype array once per variant rather than once
            # per trio.
            gt_types = vcf_variant.gt_types
            for iid in male_trios:
                proband = samples_i.samples[iid]
                iid_idx = proband.idx
                pid_idx = samples_i.samples[proband.pid].idx
                mid_idx = samples_i.samples[proband.mid].idx

                # All three members are needed: the original tuple left out
                # the father, so the mother check indexed past the end.
                # Proband must carry the alt allele (het or hom-alt call).
                if gt_types[iid_idx] not in (1, 2):
                    continue
                # Father must be hom-ref at the site.
                if gt_types[pid_idx] != 0:
                    continue
                # Mother must be het at the site.
                if gt_types[mid_idx] != 1:
                    continue

                # Proband and parents must pass the conditions in their
                # respective cnds files.
                if iid_cnds.test_gt(vcf_variant, iid_idx) == False:
                    continue
                if pid_cnds.test_gt(vcf_variant, pid_idx) == False:
                    continue
                if mid_cnds.test_gt(vcf_variant, mid_idx) == False:
                    continue

                hemi_carriers.add(iid)
        else:
            hemi_carriers = screens.hemi_screen(
                vcf_variant,
                samples_i,
                male_trios,
                min_coverage=args.min_coverage,
                iid_min_perc_alt=args.iid_min_perc_alt,
                pid_max_perc_alt=args.pid_max_perc_alt,
                mid_min_perc_alt=args.mid_min_perc_alt,
                iid_het_phredmax=args.iid_het_phredmax,
                pid_hom_phredmax=args.pid_hom_phredmax,
                mid_het_phredmax=args.mid_het_phredmax,
                iid_hom_phredmin=args.iid_hom_phredmin,
                pid_het_phredmin=args.pid_het_phredmin,
                mid_hom_phredmin=args.mid_hom_phredmin)

        if len(hemi_carriers) == 0:
            continue

        for iid in hemi_carriers:
            samples_i.samples[iid].varcounts["newlyhemi"] += 1

        outs = cyvcf2_variant.variant_to_list(
            vcf_variant,
            samples_i,
            hemi_carriers,
            cyvcf2_vcf.info_subfields,
            GT_VARNAMES,
            csqs_maximpact=csqs_maximpact_list,
            max_csq_scores=max_csq_scores,
            min_csq_scores=min_csq_scores,
            delim="\t")
        for out in outs:
            append_out_file(args.out_tsv, out)

    vcf.close()

    print("Screening of VCF for newly hemizygous variants complete.")
    samples_i.print_varcount_stats(var_types=["newlyhemi"])

    return
def main(ARGS=None):
    """Screen a VCF for newly homozygous variants in trio probands.

    For every variant in the requested intervals (or the whole VCF when no
    interval is given) that survives the variant-level filters, the trio
    genotypes are checked in one of two ways:

    * if proband and parent cnds files are both provided, a proband is a
      carrier when it is hom-alt (gt 2), both parents are het (gt 1), and
      proband and parents pass their respective cnds files;
    * otherwise the decision is delegated to ``screens.hom_screen`` with the
      coverage / allele-balance / phred thresholds from the command line.

    Every variant with at least one carrier is appended to ``args.out_tsv``
    and counted under ``"hom"`` for each carrier.

    Args:
        ARGS: list of command-line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        None.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # Convert certain comma-delimited string args to lists.
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # These stay comma-delimited strings on `args`; split them once here
    # instead of once per variant inside the loop.
    filter_include = None
    if args.filter_include is not None:
        filter_include = set(args.filter_include.split(","))
    info_flags_exclude = None
    if args.vcf_info_flags_exclude is not None:
        info_flags_exclude = args.vcf_info_flags_exclude.split(",")
    af_max_fields = None
    if args.af_max_fields is not None:
        af_max_fields = args.af_max_fields.split(",")

    # Read samples from the fam file and send basic stats to stdout.
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    # Read cnds files (each one is optional).
    var_cnds = None
    pro_cnds = None
    par_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)
    if args.pro_cnds is not None:
        pro_cnds = VcfCnds(args.pro_cnds)
    if args.par_cnds is not None:
        par_cnds = VcfCnds(args.par_cnds)
    use_trio_cnds = pro_cnds is not None and par_cnds is not None

    # Init cyvcf2 VCF obj, get info subfields and the header for the output.
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # Create the sample -> VCF column index, then collect control indexes
    # (needed for the internal control-only allele-frequency filter).
    samples_i.get_vcf_idx(vcf.samples)
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    # If intervals are provided (a file with one interval per line, or a
    # single interval string) parse over those, else over the whole VCF.
    if args.intervals is None:
        intervals = [""]
    elif os.path.isfile(args.intervals):
        with open(args.intervals, "r") as intervals_fh:
            intervals = [line.rstrip() for line in intervals_fh]
    else:
        intervals = [args.intervals]

    # Init output file.
    misc.init_out_file(args.out_tsv,
                       force_overwrite=args.force_overwrite,
                       init_line=vcf_header_str)

    # Helper results below are compared with `== False` / `== True` on
    # purpose: the helpers are defined elsewhere, so nothing is assumed here
    # about how they signal "no decision" (e.g. None).

    # Parse the VCF looking for newly homozygous variant calls.
    prev_chrom = None
    for vcf_variant in cyvcf2_vcf.iterator(intervals):
        if vcf_variant.CHROM != prev_chrom:
            print("Extracting newly homozygous vars from chrom "
                  + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        # Assume a single alternate allele per row; exclude sites called as
        # '*' since these are a by-product of multi-sample calling and don't
        # represent a real SNV/indel.
        if vcf_variant.ALT[0] == '*':
            continue

        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        # If no qualifying impact string is found in CSQ, skip.
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False:
                continue

        # If desired, derive max-impact annotations from the variant, along
        # with other user-defined max or min scores in CSQ.
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            (csqs_maximpact_list,
             max_csq_scores,
             min_csq_scores) = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")

        # Filter on internal control-only MAF; this cannot be expressed in
        # the variant cnds file.
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        if var_cnds is not None:
            # Variant cnds file provided: filter exclusively on that.
            if var_cnds.test_variant(vcf_variant) == False:
                continue
        else:
            # Filter on the FILTER column.  Default is PASS only (FILTER is
            # None); user-listed FILTER values are allowed through as well.
            # The FILTER value is compared as a whole string.
            variant_filter = vcf_variant.FILTER
            if variant_filter is not None:
                if (filter_include is None
                        or variant_filter not in filter_include):
                    continue

            # Filter on user-defined VCF INFO flags.
            if info_flags_exclude is not None:
                if any(vcf_variant.INFO.get(flag) == True
                       for flag in info_flags_exclude):
                    continue

            # Filter on user-defined maximum internal MAF.  A missing AF
            # annotation cannot exceed the threshold, so it passes (ordering
            # None against a number raises TypeError on Python 3).
            if args.internal_af_max is not None:
                internal_af = vcf_variant.INFO.get("AF")
                if (internal_af is not None
                        and internal_af > args.internal_af_max):
                    continue

            # Filter on user-defined maximum external MAF; fields that are
            # absent for this variant are ignored for the same reason.
            if af_max_fields is not None and args.af_max is not None:
                af_max_fail = False
                for af_max_field in af_max_fields:
                    af = vcf_variant.INFO.get(af_max_field)
                    if af is not None and af > args.af_max:
                        af_max_fail = True
                        break
                if af_max_fail:
                    continue

        hom_carriers = set()
        if use_trio_cnds:
            # Fetch the genotype array once per variant rather than once
            # per trio.
            gt_types = vcf_variant.gt_types
            for iid in samples_i.trios:
                pid = samples_i.trios[iid].pid
                mid = samples_i.trios[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx

                # Proband must be hom-alt at the site.
                if gt_types[iid_idx] != 2:
                    continue
                # Both parents must be het at the site.
                if gt_types[pid_idx] != 1 or gt_types[mid_idx] != 1:
                    continue

                # Proband and parents must pass the conditions in the
                # proband and parent cnds files, respectively.
                if pro_cnds.test_gt(vcf_variant, iid_idx) == False:
                    continue
                if par_cnds.test_gt(vcf_variant, pid_idx) == False:
                    continue
                if par_cnds.test_gt(vcf_variant, mid_idx) == False:
                    continue

                hom_carriers.add(iid)
        else:
            hom_carriers = screens.hom_screen(
                vcf_variant,
                samples_i,
                min_coverage=args.min_coverage,
                pro_min_perc_alt=args.pro_min_perc_alt,
                par_max_perc_alt=args.par_max_perc_alt,
                pro_homref_phredmin=args.pro_homref_phredmin,
                pro_het_phredmin=args.pro_het_phredmin,
                pro_homalt_phredmax=args.pro_homalt_phredmax,
                par_homref_phredmin=args.par_homref_phredmin,
                par_het_phredmax=args.par_het_phredmax,
                par_homalt_phredmin=args.par_homalt_phredmin)

        if len(hom_carriers) == 0:
            continue

        for iid in hom_carriers:
            samples_i.samples[iid].varcounts["hom"] += 1

        outs = cyvcf2_variant.variant_to_list(
            samples_i,
            hom_carriers,
            cyvcf2_vcf.info_subfields,
            GT_VARNAMES,
            csqs_maximpact=csqs_maximpact_list,
            max_csq_scores=max_csq_scores,
            min_csq_scores=min_csq_scores,
            delim="\t")
        for out in outs:
            misc.append_out_file(args.out_tsv, out)

    vcf.close()

    samples_i.print_varcount_stats(var_types=["hom"])

    return
def main(ARGS=None):
    """Screen a VCF for het variants transmitted (or not) from one parent.

    The genotype class is ``"het_trans"`` by default and ``"het_ntrans"``
    when ``args.ntrans`` is set.  For every variant in the requested
    intervals (or the whole VCF when no interval is given) that survives the
    variant-level filters, each trio is checked in one of two ways:

    * if proband, transmitting-parent and non-transmitting-parent cnds files
      are all provided, the parent with the higher alt-read fraction is
      taken as the likely transmitting parent; it must be het (gt 1), the
      other parent hom-ref (gt 0), and the proband het (transmitted mode) or
      hom-ref (non-transmitted mode); each sample must then pass its own
      cnds file;
    * otherwise the decision is delegated to ``screens.trans_screen`` with
      the coverage / allele-balance / phred thresholds from the command line.

    Every variant with at least one (parent, proband) pair is appended to
    ``args.out_tsv`` and counted under the genotype class for each proband.

    Args:
        ARGS: list of command-line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        None.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # Flag comparisons (`== True` / `== False`) and helper-result comparisons
    # are kept explicit on purpose: `parse_args` and the helpers are defined
    # elsewhere, so nothing is assumed here about non-bool values (e.g. None).

    # Set the name of the genotype class.
    gt_class = "het_trans"
    if args.ntrans == True:
        gt_class = "het_ntrans"

    # Proband genotype required by the cnds-file screen: het when looking
    # at transmitted variants, hom-ref when looking at non-transmitted ones.
    required_pro_gt = 1 if args.ntrans == False else 0

    # Convert certain comma-delimited string args to lists.
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # These stay comma-delimited strings on `args`; split them once here
    # instead of once per variant inside the loop.
    filter_include = None
    if args.filter_include is not None:
        filter_include = set(args.filter_include.split(","))
    info_flags_exclude = None
    if args.vcf_info_flags_exclude is not None:
        info_flags_exclude = args.vcf_info_flags_exclude.split(",")
    af_max_fields = None
    if args.af_max_fields is not None:
        af_max_fields = args.af_max_fields.split(",")

    # Read samples from the fam file and send basic stats to stdout.
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    # Read cnds files (each one is optional).
    var_cnds = None
    pro_cnds = None
    transpar_cnds = None
    ntranspar_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)
    if args.pro_cnds is not None:
        pro_cnds = VcfCnds(args.pro_cnds)
    if args.transpar_cnds is not None:
        transpar_cnds = VcfCnds(args.transpar_cnds)
    if args.ntranspar_cnds is not None:
        ntranspar_cnds = VcfCnds(args.ntranspar_cnds)
    use_trio_cnds = (pro_cnds is not None
                     and transpar_cnds is not None
                     and ntranspar_cnds is not None)

    # Init cyvcf2 VCF obj, get info subfields and the header for the output.
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        main_fields=["PAR_IID", "PRO_IID", "CHROM",
                     "POS", "ID", "REF", "ALT",
                     "QUAL", "FILTER"],
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # Create the sample -> VCF column index, then collect control indexes
    # (needed for the internal control-only allele-frequency filter).
    samples_i.get_vcf_idx(vcf.samples)
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    # If intervals are provided (a file with one interval per line, or a
    # single interval string) parse over those, else over the whole VCF.
    if args.intervals is None:
        intervals = [""]
    elif os.path.isfile(args.intervals):
        with open(args.intervals, "r") as intervals_fh:
            intervals = [line.rstrip() for line in intervals_fh]
    else:
        intervals = [args.intervals]

    # Init output file.
    misc.init_out_file(args.out_tsv,
                       force_overwrite=args.force_overwrite,
                       init_line=vcf_header_str)

    # Parse the VCF looking for transmitted / non-transmitted het calls.
    prev_chrom = None
    for vcf_variant in cyvcf2_vcf.iterator(intervals):
        if vcf_variant.CHROM != prev_chrom:
            print("Extracting " + gt_class + " from chrom "
                  + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        # Assume a single allele per site; exclude sites called as '*'.
        if vcf_variant.ALT[0] == '*':
            continue

        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        # If no qualifying impact string is found in CSQ, skip.
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False:
                continue

        # If desired, derive max-impact annotations from the variant, along
        # with other user-defined max or min scores in CSQ.
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            (csqs_maximpact_list,
             max_csq_scores,
             min_csq_scores) = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")

        # Filter on internal control-only MAF; this cannot be expressed in
        # the variant cnds file.
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        if var_cnds is not None:
            # Variant cnds file provided: filter exclusively on that.
            if var_cnds.test_variant(vcf_variant) == False:
                continue
        else:
            # Filter on the FILTER column.  Default is PASS only (FILTER is
            # None); user-listed FILTER values are allowed through as well.
            # The FILTER value is compared as a whole string.
            variant_filter = vcf_variant.FILTER
            if variant_filter is not None:
                if (filter_include is None
                        or variant_filter not in filter_include):
                    continue

            # Filter on user-defined VCF INFO flags.
            if info_flags_exclude is not None:
                if any(vcf_variant.INFO.get(flag) == True
                       for flag in info_flags_exclude):
                    continue

            # Filter on user-defined maximum internal MAF.  A missing AF
            # annotation cannot exceed the threshold, so it passes (ordering
            # None against a number raises TypeError on Python 3).
            if args.internal_af_max is not None:
                internal_af = vcf_variant.INFO.get("AF")
                if (internal_af is not None
                        and internal_af > args.internal_af_max):
                    continue

            # Filter on user-defined maximum external MAF; fields that are
            # absent for this variant are ignored for the same reason.
            if af_max_fields is not None and args.af_max is not None:
                af_max_fail = False
                for af_max_field in af_max_fields:
                    af = vcf_variant.INFO.get(af_max_field)
                    if af is not None and af > args.af_max:
                        af_max_fail = True
                        break
                if af_max_fail:
                    continue

        trans_carriers = set()
        if use_trio_cnds:
            # Fetch the per-sample arrays once per variant rather than once
            # per trio.
            gt_types = vcf_variant.gt_types
            gt_alt_freqs = vcf_variant.gt_alt_freqs
            for iid in samples_i.trios:
                pid = samples_i.trios[iid].pid
                mid = samples_i.trios[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx

                # The parent with the higher fraction of alt reads is taken
                # as the transmitting parent; the other one (the mother's
                # partner on ties, i.e. the father) is the non-transmitting
                # parent.
                if gt_alt_freqs[pid_idx] > gt_alt_freqs[mid_idx]:
                    transpar, transpar_idx = pid, pid_idx
                    ntranspar_idx = mid_idx
                else:
                    transpar, transpar_idx = mid, mid_idx
                    ntranspar_idx = pid_idx

                # Likely transmitting parent must be het.
                if gt_types[transpar_idx] != 1:
                    continue
                # Likely non-transmitting parent must be hom-ref.
                if gt_types[ntranspar_idx] != 0:
                    continue
                # Proband must be het (transmitted) or hom-ref
                # (non-transmitted), depending on the mode.
                if gt_types[iid_idx] != required_pro_gt:
                    continue

                # Proband must pass the conditions in the proband cnds file.
                if pro_cnds.test_gt(vcf_variant, iid_idx) == False:
                    continue
                # Likely transmitting parent must pass the transpar cnds.
                if transpar_cnds.test_gt(vcf_variant, transpar_idx) == False:
                    continue
                # Likely non-transmitting parent must pass the ntranspar
                # cnds.
                if ntranspar_cnds.test_gt(vcf_variant,
                                          ntranspar_idx) == False:
                    continue

                trans_carriers.add((transpar, iid))
        else:
            trans_carriers = screens.trans_screen(
                vcf_variant,
                samples_i,
                ntrans=args.ntrans,
                transpar_gts=set([1]),
                ntranspar_gts=set([0]),
                min_coverage=args.min_coverage,
                pro_min_perc_alt=args.pro_min_perc_alt,
                pro_max_perc_alt=args.pro_max_perc_alt,
                transpar_min_perc_alt=args.transpar_min_perc_alt,
                transpar_max_perc_alt=args.transpar_max_perc_alt,
                ntranspar_max_perc_alt=args.ntranspar_max_perc_alt,
                pro_homref_phredmin=args.pro_homref_phredmin,
                pro_het_phredmax=args.pro_het_phredmax,
                pro_homalt_phredmin=args.pro_homalt_phredmin)

        if len(trans_carriers) == 0:
            continue

        # Each carrier is a (transmitting parent, proband) pair; the count
        # is kept per proband.
        for pair in trans_carriers:
            samples_i.samples[pair[1]].varcounts[gt_class] += 1

        outs = cyvcf2_variant.variant_to_list(
            samples_i,
            trans_carriers,
            cyvcf2_vcf.info_subfields,
            GT_VARNAMES,
            csqs_maximpact=csqs_maximpact_list,
            max_csq_scores=max_csq_scores,
            min_csq_scores=min_csq_scores,
            delim="\t")
        for out in outs:
            misc.append_out_file(args.out_tsv, out)

    vcf.close()

    samples_i.print_varcount_stats(var_types=[gt_class])

    return