def generate_trios(pedfile, f1=True):
    """
    Yield (kid, mom, dad) sample-id trios from a PED file.

    The kid of every trio belongs to a single generation:

    * F1 -- the kid's mother has no mother of her own in the PED file;
    * F2 -- the kid's mother does have a mother in the PED file.

    Parameters
    ----------
    pedfile
        Path of the PED file; handed straight to ``peddy.Ped``.
    f1
        When true (the default) the kid of each trio is an F1, otherwise
        the kid of each trio is an F2.

    Yields
    ------
    tuple
        ``(kid.sample_id, mom.sample_id, dad.sample_id)``
    """
    from peddy import Ped

    ped = Ped(pedfile)
    for kid in ped.samples():
        # A trio needs both parents.  Kids lacking either one are skipped
        # instead of failing later on `kid.dad.sample_id`.
        if not (kid.mom and kid.dad):
            continue
        if f1:
            wanted = kid.mom.mom is None
        else:
            wanted = bool(kid.mom.mom)
        if wanted:
            yield (kid.sample_id, kid.mom.sample_id, kid.dad.sample_id)
def create_samples(self):
    """
    (Re)create the ``samples`` table from the PED file and fill it.

    Only PED samples whose (normalised) name also occurs in
    ``self.vcf.samples`` are stored; the ids of the others are reported on
    stderr.  PED columns past the sixth header entry become extra table
    columns.  Every column except the integer primary key ``sample_id`` is
    a ``Unicode`` column sized to the longest value it holds.

    Side effects: drops and recreates the ``samples`` table, inserts one
    row per stored sample and sets ``self.sample_idxs`` to the VCF index of
    each stored sample, in row order.

    Returns the list of stored sample names, in row order.

    Raises ``ValueError`` when no PED sample is present in the VCF.
    """
    ped = Ped(self.ped_path)
    cols = [
        'sample_id', 'family_id', 'name', 'paternal_id', 'maternal_id',
        'sex', 'phenotype'
    ]
    if ped.header is None:
        # no header in the PED file: fall back on the standard names.
        ped.header = [x for x in cols if x != 'name']
    cols.extend(ped.header[6:])

    # position of the first occurrence of each normalised VCF sample name,
    # so the lookup below is O(1) instead of a list scan per PED sample.
    vcf_index = {}
    for pos, vcf_name in enumerate(self.vcf.samples):
        vcf_index.setdefault(fix_sample_name(vcf_name), pos)

    idxs, rows, not_in_vcf = [], [], []
    for s in ped.samples():
        name = fix_sample_name(s.sample_id)
        if name not in vcf_index:
            not_in_vcf.append(s.sample_id)
            continue
        idxs.append(vcf_index[name])
        rows.append([
            len(rows) + 1,  # 1-based key, counting stored samples only
            s.family_id,
            name,
            fix_sample_name(str(s.paternal_id)),
            fix_sample_name(str(s.maternal_id)),
            '1' if s.sex == 'male' else '2' if s.sex == 'female' else '-9',
            '2' if s.affected is True else '1' if s.affected is False else '-9',
        ] + s.attrs)

    if not_in_vcf:
        print("not in VCF: %s" % ",".join(not_in_vcf), file=sys.stderr)
    if not rows:
        # the column widths below cannot be computed without any row.
        raise ValueError("no sample of the PED file is present in the VCF")

    scols = [sql.Column('sample_id', sql.Integer, primary_key=True)]
    for i, col in enumerate(cols[1:], start=1):
        vals = None
        try:
            vals = [r[i] for r in rows]
            width = max(len(v) for v in vals)
            scols.append(sql.Column(col, Unicode(width)))
        except Exception:
            # show the offending column before propagating the error.
            print(col, vals, file=sys.stderr)
            raise

    t = sql.Table('samples', self.metadata, *scols)
    t.drop(checkfirst=True)
    t.create()

    self.engine.execute(t.insert(), [dict(zip(cols, r)) for r in rows])

    # track the order to pull from the genotype fields.
    self.sample_idxs = np.array(idxs)
    return [r[2] for r in rows]
def create_samples(self):
    """
    (Re)create the ``samples`` table from the PED file and fill it.

    Only PED samples whose (normalised) name also occurs in
    ``self.vcf.samples`` are stored; the ids of the others are reported on
    stderr.  PED columns past the sixth header entry become extra table
    columns.  Every column except the integer primary key ``sample_id`` is
    a ``Unicode`` column sized to the longest value it holds.

    Side effects: drops and recreates the ``samples`` table, inserts one
    row per stored sample and sets ``self.sample_idxs`` to the VCF index of
    each stored sample, in row order.

    Returns the list of stored sample names, in row order.

    Raises ``ValueError`` when no PED sample is present in the VCF.
    """
    ped = Ped(self.ped_path)
    cols = [
        "sample_id",
        "family_id",
        "name",
        "paternal_id",
        "maternal_id",
        "sex",
        "phenotype",
    ]
    if ped.header is None:
        # no header in the PED file: fall back on the standard names.
        ped.header = [x for x in cols if x != "name"]
    cols.extend(ped.header[6:])

    # position of the first occurrence of each normalised VCF sample name,
    # so the lookup below is O(1) instead of a list scan per PED sample.
    vcf_index = {}
    for pos, vcf_name in enumerate(self.vcf.samples):
        vcf_index.setdefault(fix_sample_name(vcf_name), pos)

    idxs, rows, not_in_vcf = [], [], []
    for s in ped.samples():
        name = fix_sample_name(s.sample_id)
        if name not in vcf_index:
            not_in_vcf.append(s.sample_id)
            continue
        idxs.append(vcf_index[name])
        rows.append(
            [
                len(rows) + 1,  # 1-based key, counting stored samples only
                s.family_id,
                name,
                fix_sample_name(str(s.paternal_id)),
                fix_sample_name(str(s.maternal_id)),
                "1" if s.sex == "male" else "2" if s.sex == "female" else "-9",
                "2" if s.affected is True else "1" if s.affected is False else "-9",
            ]
            + s.attrs
        )

    if not_in_vcf:
        print("not in VCF: %s" % ",".join(not_in_vcf), file=sys.stderr)
    if not rows:
        # the column widths below cannot be computed without any row.
        raise ValueError("no sample of the PED file is present in the VCF")

    scols = [sql.Column("sample_id", sql.Integer, primary_key=True)]
    for i, col in enumerate(cols[1:], start=1):
        vals = None
        try:
            vals = [r[i] for r in rows]
            width = max(len(v) for v in vals)
            scols.append(sql.Column(col, Unicode(width)))
        except Exception:
            # show the offending column before propagating the error.
            print(col, vals, file=sys.stderr)
            raise

    t = sql.Table("samples", self.metadata, *scols)
    t.drop(checkfirst=True)
    t.create()

    self.engine.execute(t.insert(), [dict(zip(cols, r)) for r in rows])

    # track the order to pull from the genotype fields.
    self.sample_idxs = np.array(idxs)
    return [r[2] for r in rows]
def test_6():
    """a6.ped holds 14 samples and every family id starts with "fam"."""
    pedigree = Ped(op.join(HERE, 'peddy/tests/a6.ped'))

    n_samples = len(list(pedigree.samples()))
    assert n_samples == 14

    for member in pedigree.samples():
        prefix = member.family_id[:3]
        assert prefix == "fam"
def test_getattr():
    """Filtering samples() on ethnicity keeps exactly the 5 matching samples."""
    ped_path = op.join(HERE, 'peddy/tests/a.ped')
    pedigree = Ped(ped_path)

    matches = list(pedigree.samples(ethnicity='caucasianNEuropean'))

    assert len(matches) == 5
    for member in matches:
        assert member.ethnicity == 'caucasianNEuropean'
def test_ped():
    """a.ped parses into 4 families holding 14 samples in total."""
    ped_path = op.join(HERE, 'peddy/tests/a.ped')
    pedigree = Ped(ped_path)

    n_families = len(pedigree.families)
    assert n_families == 4

    members = list(pedigree.samples())
    assert len(members) == 14
def run(args):
    """
    Scan a VCF for crossover-informative sites and call crossovers.

    ``args`` is the parsed command-line namespace.  This function reads
    ``args.ped``, ``args.vcf``, ``args.exclude``, ``args.region``,
    ``args.families`` and ``args.prefix`` and hands ``args`` on to
    ``get_family_dict`` and ``passes_quality_control``.

    Side effects:

    * writes ``<prefix>.sites``: one line per variant that survives the
      site filters, with the number of families it was informative for;
    * writes one record per informative unphased site to the family's
      ``fh-dad`` / ``fh-mom`` handle (phased families are delegated to
      ``phased_check``);
    * reports progress on stderr;
    * finally passes the families kept by ``_remove_empty`` to ``call_all``.
    """
    ped = Ped(args.ped)

    # Open the VCF once to learn which PED samples it holds, then re-open it
    # restricted to exactly those samples.
    vcf = VCF(args.vcf, gts012=True)
    ped_samples = [s.sample_id for s in ped.samples()]
    vcf_samples = set(vcf.samples)
    samples = [s for s in ped_samples if s in vcf_samples]

    exclude = read_exclude(args.exclude)

    vcf = VCF(args.vcf, samples=samples, gts012=True)
    if args.region:
        vcf_iter = vcf(args.region)
    else:
        vcf_iter = vcf

    # build a dict of sample_id to sample index
    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    # get the Ped objects for the family of interest
    if args.families is None:
        fams = ped.families.values()
    else:
        fams = [ped.families[f] for f in args.families.split(",")]

    if len(fams) == 0:
        sys.exit('Families %s not found in ped file' % args.families)

    # create a simple dictionary of info for each family member
    # (no `del fam` afterwards: on Python 3 the comprehension variable is
    # not visible here and deleting it raises UnboundLocalError).
    fs = [get_family_dict(fam, smp2idx, args) for fam in fams]

    # does the VCF header declare an AB (allele balance) field?
    try:
        vcf["AB"]
        has_abs = True
    except KeyError:
        has_abs = False

    nused, i, report_at, t0 = 0, 0, 20000, time.time()
    v = None
    with open("%s.sites" % args.prefix, "w") as fsites:
        for i, v in enumerate(vcf_iter, start=1):
            if i % report_at == 0:
                persec = i / float(time.time() - t0)
                print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
                      (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
                      file=sys.stderr)
                # report less often as the run gets longer.
                if i == 20000:
                    report_at = 100000
                if i == 100000:
                    report_at = 200000
                for f in fs:
                    for k in f:
                        if k.startswith('fh'):
                            f[k].flush()
                sys.stderr.flush()

            # site-level filters: bi-allelic, unfiltered, well-called SNPs
            # that are not singletons and not in an excluded interval.
            if v.var_type != 'snp':
                continue
            if len(v.ALT) > 1:
                continue
            if v.call_rate < 0.90:
                continue
            if v.FILTER is not None:
                continue
            if int(v.INFO.get('AC')) == 1:
                continue
            if exclude is not None and 0 != len(exclude[v.CHROM].search(
                    v.start, v.end)):
                continue

            # expensive to get gt_bases and we only need it at the crossover.
            gt_bases = None
            gt_types, gt_quals, gt_depths = v.gt_types, v.gt_quals, v.gt_depths
            # NOTE: negative depths have been observed in the low
            # percentiles; they are deliberately left as they are.
            gt_phases = v.gt_phases
            pctiles = None
            sample_abs = None

            # track the number of families that had this as an informative
            # site.
            nsites = 0
            for f in fs:
                # is_informative only needs gt_types, so we check that
                # first...
                add_genotype_info(f, gt_types=gt_types, gt_phases=gt_phases)

                # ############## PHASED ####################
                if all(f['gt_phase']):
                    if gt_bases is None:
                        gt_bases = v.gt_bases
                    nsites += phased_check(f, v, gt_bases)
                    continue
                # ############ END PHASED ####################

                # need exactly 1 het parent for unphased checks.
                if 1 != ((f['gt_type'][0] == HET) + (f['gt_type'][1] == HET)):
                    continue

                if not is_informative(f):
                    continue

                # now we need to add quality and depth.
                add_genotype_info(f, gt_quals=gt_quals, gt_depths=gt_depths)
                if not passes_quality_control(f, args):
                    continue

                # detect crossovers: p1 is the het (informative) parent and
                # p2 the hom-ref one; index 0 is dad, index 1 is mom.
                for parent, (p1, p2) in [("dad", (0, 1)), ("mom", (1, 0))]:
                    if not (f['gt_type'][p1] == HET
                            and f['gt_type'][p2] == HOM_REF):
                        continue

                    if gt_bases is None:
                        gt_bases = v.gt_bases

                    if sample_abs is None:
                        sample_abs = get_allele_balance(v, has_abs)
                        if sample_abs is None:
                            break

                    fam_abs = sample_abs[f['idxs']]
                    if all(np.isnan(fam_abs)):
                        continue

                    off = 0.31
                    # require that off <= alt/(ref+alt) <= 1-off
                    if ((fam_abs[p1] >= 1 - off) | (fam_abs[p1] <= off)):
                        continue
                    if np.any((1 - off < fam_abs[2:]) | (fam_abs[2:] <= off)):
                        continue

                    fam_bases = "\t".join(gt_bases[f['idxs']])
                    fam_abs = "|".join("%.2f" % val for val in fam_abs)

                    # calculate on first use. we found that having a low 1st
                    # pctile was a good indicator of increased chance of
                    # spurious XO even in families with decent depth.
                    if pctiles is None:
                        ipctiles = np.percentile(gt_depths, (1, 5, 10, 50, 90))
                        pctiles = "|".join("%.0f" % de for de in ipctiles)

                    fam_depths = "|".join(map(str, gt_depths[f['idxs']]))

                    nsites += 1
                    # 1 when the genotypes at family indexes 2 and 3 agree
                    # (presumably the first two kids -- confirm against
                    # get_family_dict), 0 otherwise.
                    val = 1 if f['gt_type'][2] == f['gt_type'][3] else 0
                    f['fh-%s' % parent].write('\t'.join(
                        str(s) for s in [
                            v.CHROM, v.POS - 1, v.POS, f['ids'][p1],
                            f['family_id'], val, fam_bases, fam_depths,
                            "%.2f" % v.call_rate, pctiles, fam_abs
                        ]) + '\n')

            fsites.write("%s:%d\t%d\n" % (v.CHROM, v.POS, nsites))
            if nsites > 0:
                nused += 1

    if v is None:
        # nothing was iterated, so there is no last position to report.
        print("finished: no variants found", file=sys.stderr)
    else:
        persec = i / float(time.time() - t0)
        print("finished at %s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
              (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
              file=sys.stderr)

    kept = _remove_empty(fs)
    call_all(kept, args.prefix, min_sites=20)
def test_ped():
    """a.ped parses into 4 families holding 14 samples in total."""
    pedigree = Ped('peddy/tests/a.ped')

    n_families = len(pedigree.families)
    assert n_families == 4

    members = list(pedigree.samples())
    assert len(members) == 14
def run(args):
    """
    Collect per-parent haplotype states at informative sites and hand them
    to ``get_xo``.

    ``args`` is the parsed command-line namespace.  This function reads
    ``args.vcf``, ``args.ped``, ``args.exclude``, ``args.chrom``,
    ``args.inf_parent`` and ``args.ab`` and passes ``args`` on to
    ``get_xo``.

    A site is informative for a family when exactly one founder parent is
    HET and the other is HOM_REF.  At each such site where both parents and
    all kids pass ``is_good_site``, one state per kid (0 or 1) is appended
    to the haplotype matrix of the informative parent, and the site start
    is recorded in parallel.

    Side effects: prints the output header on stdout, reports progress on
    stderr, and exits through ``sys.exit`` when a family's parent is absent
    from the VCF.
    """
    print('\t'.join(['chrom', 'start', 'end', 'sample_id', 'parent_id',
                     'n_vars', 'hap_start']))

    vcf = VCF(args.vcf)
    ped = Ped(args.ped)

    samples = list(ped.samples())
    kids = [s for s in samples if s.mom is not None and s.dad is not None]
    fams = set(s.family_id for s in samples)
    smp2ped = dict((s.sample_id, s) for s in samples)

    exclude = read_exclude(args.exclude)

    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    # informative parent id -> matrix of kid states, one row per site.
    # (no default factory: behaves like a plain dict.)
    haps = defaultdict()
    # informative parent id -> start of every site stored in `haps`.
    inf_positions = defaultdict(list)

    # family id -> founder mom, founder dad and the ids of their kids.
    fam_dict = {}
    for fam in fams:
        founders = [s for s in samples
                    if s.family_id == fam and s.mom is None and s.dad is None]
        # the first founder of each sex is used as that family's parent.
        mom = [s for s in founders if s.sex == 'female'][0]
        dad = [s for s in founders if s.sex == 'male'][0]
        sibs = [s.sample_id for s in kids if s.dad == dad and s.mom == mom]
        if not sibs:
            # no kids to track haplotypes in.
            continue
        fam_dict[fam] = {'mom': mom, 'dad': dad, 'sibs': sibs}

    nused, i, report_at, t0 = 0, 0, 1000, time.time()
    for i, v in enumerate(vcf(args.chrom)):
        if i != 0 and i % report_at == 0:
            persec = i / float(time.time() - t0)
            print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
                  (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
                  file=sys.stderr)

        # site-level filters: passing, well-called, bi-allelic SNPs outside
        # the excluded intervals.
        if args.exclude and len(exclude[v.CHROM].search(v.start, v.end)) > 0:
            continue
        if v.var_type != 'snp':
            continue
        if v.FILTER not in ('PASS', None):
            continue
        if v.call_rate < 0.90:
            continue
        if len(v.ALT) > 1:
            continue
        if len(v.ALT[0]) > 1:
            continue

        gts = v.gt_types
        quals = v.gt_quals
        rd, ad = v.gt_ref_depths, v.gt_alt_depths

        for fam in fam_dict:
            mom = fam_dict[fam]['mom']
            dad = fam_dict[fam]['dad']
            sibs = fam_dict[fam]['sibs']

            # `args.inf_parent` is a sample id, so compare it with ids.
            if args.inf_parent and args.inf_parent not in (mom.sample_id,
                                                           dad.sample_id):
                continue

            try:
                mi, di = smp2idx[mom.sample_id], smp2idx[dad.sample_id]
            except KeyError:
                sys.exit("parents of family %s (%s, %s) are not both in the VCF"
                         % (fam, mom.sample_id, dad.sample_id))

            # ensure we're at an informative site: exactly one parent HET,
            # the other HOM_REF.
            if gts[mi] == HET and gts[di] == HOM_REF:
                inf_parent = mom.sample_id
            elif gts[di] == HET and gts[mi] == HOM_REF:
                inf_parent = dad.sample_id
            else:
                continue

            # check that both parents are "high-quality"
            if not is_good_site(mi, quals, gts, ad, rd, het_ab=args.ab):
                continue
            if not is_good_site(di, quals, gts, ad, rd, het_ab=args.ab):
                continue

            if args.inf_parent and inf_parent != args.inf_parent:
                continue

            # catalog the "states" of each child w/r/t the informative parent
            # if kids are HETs, their state w/r/t to the INF parent is 0
            states = []
            sib_pass = []
            for k in sibs:
                k_idx = smp2idx[k]
                sib_pass.append(
                    is_good_site(k_idx, quals, gts, ad, rd, het_ab=args.ab))
                # `k` is a sample id; the sex lives on the pedigree sample.
                if v.CHROM in ('chrX', 'X') and smp2ped[k].sex == "male":
                    k_state = 0 if gts[k_idx] == HOM_ALT else 1
                else:
                    k_state = 0 if gts[k_idx] == HET else 1
                states.append(k_state)

            if not all([x is True for x in sib_pass]):
                continue

            if inf_parent not in haps:
                haps[inf_parent] = np.array(states)
            else:
                haps[inf_parent] = np.vstack((haps[inf_parent],
                                              np.array(states)))
            inf_positions[inf_parent].append(v.start)
            nused += 1

    get_xo(args, haps, inf_positions)