def test_relatedness_coefficient():
    """Check relatedness coefficients along a kid -> mom -> gma -> ggma line.

    The family also holds the kid's dad and a sample with no links to
    anyone else; only pairs involving kid, mom, dad, gma and ggma are
    queried.
    """
    from io import StringIO

    kid = Sample('fam1', 'kid', 'dad', 'mom', '2', '2')
    dad = Sample('fam1', 'dad', '-9', '-9', '1', '2')
    mom = Sample('fam1', 'mom', '-9', '-9', '2', '2')
    gma = Sample('fam1', 'gma', '-9', '-9', '2', '2')
    ggma = Sample('fam1', 'ggma', '-9', '-9', '2', '2')

    # Wire up the pedigree links explicitly (same order as always: kid's
    # parents first, then the maternal line upwards).
    kid.mom = mom
    kid.dad = dad
    mom.mom = gma
    gma.mom = ggma

    unrelated = Sample('fam1', 'un', '-9', '-9', '2', '2')

    ped = Ped(StringIO())
    ped.families['fam1'] = Family([kid, mom, dad, gma, ggma, unrelated])

    # (sample a, sample b, expected coefficient)
    expectations = (
        ("mom", "dad", 0.0),
        ("mom", "kid", 0.5),
        ("dad", "gma", 0.0),
        ("mom", "gma", 0.5),
        ("kid", "gma", 0.25),
        ("kid", "ggma", 0.125),
    )
    for first, second, wanted in expectations:
        observed = ped.relatedness_coefficient(first, second)
        assert observed == wanted, observed

    # A sample compared with itself.
    assert ped.relatedness_coefficient("mom", "mom") == 1.0
def t_ped_check():
    """Run ``Ped.ped_check`` against the mendel fixtures in three states.

    The check is run on the pedigree as loaded, after dropping the last
    sample of the first family, and after re-adding that sample under an
    id ("XDFSDFX") that differs from its original one.  Each run must
    return a pandas DataFrame.

    Returns early (silently) when pandas or cyvcf2 cannot be imported.
    """
    try:
        import pandas as pd
        import cyvcf2  # noqa: F401 -- imported only to detect availability
    except ImportError:
        return

    p = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    # Use a text path: joining HERE with a bytes literal mixes str and bytes,
    # which os.path.join rejects with TypeError on Python 3.  On Python 2 the
    # two literals are the same type, so nothing changes there.
    vcf_path = op.join(HERE, 'peddy/tests/test.mendel.vcf.gz')

    v = p.ped_check(vcf_path)
    assert isinstance(v, pd.DataFrame), v

    # remove samples
    f = list(p.families.values())[0]
    n_before = len(f.samples)
    s = f.samples[-1]
    f.samples = f.samples[:-1]
    assert n_before - 1 == len(f.samples)
    v = p.ped_check(vcf_path)
    assert isinstance(v, pd.DataFrame), v
    assert "ibs0" in v.columns

    # changed the sample id of a sample
    s.sample_id = "XDFSDFX"
    f.samples.append(s)
    v = p.ped_check(vcf_path)
    assert isinstance(v, pd.DataFrame), v
def t_ped_check():
    """Exercise ``Ped.ped_check`` on the mendel test pedigree and VCF.

    Three runs are made: on the untouched pedigree, with the last sample of
    the first family removed, and with that sample appended again after its
    ``sample_id`` was changed to "XDFSDFX".  Every run must yield a pandas
    DataFrame; the second one must also expose an "ibs0" column.

    The function returns without doing anything if pandas or cyvcf2 is not
    importable.
    """
    try:
        import pandas as pd
        import cyvcf2  # noqa: F401 -- availability probe only
    except ImportError:
        return

    ped_path = op.join(HERE, 'peddy/tests/test.mendel.ped')
    # A str component is used here; the previous bytes literal could not be
    # joined with a str directory on Python 3 (TypeError), while on Python 2
    # both spellings denote the same value.
    vcf_path = op.join(HERE, 'peddy/tests/test.mendel.vcf.gz')

    p = Ped(ped_path)
    v = p.ped_check(vcf_path)
    assert isinstance(v, pd.DataFrame), v

    # remove samples
    f = list(p.families.values())[0]
    original_count = len(f.samples)
    s = f.samples[-1]
    f.samples = f.samples[:-1]
    assert original_count - 1 == len(f.samples)
    v = p.ped_check(vcf_path)
    assert isinstance(v, pd.DataFrame), v
    assert "ibs0" in v.columns

    # changed the sample id of a sample
    s.sample_id = "XDFSDFX"
    f.samples.append(s)
    v = p.ped_check(vcf_path)
    assert isinstance(v, pd.DataFrame), v
def create_samples(self):
    """Create and fill the ``samples`` table from the PED file.

    PED samples whose normalized id is not among the VCF sample names are
    skipped and listed on stderr.  For the samples that are kept, the
    position of each one in the VCF sample list is stored in
    ``self.sample_idxs`` (as a numpy array).

    Returns the list of (normalized) names of the inserted samples.
    """
    ped = Ped(self.ped_path)
    base_cols = [
        'sample_id', 'family_id', 'name', 'paternal_id', 'maternal_id',
        'sex', 'phenotype'
    ]
    if ped.header is None:
        # a header-less PED gets the standard columns (it has no 'name')
        ped.header = [c for c in base_cols if c != 'name']

    vcf_names = [fix_sample_name(name) for name in self.vcf.samples]
    # any PED column beyond the first six is carried along as-is
    columns = base_cols + list(ped.header[6:])

    vcf_positions, rows, missing = [], [], []
    for sample in ped.samples():
        try:
            vcf_positions.append(
                vcf_names.index(fix_sample_name(sample.sample_id)))
        except ValueError:
            missing.append(sample.sample_id)
            continue

        if sample.sex == 'male':
            sex_code = '1'
        elif sample.sex == 'female':
            sex_code = '2'
        else:
            sex_code = '-9'

        if sample.affected is True:
            phenotype_code = '2'
        elif sample.affected is False:
            phenotype_code = '1'
        else:
            phenotype_code = '-9'

        # primary keys are consecutive integers over the kept samples only
        row = [
            len(rows) + 1,
            sample.family_id,
            fix_sample_name(sample.sample_id),
            fix_sample_name(str(sample.paternal_id)),
            fix_sample_name(str(sample.maternal_id)),
            sex_code,
            phenotype_code,
        ] + sample.attrs
        rows.append(row)

    if missing:
        print("not in VCF: %s" % ",".join(missing), file=sys.stderr)

    table_cols = [sql.Column('sample_id', sql.Integer, primary_key=True)]
    for position, column in enumerate(columns[1:], start=1):
        vals = None
        try:
            vals = [row[position] for row in rows]
            # each text column is sized to its longest value
            width = max(len(value) for value in vals)
            table_cols.append(sql.Column(column, Unicode(width)))
        except BaseException:
            # show the offending column before propagating the error
            print(column, vals, file=sys.stderr)
            raise

    table = sql.Table('samples', self.metadata, *table_cols)
    table.drop(checkfirst=True)
    table.create()
    self.engine.execute(table.insert(),
                        [dict(zip(columns, row)) for row in rows])

    # track the order to pull from the genotype fields.
    self.sample_idxs = np.array(vcf_positions)
    return [row[2] for row in rows]
def test_sex_check():
    """Check that ``Ped.sex_check`` returns the expected result columns.

    The test is skipped (returns immediately) when running on Python 3.
    """
    if sys.version_info[0] == 3:
        return

    p = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    df = p.sex_check(op.join(HERE, 'peddy/tests/test.mendel.vcf.gz'))

    # The "ped_sex" check used to be `assert "ped_sex", df.columns`, which
    # asserts a non-empty string and therefore could never fail.
    for column in ("predicted_sex", "ped_sex", "error"):
        assert column in df.columns, df.columns
def test_relatedness_coefficient_missing_parent():
    """Relatedness and relation checks when a parent is not in the family.

    Both kids are linked to ``dad`` and ``mom``, but the ``Family`` is built
    without ``dad`` in its sample list.
    """
    from io import StringIO

    gma = Sample('X28935', 'gma', '-9', '-9', '2', '1')
    mom = Sample('X28935', 'mom', '-9', 'gma', '2', '1')
    dad = Sample('X28935', 'dad', '-9', '-9', '1', '1')
    # The kids used to be constructed twice, the first version (without a
    # paternal id) being overwritten immediately; only the effective
    # definitions are kept.
    kid1 = Sample('X28935', 'kid1', 'dad', 'mom', '1', '1')
    kid2 = Sample('X28935', 'kid2', 'dad', 'mom', '2', '1')

    kid1.mom = mom
    kid2.mom = mom
    mom.mom = gma
    kid1.dad = dad
    kid2.dad = dad

    p = Ped(StringIO())
    # dad is left out of the family's samples
    p.families['X28935'] = Family([kid1, kid2, mom, gma])

    relation = p.relation('kid1', 'kid2')
    assert "siblings" in relation, relation

    v = p.relatedness_coefficient('kid1', 'kid2')
    assert v == 0.5, v
    v = p.relatedness_coefficient('gma', 'kid2')
    assert v == 0.25, v
    v = p.relatedness_coefficient('gma', 'kid1')
    assert v == 0.25, v
    v = p.relatedness_coefficient('gma', 'mom')
    assert v == 0.5, v
def test_relation():
    """``Ped.relation`` labels the two parents of a shared kid "mom-dad"."""
    from io import StringIO

    kid = Sample('fam1', 'kid', 'dad', 'mom', '2', '2')
    dad = Sample('fam1', 'dad', '-9', '-9', '1', '2')
    mom = Sample('fam1', 'mom', '-9', '-9', '2', '2')
    kid.mom = mom
    kid.dad = dad

    pedigree = Ped(StringIO())
    pedigree.families['fam1'] = Family([kid, mom, dad])

    label = pedigree.relation("mom", "dad")
    assert label == "mom-dad"
def create_samples(self):
    """Populate a freshly created ``samples`` table from the PED file.

    Samples from the PED whose normalized id does not occur among the VCF
    sample names are left out and reported on stderr.  The VCF index of
    every retained sample is recorded, in insertion order, in
    ``self.sample_idxs``.

    Returns the normalized names of the samples that were inserted.
    """
    ped = Ped(self.ped_path)
    standard = ["sample_id", "family_id", "name", "paternal_id", "maternal_id", "sex", "phenotype"]
    if ped.header is None:
        # no header in the PED: assume the standard columns minus "name"
        ped.header = [col for col in standard if col != "name"]

    names_in_vcf = [fix_sample_name(raw) for raw in self.vcf.samples]
    # extra PED columns (past the sixth) become extra table columns
    cols = standard + list(ped.header[6:])

    idxs = []
    rows = []
    skipped = []
    next_id = 1
    for member in ped.samples():
        try:
            idxs.append(names_in_vcf.index(fix_sample_name(member.sample_id)))
        except ValueError:
            skipped.append(member.sample_id)
            continue

        if member.sex == "male":
            sex = "1"
        elif member.sex == "female":
            sex = "2"
        else:
            sex = "-9"

        if member.affected is True:
            phenotype = "2"
        elif member.affected is False:
            phenotype = "1"
        else:
            phenotype = "-9"

        fixed_fields = [
            next_id,
            member.family_id,
            fix_sample_name(member.sample_id),
            fix_sample_name(str(member.paternal_id)),
            fix_sample_name(str(member.maternal_id)),
            sex,
            phenotype,
        ]
        rows.append(fixed_fields + member.attrs)
        next_id += 1

    if len(skipped) != 0:
        print("not in VCF: %s" % ",".join(skipped), file=sys.stderr)

    scols = [sql.Column("sample_id", sql.Integer, primary_key=True)]
    for offset, col in enumerate(cols[1:], start=1):
        vals = None
        try:
            vals = [record[offset] for record in rows]
            # the column is as wide as its longest value
            longest = max(len(item) for item in vals)
            scols.append(sql.Column(col, Unicode(longest)))
        except BaseException:
            # print the column being processed, then let the error through
            print(col, vals, file=sys.stderr)
            raise

    samples_table = sql.Table("samples", self.metadata, *scols)
    samples_table.drop(checkfirst=True)
    samples_table.create()
    payload = [dict(zip(cols, record)) for record in rows]
    self.engine.execute(samples_table.insert(), payload)

    # track the order to pull from the genotype fields.
    self.sample_idxs = np.array(idxs)
    return [record[2] for record in rows]
def test_relatedness_coefficient_missing_parent():
    """Check sibling/grandparent relatedness with ``dad`` absent from the family.

    ``dad`` is assigned as the father of both kids but is not part of the
    sample list handed to ``Family``.
    """
    from io import StringIO

    gma = Sample('X28935', 'gma', '-9', '-9', '2', '1')
    mom = Sample('X28935', 'mom', '-9', 'gma', '2', '1')
    dad = Sample('X28935', 'dad', '-9', '-9', '1', '1')
    # Earlier kid1/kid2 definitions with an unknown ('-9') father were dead
    # stores: they were replaced by these before ever being used.
    kid1 = Sample('X28935', 'kid1', 'dad', 'mom', '1', '1')
    kid2 = Sample('X28935', 'kid2', 'dad', 'mom', '2', '1')

    kid1.mom = mom
    kid2.mom = mom
    mom.mom = gma
    kid1.dad = dad
    kid2.dad = dad

    p = Ped(StringIO())
    p.families['X28935'] = Family([kid1, kid2, mom, gma])  # no dad

    # evaluate once and reuse the value in the failure message
    sib_relation = p.relation('kid1', 'kid2')
    assert "siblings" in sib_relation, sib_relation

    expected = (
        ('kid1', 'kid2', 0.5),
        ('gma', 'kid2', 0.25),
        ('gma', 'kid1', 0.25),
        ('gma', 'mom', 0.5),
    )
    for a, b, want in expected:
        v = p.relatedness_coefficient(a, b)
        assert v == want, v
def generate_trios(pedfile, f1=True):
    """
    Yield ``(kid_id, mom_id, dad_id)`` sample-id trios from a PED file.

    Given a PED file, specify whether you want to output trios w/r/t the
    F1 or F2 generation (i.e., whether the kid in each trio is an F1 or F2).

    A kid counts as F1 when its mom has no mom of her own, and as F2 when
    its mom does have one.  Kids lacking either parent are skipped in both
    modes: a trio cannot be formed without both, and the F2 path previously
    failed with AttributeError on a kid that had a mom but no dad.

    Parameters
    ----------
    pedfile : passed straight to ``peddy.Ped``.
    f1 : bool, default True
        True to yield F1 trios, False to yield F2 trios.
    """
    from peddy import Ped

    ped = Ped(pedfile)
    for kid in ped.samples():
        if not (kid.mom and kid.dad):
            continue
        if f1:
            wanted = kid.mom.mom is None
        else:
            wanted = bool(kid.mom.mom)
        if wanted:
            yield (kid.sample_id, kid.mom.sample_id, kid.dad.sample_id)
def test_trios():
    """family_4 of a.ped yields three trios, led by its affected samples."""
    pedigree = Ped(op.join(HERE, 'peddy/tests/a.ped'))
    family = pedigree.families['family_4']

    found = [trio for trio in family.trios()]
    assert len(found) == 3

    # the first element of every trio, in order, matches the affecteds
    leading = [trio[0] for trio in found]
    assert leading == list(family.affecteds)
def test_relatedness_coefficient_missing_gparent():
    """Relatedness coefficients for the test.fam / test.fam2 pedigrees.

    The same three pairs are checked against both fixture files.  The PED
    files are opened with ``with`` so the handles are closed; they used to
    be opened inline and never closed.
    """
    expected = (
        # uncle
        ('101806-101806', '101811-101811', 0.25),
        ('101806-101806', '101809-101809', 0.25),
        # parent-child
        ('101806-101806', '101653-101653', 0.5),
    )
    for fixture in ("peddy/tests/test.fam.ped", "peddy/tests/test.fam2.ped"):
        with open(os.path.join(HERE, fixture)) as handle:
            p = Ped(handle)
            # the checks stay inside the block so the handle is open
            # for as long as the Ped object is in use
            for a, b, want in expected:
                v = p.relatedness_coefficient(a, b)
                assert v == want, v
def test_json():
    """Smoke test: the JSON dump of the mendel pedigree names its family."""
    pedigree = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    dumped = pedigree.to_json()
    # Only the family id is looked for; comparing against a full expected
    # document would make the test fail if the order of dicts is changed.
    assert "CEPH1463" in dumped, dumped
def run(args):
    """Collect per-parent haplotype states at informative SNPs and call get_xo.

    Reads ``args.vcf``, ``args.ped``, ``args.exclude``, ``args.chrom``,
    ``args.inf_parent`` (optional sample id) and ``args.ab``.

    For every family, the founder female and founder male (samples with no
    parents) are taken as mom and dad, and the kids having exactly those
    parents as the sibs.  A site is informative for a parent when that
    parent is HET and the other parent is HOM_REF.  At each informative
    site that passes ``is_good_site`` for both parents and all sibs, one
    state per sib is appended to ``haps[parent_id]`` and the variant start
    to ``inf_positions[parent_id]``; both are then handed to ``get_xo``.

    A header line is printed to stdout first; progress goes to stderr.

    Fixes relative to the previous version:
      * the chrX branch read ``.sex`` from a sample-id string
        (AttributeError); the Sample is now looked up through ``smp2ped``;
      * ``args.inf_parent`` was compared with Sample objects instead of
        their ids;
      * the sib loop reused ``i``, clobbering the variant counter;
      * a parent missing from the VCF now exits with a message instead of
        silently.
    """
    print('\t'.join(['chrom', 'start', 'end', 'sample_id', 'parent_id', 'n_vars', 'hap_start']))

    vcf = VCF(args.vcf)
    ped = Ped(args.ped)
    samples = [s for s in ped.samples()]
    kids = [s for s in samples if s.mom is not None and s.dad is not None]
    fams = set([s.family_id for s in samples])
    smp2ped = dict(zip([s.sample_id for s in samples], samples))

    exclude = read_exclude(args.exclude)

    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    haps = defaultdict()
    inf_positions = defaultdict(list)

    fam_dict = defaultdict(lambda: defaultdict())
    for fam in fams:
        # founders: no recorded parents; IndexError if a family lacks one
        mom = [s for s in samples if s.family_id == fam and s.mom is None and s.dad is None and s.sex == 'female'][0]
        dad = [s for s in samples if s.family_id == fam and s.mom is None and s.dad is None and s.sex == 'male'][0]
        sibs = [s.sample_id for s in kids if s.dad == dad and s.mom == mom]
        fam_dict[fam]['mom'] = mom
        fam_dict[fam]['dad'] = dad
        fam_dict[fam]['sibs'] = ','.join(sibs)

    nused, i, report_at, t0 = 0, 0, 1000, time.time()
    for i, v in enumerate(vcf(args.chrom)):
        if i != 0 and i % report_at == 0:
            persec = i / float(time.time() - t0)
            print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" % (v.CHROM, v.POS, persec, 100.0 * nused/i, nused, i), file=sys.stderr)

        # variant-level filters: excluded regions, non-SNPs, filtered or
        # poorly called sites, multi-allelic or multi-base ALTs
        if args.exclude and len(exclude[v.CHROM].search(v.start, v.end)) > 0:
            continue
        if v.var_type != 'snp':
            continue
        if v.FILTER not in ('PASS', None):
            continue
        if v.call_rate < 0.90:
            continue
        if len(v.ALT) > 1:
            continue
        if len(v.ALT[0]) > 1:
            continue

        gts = v.gt_types
        quals = v.gt_quals
        rd, ad = v.gt_ref_depths, v.gt_alt_depths

        for fam in fam_dict:
            mom = fam_dict[fam]['mom']
            dad = fam_dict[fam]['dad']
            sibs = fam_dict[fam]['sibs'].split(',')

            # inf_parent is matched against sample ids (see the check on
            # the inferred informative parent further down)
            if args.inf_parent and args.inf_parent not in (mom.sample_id, dad.sample_id):
                continue
            try:
                mi, di = smp2idx[mom.sample_id], smp2idx[dad.sample_id]
            except KeyError:
                sys.exit("parents of family %s are not both present in the VCF" % fam)

            # ensure we're at an informative site
            if gts[mi] != HOM_REF and gts[di] != HOM_REF:
                continue
            if gts[mi] == HOM_REF and gts[di] == HOM_REF:
                continue

            inf_parent = None
            if (gts[mi] == HET and gts[di] == HOM_REF):
                inf_parent = mom.sample_id
            elif (gts[di] == HET and gts[mi] == HOM_REF):
                inf_parent = dad.sample_id
            else:
                continue

            # check that both parents are "high-quality"
            if not is_good_site(mi, quals, gts, ad, rd, het_ab=args.ab):
                continue
            if not is_good_site(di, quals, gts, ad, rd, het_ab=args.ab):
                continue

            if args.inf_parent and inf_parent != args.inf_parent:
                continue

            # catalog the "states" of each child w/r/t the informative parent
            # if kids are HETs, their state w/r/t to the INF parent is 0
            states = []
            sib_pass = []
            for k in sibs:
                k_idx = smp2idx[k]
                k_pass = is_good_site(k_idx, quals, gts, ad, rd, het_ab=args.ab)
                sib_pass.append(k_pass)
                k_state = -1
                # k is a sample id, so the sex comes from the Sample object
                if v.CHROM in ('chrX', 'X') and smp2ped[k].sex == "male":
                    k_state = 0 if gts[k_idx] == HOM_ALT else 1
                else:
                    k_state = 0 if gts[k_idx] == HET else 1
                states.append(k_state)

            if not all([x is True for x in sib_pass]):
                continue
            if sum([s < 0 for s in states]) > 0:
                continue

            # one row of sib states per informative site
            if inf_parent not in haps:
                haps[inf_parent] = np.array(states)
            else:
                haps[inf_parent] = np.vstack((haps[inf_parent], np.array(states)))
            inf_positions[inf_parent].append(v.start)
            nused += 1

    persec = i / float(time.time() - t0)
    xos = get_xo(args, haps, inf_positions)
def test_6():
    """a6.ped holds 14 samples, all with family ids starting with "fam"."""
    pedigree = Ped(op.join(HERE, 'peddy/tests/a6.ped'))

    sample_count = sum(1 for _ in pedigree.samples())
    assert sample_count == 14

    for member in pedigree.samples():
        prefix = member.family_id[:3]
        assert prefix == "fam"
def test_distant():
    """Coefficients and relation labels in the test-unknown-gma pedigree."""
    pedigree = Ped(op.join(HERE, 'peddy/tests/test-unknown-gma.ped'))

    # (sample a, sample b, expected relatedness coefficient)
    coefficient_cases = (
        ('kid1', 'cousin1', 0.125),
        ('kid1', 'aunt', 0.25),
        ('cousin1', 'aunt', 0.5),
        ('mom', 'aunt', 0.5),
    )
    for first, second, wanted in coefficient_cases:
        d = pedigree.relatedness_coefficient(first, second)
        assert d == wanted, d

    # (sample a, sample b, expected relation label)
    relation_cases = (
        ('kid1', 'cousin1', 'cousins'),
        ('kid1', 'grandma', 'grandchild'),
        ('kid1', 'aunt', 'niece/nephew'),
        # because we don't know that the uncle is related
        ('kid1', 'uncle', 'related at unknown level'),
        ('cousin1', 'mom', 'niece/nephew'),
        # because we don't know that the dad is related
        ('cousin1', 'dad', 'related at unknown level'),
    )
    for first, second, wanted in relation_cases:
        r = pedigree.relation(first, second)
        assert r == wanted, r
def test_ped():
    """a.ped parses into 4 families holding 14 samples in total."""
    pedigree = Ped(op.join(HERE, 'peddy/tests/a.ped'))

    family_count = len(pedigree.families)
    assert family_count == 4

    all_samples = [sample for sample in pedigree.samples()]
    assert len(all_samples) == 14
def test_getattr():
    """Filtering samples by ``ethnicity`` returns the 5 matching samples."""
    wanted = 'caucasianNEuropean'
    pedigree = Ped(op.join(HERE, 'peddy/tests/a.ped'))

    matches = [sample for sample in pedigree.samples(ethnicity=wanted)]
    assert len(matches) == 5

    # the attribute used as a filter is also readable on every hit
    for sample in matches:
        assert sample.ethnicity == wanted
import csv
import sys
from peddy import Ped
from collections import defaultdict

# Report kids whose genotype call matches neither parent's call.
#   argv[1]: tab-separated file; columns 1-4 are read as
#            sample id, genotype, alleles, reads
#   argv[2]: PED file

samples = list(Ped(sys.argv[2]).samples())
# only samples with both parents recorded can be checked
kids = [s for s in samples if not (s.mom is None or s.dad is None)]

gt = defaultdict()
alleles = defaultdict()
reads = defaultdict()

with open(sys.argv[1]) as handle:
    for row in csv.reader(handle, delimiter='\t'):
        sample_key = row[0]
        gt[sample_key] = row[1]
        alleles[sample_key] = row[2]
        reads[sample_key] = row[3]

md_seen = []
inh_errors = 0
total_inh = 0

for kid in kids:
    # kids of this particular father are skipped
    if kid.dad.sample_id == '8477':
        continue

    mom_id = kid.mom.sample_id
    dad_id = kid.dad.sample_id
    parent_gts = [gt[mom_id], gt[dad_id]]

    # NOTE: 'unk' genotypes (kid or parents) are not filtered out here.
    if gt[kid.sample_id] in parent_gts:
        continue

    fields = [
        kid.family_id,
        kid.sample_id,
        gt[kid.sample_id],
        gt[mom_id],
        gt[dad_id],
        alleles[kid.sample_id],
        alleles[mom_id],
        alleles[dad_id],
        reads[kid.sample_id],
    ]
    print('\t'.join(fields))
def run(args):
    """Scan a VCF for sites informative for crossover detection per family.

    Reads ``args.ped``, ``args.vcf``, ``args.exclude``, ``args.region``
    (optional), ``args.families`` (optional comma-separated family ids) and
    ``args.prefix``; ``args`` is also forwarded to ``get_family_dict`` and
    ``passes_quality_control``.

    Output:
      * ``<prefix>.sites``: one ``chrom:pos<TAB>n`` line per variant that
        passes the variant-level filters, ``n`` being the count accumulated
        over families for that variant;
      * per-family/parent handles (family-dict keys ``fh-dad`` / ``fh-mom``)
        receive one tab-separated record per informative unphased site;
      * progress and the final summary go to stderr.
    Finally ``_remove_empty`` and ``call_all`` are invoked on the families.

    Changes relative to the previous version:
      * ``del fam`` after the list comprehension was removed: on Python 3
        the comprehension variable is not visible in the function, so the
        statement raised UnboundLocalError;
      * the sites file is managed by ``with`` so it is closed on errors too;
      * the final summary no longer fails when no variant was read;
      * unused locals (``pctile1``, ``smp2gt``) were dropped.
    """
    ped = Ped(args.ped)
    # opened once just to find which PED samples are present in the VCF
    vcf = VCF(args.vcf, gts012=True)

    ped_samples = [s.sample_id for s in ped.samples()]
    vcf_samples = set(vcf.samples)
    samples = [s for s in ped_samples if s in vcf_samples]

    exclude = read_exclude(args.exclude)

    # re-open restricted to (and ordered by) the shared samples
    vcf = VCF(args.vcf, samples=samples, gts012=True)
    if args.region:
        vcf_iter = vcf(args.region)
    else:
        vcf_iter = vcf

    # build a dict of sample_id to sample index
    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    # get the Ped objects for the family of interest
    if args.families is None:
        fams = ped.families.values()
    else:
        fams = [ped.families[f] for f in args.families.split(",")]

    if len(fams) == 0:
        sys.exit('Families %s not found in ped file' % args.families)

    # create a simple dictionary of info for each family member
    fs = [get_family_dict(fam, smp2idx, args) for fam in fams]

    # allele balances are only available when the header declares "AB"
    try:
        vcf["AB"]
        has_abs = True
    except KeyError:
        has_abs = False

    nused, i, report_at, t0 = 0, 0, 20000, time.time()
    v = None  # stays None when the iterator yields nothing
    with open("%s.sites" % args.prefix, "w") as fsites:
        for i, v in enumerate(vcf_iter, start=1):
            if i % report_at == 0:
                persec = i / float(time.time() - t0)
                print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
                      (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
                      file=sys.stderr)
                # report less often as the run progresses
                if i == 20000:
                    report_at = 100000
                if i == 100000:
                    report_at = 200000
                # flush the per-family output handles along with the report
                for f in fs:
                    for k in f:
                        if k.startswith('fh'):
                            f[k].flush()
                sys.stderr.flush()

            if v.var_type != 'snp':
                continue

            # no indels / multi-allelic sites
            if len(v.ALT) > 1:
                continue

            if v.call_rate < 0.90:
                continue
            if v.FILTER is not None:
                continue

            # NOTE(review): raises TypeError if the record has no AC in INFO
            if int(v.INFO.get('AC')) == 1:
                continue

            if exclude is not None and 0 != len(exclude[v.CHROM].search(
                    v.start, v.end)):
                continue

            # expensive to get gt_bases and we only need it at the crossover.
            gt_bases = None
            gt_types, gt_quals, gt_depths = v.gt_types, v.gt_quals, v.gt_depths
            # added by tom 2019-11-13, as 1st percentile of depths on chr17
            # were often negative...
            # gt_depths[gt_depths < 0] = 0
            gt_phases = v.gt_phases
            ipctiles, pctiles = None, None
            sample_abs = None

            nsites = 0  # track the number of families that had this as an informative site.
            for f in fs:
                # is_informative only needs gt_types, so we check that first...
                add_genotype_info(f, gt_types=gt_types, gt_phases=gt_phases)

                # ############## PHASED ####################
                if all(f['gt_phase']):
                    if gt_bases is None:
                        gt_bases = v.gt_bases
                    nsites += phased_check(f, v, gt_bases)
                    continue
                # ############ END PHASED ####################

                # need exactly 1 het parent for unphased checks.
                if 1 != ((f['gt_type'][0] == HET) + (f['gt_type'][1] == HET)):
                    continue

                if not is_informative(f):
                    continue

                # now we need to add quality and depth.
                add_genotype_info(f, gt_quals=gt_quals, gt_depths=gt_depths)
                if not passes_quality_control(f, args):
                    continue

                # detect crossovers.
                for parent, (p1, p2) in [("dad", (0, 1)), ("mom", (1, 0))]:
                    if not (f['gt_type'][p1] == HET and f['gt_type'][p2] == HOM_REF):
                        continue

                    if gt_bases is None:
                        gt_bases = v.gt_bases

                    if sample_abs is None:
                        sample_abs = get_allele_balance(v, has_abs)
                    if sample_abs is None:
                        break

                    fam_abs = sample_abs[f['idxs']]
                    if all(np.isnan(fam_abs)):
                        continue

                    off = 0.31
                    # require that off <= alt/(ref+alt) <= 1-off
                    if ((fam_abs[p1] >= 1 - off) | (fam_abs[p1] <= off)):
                        continue
                    if np.any((1 - off < fam_abs[2:]) | (fam_abs[2:] <= off)):
                        continue

                    fam_bases = "\t".join(gt_bases[f['idxs']])
                    fam_abs = "|".join("%.2f" % val for val in fam_abs)

                    # calculate on first use. we found that having a low 1st pctile
                    # was a good indicator of increased chance of spurious XO even
                    # in families with decent depth.
                    if pctiles is None:
                        ipctiles = np.percentile(gt_depths, (1, 5, 10, 50, 90))
                        pctiles = "|".join("%.0f" % de for de in ipctiles)

                    fam_depths = "|".join(map(str, gt_depths[f['idxs']]))

                    nsites += 1
                    # 1 when family members 2 and 3 carry the same genotype
                    val = 1 if f['gt_type'][2] == f['gt_type'][3] else 0

                    f['fh-%s' % parent].write('\t'.join(
                        str(s) for s in [
                            v.CHROM, v.POS - 1, v.POS, f['ids'][p1],
                            f['family_id'], val, fam_bases, fam_depths,
                            "%.2f" % v.call_rate, pctiles, fam_abs
                        ]) + '\n')

            fsites.write("%s:%d\t%d\n" % (v.CHROM, v.POS, nsites))
            if nsites > 0:
                nused += 1

    if v is None:
        print("finished: no variants were read", file=sys.stderr)
    else:
        persec = i / float(time.time() - t0)
        print("finished at %s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
              (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
              file=sys.stderr)

    kept = _remove_empty(fs)
    call_all(kept, args.prefix, min_sites=20)
def test_ped():
    """Load a.ped (path relative to the working directory) and count it."""
    pedigree = Ped('peddy/tests/a.ped')

    n_families = len(pedigree.families)
    assert n_families == 4

    n_samples = sum(1 for _ in pedigree.samples())
    assert n_samples == 14
def run(pedf, region, ref, bams, min_req_alts=MIN_REQ_ALTS):
    """Annotate a streamed VCF with candidate mosaic variants per trio.

    The command built from ``CMD`` is run through the shell and its stdout
    is read line by line as VCF text.  Header lines are echoed (with a
    MOSAIC INFO declaration inserted before the ``#CHROM`` line).  A record
    is printed, with ``MOSAIC=...`` prepended to its INFO field, only when
    at least one trio has a kid with ``min_req_alts`` or more observations
    of an alternate allele for which both parents have an AO of '0'.

    Parameters
    ----------
    pedf : PED file, passed to ``Ped``.
    region, ref : not used directly; available to ``CMD`` via formatting.
    bams : iterable of str, joined with spaces before formatting ``CMD``.
    min_req_alts : int
        Minimum alternate observations required in the kid.  This argument
        used to be ignored in favour of the module-level default.

    Raises
    ------
    Exception
        If the pedigree contains no trios.
    """
    print(pedf, file=sys.stderr)
    ped = Ped(pedf)
    bams = " ".join(bams)
    # CMD is formatted with every local defined so far, so the names above
    # (pedf, region, ref, bams, min_req_alts, ped) must keep their spelling.
    # NOTE(review): the command runs with shell=True on interpolated
    # arguments -- confirm they never come from untrusted input.
    cmd = CMD.format(**locals())

    sample_names = None

    trios = []
    for f in ped.families.values():
        trios.extend(f.trios(affected=None))
    print("found: %d trios" % len(trios), file=sys.stderr)
    if len(trios) == 0:
        raise Exception("found no trios")

    # universal_newlines=True makes stdout yield text; without it Python 3
    # yields bytes, which the str comparisons and splits below cannot handle.
    p = sp.Popen(cmd, shell=True, stderr=sys.stderr, stdout=sp.PIPE,
                 universal_newlines=True)
    # make sure the child does not outlive this process
    atexit.register(p.kill)

    for i, line in enumerate(p.stdout):
        if line.startswith('#'):
            if line.startswith("#CHROM"):
                sample_names = line.rstrip().split("\t")[9:]
                print(
                    """##INFO=<ID=MOSAIC,Number=1,Type=String,Description="Pipe-delimited list of samples with evidence of mosaicism">"""
                )
            print(line, end="")
            continue

        toks = line.rstrip().split("\t")
        fmt_keys = toks[8].split(":")
        if i % 1000 == 0:
            print("mosaic: checked ...", i, file=sys.stderr)
            sys.stderr.flush()

        # per-sample dict of FORMAT key -> value for this record
        samples = {
            sample_names[k]: dict(zip(fmt_keys, t.split(":")))
            for k, t in enumerate(toks[9:])
        }

        candidates = []
        for kid, mom, dad in trios:
            try:
                mom_alts = samples[mom.sample_id]['AO'].split(",")
                if '0' not in mom_alts:
                    continue
                dad_alts = samples[dad.sample_id]['AO'].split(",")
                if '0' not in dad_alts:
                    continue

                # '00' marks an alternate allele unobserved in both parents
                parents = [mom_alts[k] + dad_alts[k]
                           for k in range(len(dad_alts))]
                if '00' not in parents:
                    continue

                skid = samples[kid.sample_id]
                kid_alts = [int(a) for a in skid['AO'].split(",")]
            except KeyError:
                # require all samples to be called.
                continue

            if not any(a >= min_req_alts and parents[k] == '00'
                       for k, a in enumerate(kid_alts)):
                continue

            candidates.append(
                "%s:%s:%s:%s" %
                (kid.sample_id, skid['RO'], skid['AO'], skid['QA']))

        if not candidates:
            continue

        toks[7] = "MOSAIC=%s;%s" % ("|".join(candidates), toks[7])
        print("\t".join(toks))
        sys.stdout.flush()