def get_families(db, selected_families=None):
    """
    Query the samples table to return a list of Family objects that each
    contain all of the Subjects in a Family.

    db is handed straight to sqlite3.connect().  selected_families is an
    optional comma-separated string of family ids; when given, only those
    families are returned.  The process exits via sys.exit() if any of the
    requested ids is not a key of the mapping built by Family.from_cursor.
    """
    conn = sqlite3.connect(db)
    conn.isolation_level = None
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    # NOTE(review): the connection is left open, exactly as before; confirm
    # that Family.from_cursor fully consumes the cursor before adding close().

    families_dict = Family.from_cursor(c)

    if selected_families is None:
        return [families_dict[fam] for fam in families_dict]

    # if the user has specified a set of selected families
    # to which the analysis should be restricted, then
    # first sanity check that the family ids they specified are valid.
    requested_ids = selected_families.split(',')
    for family in requested_ids:
        if family not in families_dict:
            sys.exit("ERROR: family \"%s\" is not a valid family_id\n" % family)

    # Compare against the parsed ids, not the raw string: `fam in "11,12"`
    # is a substring test and would wrongly select a family named "1".
    requested = set(requested_ids)
    return [families_dict[fam] for fam in families_dict if fam in requested]
def set_family_info(self):
    """
    Collect the genotype filter and the family id for every family in the
    database.

    Fills self.families, self.family_ids and self.family_masks.  Families
    that were not requested through args.families get the literal mask
    'False' instead of a model-specific filter.
    """
    self.families = families = Family.from_cursor(self.gq.c).values()
    args = self.args
    model = self.model
    self.family_ids = []
    self.family_masks = []

    # Keyword arguments forwarded to the per-family model method.
    filter_opts = dict(
        only_affected=not getattr(args, "allow_unaffected", False),
        min_gq=args.min_gq,
    )
    if model == "mendel_violations":
        # mendel_violations starts from scratch: only_affected is all it gets.
        filter_opts = dict(only_affected=args.only_affected)
    elif model == "comp_het":
        filter_opts['pattern_only'] = args.pattern_only
    else:
        filter_opts['strict'] = not args.lenient

    wanted = set(args.families.split(",")) if args.families else None

    for family in families:
        if wanted is not None and family.family_id not in wanted:
            mask = 'False'
        else:
            # e.g. family.auto_rec(gt_ll, min_depth)
            build_filter = getattr(family, model)
            mask = build_filter(gt_ll=args.gt_phred_ll,
                                min_depth=args.min_sample_depth,
                                **filter_opts)
        self.family_masks.append(mask)
        self.family_ids.append(family.family_id)
def set_family_info(self):
    """
    Collect the genotype filter and the family id for every family in the
    database.

    Fills self.families, self.family_ids and self.family_masks.  Families
    that were not requested through args.families get the literal mask
    'False' instead of a model-specific filter.
    """
    self.families = families = Family.from_cursor(self.gq.c).values()
    args = self.args
    model = self.model
    self.family_ids = []
    self.family_masks = []

    # Keyword arguments forwarded to the per-family model method.
    if model == "mendel_violations":
        # The allow_unaffected lookup is still performed first, as before,
        # even though its result is replaced for this model.
        getattr(args, "allow_unaffected", False)
        model_kwargs = {'only_affected': args.only_affected}
    else:
        model_kwargs = {
            'only_affected': not getattr(args, "allow_unaffected", False),
        }
        if model == "comp_het":
            model_kwargs['pattern_only'] = args.pattern_only
        else:
            model_kwargs['strict'] = not args.lenient

    if args.families:
        requested = set(args.families.split(","))
    else:
        requested = None

    for family in families:
        is_requested = requested is None or family.family_id in requested
        if is_requested:
            # e.g. family.auto_rec(gt_ll, min_depth)
            model_method = getattr(family, model)
            family_mask = model_method(
                gt_ll=args.gt_phred_ll,
                min_depth=args.min_sample_depth,
                **model_kwargs)
        else:
            family_mask = 'False'
        self.family_masks.append(family_mask)
        self.family_ids.append(family.family_id)
def test_comp_het_one_parent_2kids():
    """
    A pair of sites must not be a candidate when a parent is HOM_REF at
    both of them.
    """
    # Position of each sample inside the genotype lists below.
    for position, sample in enumerate((mom, kid, kid2)):
        sample._i = position

    kid.dad = kid.mom = None
    # NOTE(review): mom is cleared and then immediately re-attached; the
    # clearing looks redundant but is kept as in the original.
    kid.mom = mom

    members = [mom, kid, kid2]
    efam = EvalFamily(Family(members, '2kids'))
    efam.gt_types = [Family.HOM_REF, Family.HET, Family.HET]

    # mom is HOM_REF at both sites, both kids are HET at both sites.
    site1_types = [Family.HOM_REF, Family.HET, Family.HET]
    site1_bases = ["T/T", "T/C", "T/C"]
    site2_types = [Family.HOM_REF, Family.HET, Family.HET]
    site2_bases = ["A/A", "A/C", "A/C"]
    unphased = [False] * 3

    res = efam.comp_het_pair(site1_types, site1_bases,
                             site2_types, site2_bases,
                             unphased, [False] * 3,
                             "T", "C", "A", "C",
                             fast_mode=False,
                             allow_unaffected=True)
    assert not res['candidate'], res
def test_comp_het_singleton():
    """A lone affected sample that is HET at both sites is a priority-2 candidate."""
    proband = Sample('kid', affected=True)
    efam = EvalFamily(Family([proband], 'singleton'))
    efam.gt_types = [Family.HET]

    site_types, site_bases, phases = [Family.HET], ["A/C"], [False]
    res = efam.comp_het_pair(site_types, site_bases,
                             [Family.HET], ["A/C"],
                             phases, [False],
                             "A", "C", "A", "C")

    assert res['candidate']
    assert res['priority'] == 2, res
def make_fam2():
    """Return the six-member pedigree with an affected kid, parent and grandparent."""
    # 1 affected kid, parent, grandparent:
    # kid, mom and grandma carry phenotype 2; everyone else carries 1.
    pedigree = """\
#family_id sample_id paternal_id maternal_id sex phenotype
1 dad 0 0 1 1
1 mom grandpa grandma 2 2
1 kid dad mom 1 2
1 kid2 dad mom 1 1
1 grandma 0 0 2 2
1 grandpa 0 0 1 1"""
    return Family.from_ped(pedigree)
def make_fam1():
    """Return the six-member pedigree in which only `kid` is affected."""
    # only 1 affected kid: `kid` carries phenotype 2; everyone else carries 1.
    pedigree = """\
#family_id sample_id paternal_id maternal_id sex phenotype
1 dad 0 0 1 1
1 mom grandpa grandma 2 1
1 kid dad mom 1 2
1 kid2 dad mom 1 1
1 grandma 0 0 2 1
1 grandpa 0 0 1 1"""
    return Family.from_ped(pedigree)
def candidates(self):
    """
    Generate compound-het candidate site pairs, one result per gene group.

    For every group produced by self.gen_candidates('gene'), each pair of
    sites in the group is evaluated with comp_het_pair for each family.
    Yields (grp, self.filter_candidates(pairs)) where `pairs` maps
    (site1, site2) to the list of candidate results collected for it.

    When args.families is set, only the families whose id appears in that
    comma-separated string are evaluated; self.fams always holds all of them.
    """
    args = self.args

    self.gq._connect_to_database()
    fams = self.fams = Family.from_cursor(self.gq.conn)

    if args.families:
        # Parse the requested ids once instead of once per family.
        requested = set(args.families.split(","))
        fams = {f: fam for f, fam in fams.items() if f in requested}

    for grp, li in self.gen_candidates('gene'):
        samples_w_hetpair = defaultdict(list)
        sites = []
        for row in li:
            gt_types, gt_bases, gt_phases = row['gt_types'], row['gts'], row['gt_phases']
            site = Site(row)
            site.gt_phases, site.gt_bases, site.gt_types = gt_phases, gt_bases, gt_types
            # str(site) is the key used to remember "impossible" sites.
            sites.append((str(site), site))

        for fam in fams.values():
            # if a site has been deemed "impossible", we store and then
            # skip it to avoid computational overhead on it multiple times.
            impossible_sites = set()
            # start=1 so that sites[i:] holds only the sites after site1.
            for i, (strsite1, site1) in enumerate(sites[:-1], start=1):
                if strsite1 in impossible_sites:
                    continue

                for strsite2, site2 in sites[i:]:
                    if strsite2 in impossible_sites:
                        continue

                    ch = fam.comp_het_pair(site1.gt_types, site1.gt_bases,
                                           site2.gt_types, site2.gt_bases,
                                           site1.gt_phases, site2.gt_phases,
                                           ref1=site1.row['ref'],
                                           alt1=site1.row['alt'],
                                           ref2=site2.row['ref'],
                                           alt2=site2.row['alt'],
                                           allow_unaffected=args.allow_unaffected,
                                           fast_mode=True,
                                           pattern_only=args.pattern_only)

                    if ch.get('impossible') == 'site1':
                        # site1 can never pair up for this family: stop
                        # trying partners for it.
                        impossible_sites.add(strsite1)
                        break
                    if ch.get('impossible') == 'site2':
                        impossible_sites.add(strsite2)

                    if not ch['candidate']:
                        continue

                    samples_w_hetpair[(site1, site2)].append(ch)

        yield grp, self.filter_candidates(samples_w_hetpair)
def make_fam(n_affecteds, n_unaffecteds, n_unknowns, id="xxx"):
    """
    Build a random EvalFamily for testing.

    Creates n_affecteds, n_unaffecteds and n_unknowns samples (affected set
    to True, False and None respectively), randomly wires up some parent
    links, and fills the genotype-related lists with random values.  The
    family id is 'fam_<id>'.
    """
    samples = []
    groups = (
        ('affected', True, n_affecteds),
        ('unaffected', False, n_unaffecteds),
        ('unknown', None, n_unknowns),
    )
    for label, status, count in groups:
        for i in range(count):
            # The name now matches the sample id; unaffected and unknown
            # samples used to be named 'affected_<i>' by mistake.
            sample_id = '%s_%d' % (label, i)
            samples.append(
                Sample(sample_id,
                       affected=status,
                       sex=random.randint(1, 2),
                       name=sample_id))

    # Try to give parents to roughly half of all samples.  The original count
    # added n_affecteds twice and left n_unaffecteds out.
    for _ in range(len(samples) // 2):
        sample = random.choice(samples)
        # NOTE(review): sex is passed as 1/2 above but compared to
        # 'male'/'female' here; presumably Sample normalises it - confirm.
        if random.random() < 0.9:
            try:
                sample.dad = random.choice([
                    s for s in samples if not s == sample and s.sex == 'male'
                ])
            except IndexError:
                # no eligible father in this family
                pass
        if random.random() < 0.9:
            try:
                sample.mom = random.choice([
                    s for s in samples
                    if not s == sample and s.sex == 'female'
                ])
            except IndexError:
                # no eligible mother in this family
                pass

    fam = EvalFamily(Family(samples, 'fam_%s' % id))
    n_samples = len(samples)
    fam.gt_types = [random.randrange(0, 4) for _ in range(n_samples)]
    fam.gt_depths = [random.randrange(0, 100) for _ in range(n_samples)]
    fam.gt_phred_ll_homref = [random.randrange(0, 100) for _ in range(n_samples)]
    fam.gt_phred_ll_het = [random.randrange(0, 100) for _ in range(n_samples)]
    fam.gt_phred_ll_homalt = [random.randrange(0, 100) for _ in range(n_samples)]
    fam.gt_quals = [random.randrange(5, 100) for _ in range(n_samples)]
    return fam
def test_comp_het_all_hets():
    """A trio in which everyone is HET at both sites is a priority-3 candidate."""
    trio = Family([dad, mom, kid], 'triox')
    efam = EvalFamily(trio)
    efam.gt_types = [Family.HET] * 3

    res = efam.comp_het_pair(
        [Family.HET] * 3, ["A/C"] * 3,   # site 1: genotype types, bases
        [Family.HET] * 3, ["A/C"] * 3,   # site 2: genotype types, bases
        [False] * 3, [False] * 3,        # phase flags for site 1, site 2
        "A", "C", "A", "C")

    assert res['candidate']
    assert res['priority'] == 3
def test_x_rec():
    """x_rec() holds for an affected HOM_ALT daughter of two unaffected HOM_REF parents."""
    mother = Sample('mom_1239NIH', affected=False, sex='female')
    father = Sample('dad_1240NIH', affected=False, sex='male')
    daughter = Sample('kidaff_1238NIH', affected=True, sex='female')
    daughter.mom = mother
    daughter.dad = father

    efam = EvalFamily(Family([father, mother, daughter], 'oler-trio'))

    # mom should be a carrier
    # NOTE(review): the genotypes below make mom HOM_REF, not a carrier, yet
    # x_rec() is asserted to pass - confirm what this test intends to pin.
    efam.gt_types = [Family.HOM_REF, Family.HOM_REF, Family.HOM_ALT]
    assert efam.x_rec()
def candidates(self):
    """
    Generate compound-het candidate site pairs, one result per gene group.

    For every group produced by self.gen_candidates('gene'), each pair of
    sites in the group is evaluated with comp_het_pair for each family.
    Yields (grp, self.filter_candidates(pairs)) where `pairs` maps
    (site1, site2) to the list of candidate results collected for it.

    When args.families is set, only the families whose id appears in that
    comma-separated string are evaluated; self.fams always holds all of them.
    """
    args = self.args

    self.gq._connect_to_database()
    fams = self.fams = Family.from_cursor(self.gq.c)

    if args.families:
        # Parse the requested ids once instead of once per family.
        requested = set(args.families.split(","))
        fams = {f: fam for f, fam in fams.items() if f in requested}

    for grp, li in self.gen_candidates('gene'):
        samples_w_hetpair = defaultdict(list)
        sites = []
        for row in li:
            gt_types, gt_bases, gt_phases = row['gt_types'], row['gts'], row['gt_phases']
            site = Site(row)
            site.gt_phases, site.gt_bases, site.gt_types = gt_phases, gt_bases, gt_types
            sites.append(site)

        # start=1 so that sites[i:] holds only the sites after site1.
        for i, site1 in enumerate(sites[:-1], start=1):
            for site2 in sites[i:]:
                for fam in fams.values():
                    ch = fam.comp_het_pair(
                        site1.gt_types, site1.gt_bases,
                        site2.gt_types, site2.gt_bases,
                        site1.gt_phases, site2.gt_phases,
                        ref1=site1.row['ref'],
                        alt1=site1.row['alt'],
                        ref2=site2.row['ref'],
                        alt2=site2.row['alt'],
                        allow_unaffected=args.allow_unaffected,
                        fast_mode=True,
                        pattern_only=args.pattern_only)

                    if not ch['candidate']:
                        continue

                    samples_w_hetpair[(site1, site2)].append(ch)

        yield grp, self.filter_candidates(samples_w_hetpair)
def get_families(db, selected_families=None):
    """
    Query the samples table to return a list of Family objects that each
    contain all of the Subjects in a Family.

    db is passed to database.get_session_metadata().  selected_families is
    an optional comma-separated string of family ids; when given, only those
    families are returned.

    Raises ValueError if any requested id is not a key of the mapping built
    by Family.from_cursor.
    """
    conn, _ = database.get_session_metadata(db)

    families_dict = Family.from_cursor(conn)

    if selected_families is None:
        return [families_dict[fam] for fam in families_dict]

    # if the user has specified a set of selected families
    # to which the analysis should be restricted, then
    # first sanity check that the family ids they specified are valid.
    requested_ids = selected_families.split(',')
    for family in requested_ids:
        if family not in families_dict:
            raise ValueError("Family \"%s\" is not a valid family_id\n" % family)

    # Compare against the parsed ids, not the raw string: `fam in "11,12"`
    # is a substring test and would wrongly select a family named "1".
    requested = set(requested_ids)
    return [families_dict[fam] for fam in families_dict if fam in requested]
def test_comp_het_one_parent():
    """
    Exercise comp_het_pair on a two-sample family (mom + kid) with three
    genotype combinations at the first site: both HET, mom HOM_REF / kid
    HET, and both HOM_REF.
    """
    # Position of each sample inside the genotype lists below.
    mom._i = 0
    kid._i = 1
    kid.dad = None
    kid.mom = None

    efam = EvalFamily(Family([mom, kid], 'pair_mom'))
    efam.gt_types = [Family.HET] * 2

    # Everyone HET at both sites.
    res = efam.comp_het_pair([Family.HET] * 2, ["A/C"] * 2,
                             [Family.HET] * 2, ["A/C"] * 2,
                             [False] * 2, [False] * 2,
                             "A", "C", "A", "C")
    assert res['candidate']
    assert res['priority'] == 3, res['priority']

    # mom HOM_REF at the first site.  The genotype list used to be built
    # with a stray `* 2`, giving 4 entries for a 2-sample family.
    res = efam.comp_het_pair([Family.HOM_REF, Family.HET], ["A/A", "A/C"],
                             [Family.HET, Family.HET], ["A/C"] * 2,
                             [False] * 2, [False] * 2,
                             "A", "C", "A", "C")
    assert res['candidate']
    assert res['priority'] == 2, res['priority']

    # Nobody carries the alternate allele at the first site.
    res = efam.comp_het_pair([Family.HOM_REF, Family.HOM_REF], ["A/A", "A/A"],
                             [Family.HET, Family.HET], ["A/C"] * 2,
                             [False] * 2, [False] * 2,
                             "A", "C", "A", "C")
    assert not res['candidate']
def test_x_dom_parents():
    """Check x_dom() on a trio while the parents' genotypes, dad's status and the kid's sex change."""
    mother = Sample('mom', affected=False, sex='female')
    father = Sample('dad', affected=False, sex='male')
    child = Sample('kid', affected=True, sex='female')
    child.mom = mother
    child.dad = father

    efam = EvalFamily(Family([father, mother, child], 'trio'))

    # neither parent is het
    efam.gt_types = [Family.HOM_REF, Family.HOM_REF, Family.HET]
    assert not efam.x_dom()

    # neither parent is affected
    efam.gt_types = [Family.HET, Family.HOM_REF, Family.HET]
    assert not efam.x_dom()

    # same genotypes, but dad is now affected
    father.affected = True
    assert efam.x_dom()

    # for male, only mom must be affected
    child.sex = 'male'
    assert not efam.x_dom()
def candidates(self):
    """
    Generate compound-het candidate site pairs, one result per gene group.

    For every group produced by self.gen_candidates('gene'), each pair of
    sites in the group is evaluated with comp_het_pair for each family.
    Yields (grp, self.filter_candidates(pairs)) where `pairs` maps
    (site1, site2) to the list of candidate results collected for it.

    When args.families is set, only the families whose id appears in that
    comma-separated string are evaluated; self.fams always holds all of them.
    """
    args = self.args

    self.gq._connect_to_database()
    fams = self.fams = Family.from_cursor(self.gq.conn)

    if args.families:
        # Parse the requested ids once instead of once per family.
        requested = set(args.families.split(","))
        fams = {f: fam for f, fam in fams.items() if f in requested}

    for grp, li in self.gen_candidates('gene'):
        samples_w_hetpair = defaultdict(list)
        sites = []
        for row in li:
            gt_types, gt_bases, gt_phases = row['gt_types'], row['gts'], row['gt_phases']
            site = Site(row)
            site.gt_phases, site.gt_bases, site.gt_types = gt_phases, gt_bases, gt_types
            # str(site) is the key used to remember "impossible" sites.
            sites.append((str(site), site))

        for fam in fams.values():
            # if a site has been deemed "impossible", we store and then
            # skip it to avoid computational overhead on it multiple times.
            impossible_sites = set()
            # start=1 so that sites[i:] holds only the sites after site1.
            for i, (strsite1, site1) in enumerate(sites[:-1], start=1):
                if strsite1 in impossible_sites:
                    continue

                for strsite2, site2 in sites[i:]:
                    if strsite2 in impossible_sites:
                        continue

                    ch = fam.comp_het_pair(
                        site1.gt_types, site1.gt_bases,
                        site2.gt_types, site2.gt_bases,
                        site1.gt_phases, site2.gt_phases,
                        ref1=site1.row['ref'],
                        alt1=site1.row['alt'],
                        ref2=site2.row['ref'],
                        alt2=site2.row['alt'],
                        allow_unaffected=args.allow_unaffected,
                        fast_mode=True,
                        pattern_only=args.pattern_only)

                    if ch.get('impossible') == 'site1':
                        # site1 can never pair up for this family: stop
                        # trying partners for it.
                        impossible_sites.add(strsite1)
                        break
                    if ch.get('impossible') == 'site2':
                        impossible_sites.add(strsite2)

                    if not ch['candidate']:
                        continue

                    samples_w_hetpair[(site1, site2)].append(ch)

        yield grp, self.filter_candidates(samples_w_hetpair)
from __future__ import print_function import sys from inheritance import Sample, Family, EvalFamily mom = Sample('mom', affected=False) dad = Sample('dad', affected=False) kid = Sample('kid', affected=True) kid.mom, kid.dad = mom, dad fam = Family([mom, dad, kid], 'a') def make_fam1(): # only 1 affected kid. fam = Family.from_ped("""\ #family_id sample_id paternal_id maternal_id sex phenotype 1 dad 0 0 1 1 1 mom grandpa grandma 2 1 1 kid dad mom 1 2 1 kid2 dad mom 1 1 1 grandma 0 0 2 1 1 grandpa 0 0 1 1""") return fam def make_fam2(): # 1 affected kid, parent, grandparent fam = Family.from_ped("""\ #family_id sample_id paternal_id maternal_id sex phenotype 1 dad 0 0 1 1