def _get_family_info(self):
    """
    Collect the genotype filter and labels of every usable family.

    Fills five parallel lists on ``self`` -- ``family_ids``,
    ``family_masks``, ``family_gt_labels``, ``family_gt_columns`` and
    ``family_dp_columns`` -- with one entry per family whose filter for
    ``self.model`` is neither ``None`` nor the string ``"False"``.
    """
    # name of the family method that builds the filter for each model
    filter_getters = {
        "auto_rec": "get_auto_recessive_filter",
        "auto_dom": "get_auto_dominant_filter",
        "de_novo": "get_de_novo_filter",
        "mendel_violations": "get_mendelian_violation_filter",
    }

    all_families = subjects.get_families(self.args.db)

    self.family_ids = []
    self.family_masks = []
    self.family_gt_labels = []
    self.family_gt_columns = []
    self.family_dp_columns = []

    for fam in all_families:
        getter_name = filter_getters.get(self.model)
        if getter_name is None:
            # unrecognised model: there is no filter to apply
            continue

        mask = getattr(fam, getter_name)()
        if mask == "False" or mask is None:
            continue

        self.family_masks.append(mask)
        self.family_gt_labels.append(fam.get_genotype_labels())
        self.family_gt_columns.append(fam.get_genotype_columns())
        self.family_dp_columns.append(fam.get_genotype_depths())
        self.family_ids.append(fam.family_id)
def get_auto_dominant_candidates(c):
    """
    Report candidate variants that meet an autosomal dominant
    inheritance model.

    This is a generator.  For each family it yields one header row (a
    list of column names) followed by one list of strings per variant
    that satisfies the family's dominant-model genotype filter.

    :param c: database cursor.  It is used to load the families and to
              run the variant query; rows must be addressable by column
              name.
    """
    query = "SELECT chrom, start, end, ref, alt, gene, \
                    impact, impact_severity, gt_types, gts \
             FROM variants \
             WHERE impact_severity != 'LOW'"

    for family in subjects.get_families(c):
        family_genotype_mask = family.get_auto_dominant_filter()

        # A missing filter cannot be eval'd; presumably it means the
        # family cannot be tested under this model, so skip the family
        # instead of failing on the first row.
        if family_genotype_mask is None:
            continue

        c.execute(query)

        # genotype ("gt*") columns are reported per family member below,
        # so they are left out of the generic column list
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        yield header

        family_id = str(family.family_id)

        # yield the resulting auto_dom variants for this family
        for row in c:
            # The filter and the genotype column expressions are eval'd
            # below and look these two locals up by name: do not rename
            # or remove them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # NOTE(review): eval() runs strings produced by the family
            # object; confirm they can never carry user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # first report all of the non-genotype columns
            result = [family_id]
            result.extend(str(row[col]) for col in all_query_cols)

            # now report all of the genotype columns.  A plain loop keeps
            # eval() in this function's scope, where gt_types/gts live.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))
            yield result
def get_actionable_mutations(parser, args):
    """
    Print somatic mutations in COSMIC census genes with their DGIdb info.

    The query is run twice.  The first pass collects the affected genes
    so that DGIdb can be queried once for all of them.  The second pass
    prints a header and then one tab-separated line per (variant,
    tumor/normal pair).  Families that do not consist of exactly two
    subjects are ignored.

    :param parser: unused; kept for the sub-command calling convention.
    :param args: parsed command-line options; only ``args.db`` is read.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    query = "SELECT variants.chrom, start, end, ref, alt, \
                    variants.gene, impact, is_somatic, \
                    gene_summary.in_cosmic_census \
             FROM variants, gene_summary \
             WHERE variants.is_somatic = 1 \
             AND (variants.type = 'snp' \
                 OR variants.type = 'indel') \
             AND (variants.impact_severity = 'HIGH' \
                 OR variants.impact_severity = 'MED') \
             AND variants.chrom = gene_summary.chrom \
             AND variants.gene = gene_summary.gene \
             AND gene_summary.in_cosmic_census = 1"

    # collect the relevant genes and query DGIDB
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)

    # A defaultdict without a factory behaves like a plain dict; the
    # type is kept because the object is handed to query_dgidb().
    genes = defaultdict()
    for row in gq:
        genes[row['gene']] = True

    # collect info from DGIdb
    dgidb_info = query_dgidb(genes)

    # Which sample of each pair is the tumor does not depend on the
    # variant, so work it out once instead of once per result row.
    tumor_names = []
    for pair in t_n_pairs:
        samples = pair.subjects
        if len(samples) != 2:
            continue
        tumor, normal = samples[0], samples[1]
        # swap if we guessed the tumor incorrectly
        if tumor.affected is False:
            tumor, normal = normal, tumor
        tumor_names.append(tumor.name)

    # now rerun the query and report actionable mutations per DGIDB and
    # COSMIC census.
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)

    print('\t'.join(['tum_name', 'chrom', 'start', 'end', 'ref', 'alt',
                     'gene', 'impact', 'is_somatic', 'in_cosmic_census',
                     'dgidb_info']))

    for row in gq:
        for tumor_name in tumor_names:
            print('\t'.join(str(s) for s in [tumor_name, row['chrom'],
                                             row['start'], row['end'],
                                             row['ref'], row['alt'],
                                             row['gene'], row['impact'],
                                             row['is_somatic'],
                                             row['in_cosmic_census'],
                                             str(dgidb_info[row['gene']])]))
def get_compound_hets(self):
    """
    Report candidate compound heterozygotes.

    Runs the query built by ``self.create_query()``, prints the header,
    groups the returned rows by their ``gene`` value and, per gene,
    collects for every sample the sites at which that sample is
    heterozygous.  The collected sites are passed to
    ``self.find_valid_het_pairs`` and the result, together with the
    per-family genotype labels and columns, to ``self.filter_candidates``.

    Side effect: stores ``subjects.get_subjects(args)`` on
    ``self.subjects_dict``.
    """
    args = self.args
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    self.subjects_dict = subjects.get_subjects(args)

    # run the query applying any genotype filters provided by the user.
    gq.run(self.create_query())

    # genotype labels / columns of every family, keyed by family id
    families = subjects.get_families(args.db, args.families)
    family_gt_labels, family_gt_cols = {}, {}
    for family in families:
        family_gt_labels[family.family_id] = family.get_genotype_labels()
        family_gt_cols[family.family_id] = family.get_genotype_columns()

    # output header
    print self.get_header(gq.header, is_comp_het=True)

    # Collect all of the genic heterozygotes for each sample / gene
    # NOTE(review): if groupby is itertools.groupby it only merges
    # adjacent rows, so this presumably relies on create_query()
    # returning the rows ordered by gene -- confirm.
    for gene, row_list in groupby(gq, itemgetter("gene")):
        # sample -> gene -> list of heterozygous sites; rebuilt per gene
        sample_hets = collections.defaultdict(lambda: collections.defaultdict(list))
        for row in row_list:
            gt_types, gt_bases, gt_phases = row["gt_types"], row["gts"], row["gt_phases"]
            site = Site(row)
            # track each sample that is heterozygous at this site.
            for idx, gt_type in enumerate(gt_types):
                if gt_type != HET:
                    continue
                sample = idx_to_sample[idx]
                # each sample gets its own copy so that the phase and
                # genotype set below are not shared between samples
                sample_site = copy(site)
                sample_site.phased = gt_phases[idx]
                # unphased sites are only kept when phasing is ignored
                if not sample_site.phased and not args.ignore_phasing:
                    continue
                sample_site.gt = gt_bases[idx]
                # add the site to the list of candidates for this sample/gene
                sample_hets[sample][site.row["gene"]].append(sample_site)

        # process the gene that was just collected
        # NOTE(review): these two calls are assumed to run once per gene
        # (inside the groupby loop) because sample_hets is rebuilt for
        # every gene; the earlier wording "process the last gene seen"
        # suggested otherwise -- confirm the intended nesting.
        samples_w_hetpair = self.find_valid_het_pairs(sample_hets)
        self.filter_candidates(samples_w_hetpair, family_gt_labels, family_gt_cols)
def get_de_novo_candidates(c, min_sample_depth=30):
    """
    Report candidate variants that appear to be de novo mutations in
    the child.  We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development in
    the child post-conception.

    This is a generator.  For each family it yields one header row (a
    list of column names) followed by one list of strings per variant
    that satisfies the family's de novo genotype filter and for which
    every family member's depth is at least ``min_sample_depth``.

    :param c: database cursor.  It is used to load the families and to
              run the variant query; rows must be addressable by column
              name.
    :param min_sample_depth: minimum depth required of each sample's
                             genotype.
    """
    query = "SELECT chrom, start, end, ref, alt, gene, \
                    impact, impact_severity, in_dbsnp, \
                    rs_ids, aaf_1kg_all, aaf_esp_all, \
                    clinvar_sig, clinvar_disease_name, \
                    clinvar_dbsource, gt_types, \
                    gt_depths, gts \
             FROM variants \
             WHERE impact_severity != 'LOW' \
             AND num_het = 1"

    for family in subjects.get_families(c):
        family_genotype_mask = family.get_de_novo_filter()

        # A missing filter cannot be eval'd; presumably it means the
        # family cannot be tested under this model, so skip the family
        # instead of failing on the first row.
        if family_genotype_mask is None:
            continue

        c.execute(query)

        # genotype ("gt*") columns are reported per family member below,
        # so they are left out of the generic column list
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_depth_columns = family.get_subject_depth_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()
        family_sample_dp_labels = family.get_subject_depth_labels()

        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        header.extend(family_sample_dp_labels)
        yield header

        family_id = str(family.family_id)

        # report the resulting de_novo variants for this family
        for row in c:
            # The filter and the per-sample column expressions are eval'd
            # below and look these three locals up by name: do not rename
            # or remove them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # does the variant meet a de novo model for this family?
            # if not, ignore.
            # NOTE(review): eval() runs strings produced by the family
            # object; confirm they can never carry user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage,
            # otherwise ignore.  The evaluated depths are kept so they
            # do not have to be eval'd again for the report.
            sample_depths = []
            insufficient_depth = False
            for col in family_sample_depth_columns:
                depth = eval(col)
                if int(depth) < min_sample_depth:
                    insufficient_depth = True
                    break
                sample_depths.append(str(depth))
            if insufficient_depth:
                continue

            # first report all of the non-genotype columns
            result = [family_id]
            result.extend(str(row[col]) for col in all_query_cols)

            # now report all of the genotype columns.  A plain loop keeps
            # eval() in this function's scope, where the gt_* locals live.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))

            # now report all of the depth columns
            result.extend(sample_depths)
            yield result
def get_de_novo_candidates(args):
    """
    Report candidate variants that appear to be de novo mutations in
    the child.  We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development in
    the child post-conception.

    Prints a header line and then one tab-separated line per (family,
    variant) combination that satisfies the family's de novo genotype
    filter and for which every family member's depth is at least
    ``args.min_sample_depth``.

    :param args: parsed command-line options; ``db``, ``columns``,
                 ``filter`` and ``min_sample_depth`` are read here.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    # NOTE(review): columns and filter are spliced into the SQL text
    # verbatim; that is the point of these options, but they must only
    # ever come from the person running the tool.
    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants"

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect family info: (id, filter, gt labels, gt columns, depth columns)
    family_info = []
    for family in subjects.get_families(gq.c):
        family_filter = family.get_de_novo_filter()
        # "False" can never match and None cannot be eval'd at all, so
        # such families are left out up front.
        if family_filter is None or family_filter == "False":
            continue
        family_info.append((family.family_id,
                            family_filter,
                            family.get_subject_genotype_labels(),
                            family.get_subject_genotype_columns(),
                            family.get_subject_depth_columns()))

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print("family_id\tfamily_members\tfamily_genotypes\tdepths\t%s"
          % (gq.header,))

    for row in gq:
        # The filters and the per-sample column expressions are eval'd
        # below and look these three locals up by name: do not rename or
        # remove them.
        gt_types = row['gt_types']
        gts = row['gts']
        gt_depths = row['gt_depths']

        # test the variant for each family in the db
        for fam_id, family_genotype_mask, gt_labels, gt_cols, dp_cols \
                in family_info:
            # skip if the variant doesn't meet a de novo model for this
            # family.
            # NOTE(review): eval() runs strings produced by the family
            # object; confirm they can never carry user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage,
            # otherwise ignore.  The evaluated depths are kept so they
            # do not have to be eval'd again for the report.
            sample_depths = []
            insufficient_depth = False
            for col in dp_cols:
                depth = eval(col)
                if int(depth) < args.min_sample_depth:
                    insufficient_depth = True
                    break
                sample_depths.append(str(depth))
            if insufficient_depth:
                continue

            # A plain loop (not a comprehension) keeps eval() in this
            # function's scope, where the gt_* locals live.
            sample_genotypes = []
            for col in gt_cols:
                sample_genotypes.append(str(eval(col)))

            print("%s\t%s\t%s\t%s\t%s" % (fam_id,
                                          ",".join(str(s) for s in gt_labels),
                                          ",".join(sample_genotypes),
                                          ",".join(sample_depths),
                                          row))
def get_tumor_normal_pairs(args):
    """
    Return the families stored in the database named by ``args.db``.

    A sqlite3 connection is opened with ``isolation_level`` set to
    ``None`` and ``sqlite3.Row`` as row factory; a cursor on it is handed
    to ``gemini_subjects.get_families`` and that call's result is
    returned unchanged.
    """
    # NOTE(review): the connection is never closed here; whether that is
    # required depends on how get_families() uses the cursor.
    connection = sqlite3.connect(args.db, isolation_level=None)
    connection.row_factory = sqlite3.Row
    return gemini_subjects.get_families(connection.cursor())
def get_auto_recessive_candidates(c):
    """
    Report candidate variants that meet an autosomal recessive
    inheritance model.

    This is a generator.  For each family that has a recessive-model
    filter it yields one header row (a list of column names) followed by
    one list of strings per variant that satisfies that filter.

    :param c: database cursor.  It is used to load the families and to
              run the variant query; rows must be addressable by column
              name.
    """
    query = "SELECT chrom, start, end, ref, alt, gene, \
                    impact, impact_severity, gt_types, gts \
             FROM variants \
             WHERE impact_severity != 'LOW'"

    for family in subjects.get_families(c):
        family_genotype_mask = family.get_auto_recessive_filter()

        # skip this family if it cannot meet an autosomal_recessive
        # model.  Checked before anything else so that no query is run
        # for a family that will not be reported.
        if family_genotype_mask is None:
            continue

        c.execute(query)

        # genotype ("gt*") columns are reported per family member below,
        # so they are left out of the generic column list
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        yield header

        family_id = str(family.family_id)

        # yield the resulting auto_rec variants for this family
        for row in c:
            # The filter and the genotype column expressions are eval'd
            # below and look these two locals up by name: do not rename
            # or remove them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # skip if the variant doesn't meet a recessive model for
            # this family.
            # NOTE(review): eval() runs strings produced by the family
            # object; confirm they can never carry user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # first report all of the non-genotype columns
            result = [family_id]
            result.extend(str(row[col]) for col in all_query_cols)

            # now report all of the genotype columns.  A plain loop keeps
            # eval() in this function's scope, where gt_types/gts live.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))
            yield result
def tag_somatic_mutations(args):
    """
    Identify somatic mutations in tumor/normal pairs and tag them.

    Every variant passing the depth / quality (and optional chromosome)
    limits is tested against every family that consists of exactly two
    subjects.  A variant is counted as somatic for a pair when the
    normal genotype is HOM_REF, the tumor genotype is not, both samples
    reach their minimum depth and the normal sample's alternate-allele
    evidence stays within the configured limits.  Each hit is printed as
    a tab-separated line.  Unless ``args.dry_run`` is set, ``is_somatic``
    is then set to 1 for the identified variants.

    :param args: parsed command-line options; reads ``db``, ``chrom``,
                 ``min_depth``, ``min_qual``, ``dry_run``,
                 ``min_norm_depth``, ``min_tumor_depth``,
                 ``max_norm_alt_freq`` and ``max_norm_alt_count``.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    query = "SELECT variant_id, chrom, start, end, \
                    ref, alt, gene, impact, gts, gt_types, \
                    gt_ref_depths, gt_alt_depths \
             FROM variants \
             WHERE depth >= " + str(args.min_depth) + \
            " AND qual >= " + str(args.min_qual)
    if args.chrom is not None:
        # NOTE(review): the chromosome is spliced into the SQL text
        # verbatim; it must only ever come from the person running the
        # tool.
        query += " AND chrom = \'" + args.chrom + "\'"

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print('\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq',
                         'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq',
                         'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene']))

    for row in gq:
        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:
            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor = samples[0]
            normal = samples[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue
            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic
            # mutations if in this block.
            if (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):
                tum_ref_depth = row['gt_ref_depths'][tum_idx]
                nrm_ref_depth = row['gt_ref_depths'][nrm_idx]
                tum_alt_depth = row['gt_alt_depths'][tum_idx]
                nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

                # total observed depth
                nrm_depth = nrm_alt_depth + nrm_ref_depth
                tum_depth = tum_alt_depth + tum_ref_depth

                if (nrm_depth < args.min_norm_depth
                        or tum_depth < args.min_tumor_depth):
                    continue

                # A minimum depth of zero lets samples without any reads
                # through; the allele frequency is undefined for them.
                try:
                    tum_alt_freq = float(tum_alt_depth) / \
                        (float(tum_alt_depth) + float(tum_ref_depth))
                except ZeroDivisionError:
                    # only reported, never compared
                    tum_alt_freq = 'NA'

                try:
                    nrm_alt_freq = float(nrm_alt_depth) / \
                        (float(nrm_alt_depth) + float(nrm_ref_depth))
                except ZeroDivisionError:
                    # without reads in the normal sample the frequency
                    # limit cannot be verified: do not call it somatic
                    continue

                # apply evidence thresholds.
                if (nrm_alt_freq > args.max_norm_alt_freq
                        or nrm_alt_depth > args.max_norm_alt_count):
                    continue

                somatic_counter += 1
                # (new is_somatic value, variant) -- the parameter order
                # of the UPDATE statement below
                somatic_v_ids.append((1, row['variant_id']))

                # hits are printed in dry-run and in normal mode alike
                print('\t'.join(str(s) for s in
                                [tumor.name, tum_gt, tum_alt_freq,
                                 tum_alt_depth, tum_depth,
                                 normal.name, nrm_gt, nrm_alt_freq,
                                 nrm_alt_depth, nrm_depth,
                                 row['chrom'], row['start'], row['end'],
                                 row['ref'], row['alt'], row['gene']]))

    if not args.dry_run:
        conn = sqlite3.connect(args.db)
        # no implicit transactions: every statement is committed at once
        conn.isolation_level = None
        try:
            c = conn.cursor()
            # now set the identified mutations to True.
            update_qry = "UPDATE variants SET is_somatic = ? "
            update_qry += " WHERE variant_id = ?"
            c.executemany(update_qry, somatic_v_ids)
        finally:
            conn.close()
        print("Identified and set %s somatic mutations" % (somatic_counter,))
    else:
        print("Would have identified and set %s somatic mutations"
              % (somatic_counter,))
def tag_somatic_mutations(args):
    """
    Identify somatic mutations in tumor/normal pairs and tag them.

    Every variant passing the optional depth / quality / somatic score /
    chromosome limits is tested against every family that consists of
    exactly two subjects.  A variant is counted as somatic for a pair
    when the normal genotype is HOM_REF, the tumor genotype is not, both
    samples reach their minimum depth and the normal sample's
    alternate-allele evidence stays within the configured limits.  Each
    hit is printed as a tab-separated line.  Unless ``args.dry_run`` is
    set, ``is_somatic`` is then set to 1 for the identified variants.

    :param args: parsed command-line options; reads ``db``, ``chrom``,
                 ``min_depth``, ``min_qual``, ``min_somatic_score``,
                 ``dry_run``, ``min_norm_depth``, ``min_tumor_depth``,
                 ``max_norm_alt_freq`` and ``max_norm_alt_count``.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    # optional limits; an unset (falsy) option adds nothing to the query
    depth_string, qual_string, ssc_string, chrom_string = ("", "", "", "")
    if args.min_depth:
        depth_string = " AND depth >= %s" % args.min_depth
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    if args.min_somatic_score:
        ssc_string = " AND (type='sv' \
                         OR somatic_score >= %s)" % args.min_somatic_score
    if args.chrom:
        # NOTE(review): spliced into the SQL text verbatim; it must only
        # ever come from the person running the tool.
        chrom_string = " AND chrom = '%s'" % args.chrom

    # Built unconditionally: guarding this with "args.chrom is None"
    # left the query undefined whenever a chromosome was requested.
    query = "SELECT variant_id, chrom, start, end, \
                    ref, alt, gene, impact, gts, gt_types, \
                    gt_ref_depths, gt_alt_depths \
             FROM variants \
             WHERE 1 \
             %s \
             %s \
             %s \
             %s" % (depth_string, qual_string, ssc_string, chrom_string)

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print('\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq',
                         'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq',
                         'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene']))

    for row in gq:
        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:
            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor = samples[0]
            normal = samples[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue
            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic
            # mutations if in this block.
            if (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):
                tum_ref_depth = row['gt_ref_depths'][tum_idx]
                nrm_ref_depth = row['gt_ref_depths'][nrm_idx]
                tum_alt_depth = row['gt_alt_depths'][tum_idx]
                nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

                # total observed depth
                nrm_depth = nrm_alt_depth + nrm_ref_depth
                tum_depth = tum_alt_depth + tum_ref_depth

                if (nrm_depth < args.min_norm_depth
                        or tum_depth < args.min_tumor_depth):
                    continue

                # the frequency is undefined for a sample without reads
                try:
                    tum_alt_freq = float(tum_alt_depth) / \
                        (float(tum_alt_depth) + float(tum_ref_depth))
                except ZeroDivisionError:
                    tum_alt_freq = 'NA'

                try:
                    nrm_alt_freq = float(nrm_alt_depth) / \
                        (float(nrm_alt_depth) + float(nrm_ref_depth))
                except ZeroDivisionError:
                    nrm_alt_freq = 'NA'

                # apply evidence thresholds.  An undefined normal
                # frequency counts as exceeding the limit; this is spelt
                # out instead of comparing the 'NA' string with a number.
                freq_exceeded = args.max_norm_alt_freq and (
                    nrm_alt_freq == 'NA'
                    or nrm_alt_freq > args.max_norm_alt_freq)
                count_exceeded = (args.max_norm_alt_count
                                  and nrm_alt_depth > args.max_norm_alt_count)
                if freq_exceeded or count_exceeded:
                    continue

                somatic_counter += 1
                somatic_v_ids.append((1, row['variant_id']))

                # hits are printed in dry-run and in normal mode alike
                print('\t'.join(str(s) for s in
                                [tumor.name, tum_gt, tum_alt_freq,
                                 tum_alt_depth, tum_depth,
                                 normal.name, nrm_gt, nrm_alt_freq,
                                 nrm_alt_depth, nrm_depth,
                                 row['chrom'], row['start'], row['end'],
                                 row['ref'], row['alt'], row['gene']]))

    if not args.dry_run:
        import database
        conn, metadata = database.get_session_metadata(args.db)

        # A variant that is somatic in several pairs was counted once per
        # pair, but it is still a single row in the variants table.
        variant_ids = sorted(set(v_id for _, v_id in somatic_v_ids))

        # now set the identified mutations to True.
        update_qry = "UPDATE variants SET is_somatic = 1 "
        update_qry += " WHERE variant_id IN (%s)"
        update_qry %= ",".join(str(v_id) for v_id in variant_ids)
        res = conn.execute(update_qry)
        # sanity check against the number of distinct rows; comparing
        # with the per-pair counter aborted before the commit whenever
        # two pairs shared a variant
        assert res.rowcount == len(variant_ids)
        print("Identified and set %s somatic mutations" % (somatic_counter,))
        conn.commit()
    else:
        print("Would have identified and set %s somatic mutations"
              % (somatic_counter,))
def get_auto_dominant_candidates(args):
    """
    Report candidate variants that meet an autosomal dominant
    inheritance model.

    Prints a header line and then one tab-separated line per (family,
    variant) combination that satisfies the family's dominant-model
    genotype filter.

    :param args: parsed command-line options; ``db``, ``columns`` and
                 ``filter`` are read here.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    # NOTE(review): columns and filter are spliced into the SQL text
    # verbatim; that is the point of these options, but they must only
    # ever come from the person running the tool.
    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants"

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect family info: (id, filter, gt labels, gt columns)
    family_info = []
    for family in subjects.get_families(gq.c):
        family_filter = family.get_auto_dominant_filter()
        # "False" can never match and None cannot be eval'd at all, so
        # such families are left out up front.
        if family_filter is None or family_filter == "False":
            continue
        family_info.append((family.family_id,
                            family_filter,
                            family.get_subject_genotype_labels(),
                            family.get_subject_genotype_columns()))

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print("family_id\tfamily_members\tfamily_genotypes\t%s" % (gq.header,))

    for row in gq:
        # The filters and the genotype column expressions are eval'd
        # below and look these two locals up by name: do not rename or
        # remove them.
        gt_types = row['gt_types']
        gts = row['gts']

        # test the variant for each family in the db
        for fam_id, family_genotype_mask, gt_labels, gt_cols in family_info:
            # skip if the variant doesn't meet a dominant model for this
            # family.
            # NOTE(review): eval() runs strings produced by the family
            # object; confirm they can never carry user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # A plain loop (not a comprehension) keeps eval() in this
            # function's scope, where gt_types/gts live.
            sample_genotypes = []
            for col in gt_cols:
                sample_genotypes.append(str(eval(col)))

            print("%s\t%s\t%s\t%s" % (fam_id,
                                      ",".join(str(s) for s in gt_labels),
                                      ",".join(sample_genotypes),
                                      row))
def get_de_novo_candidates(args):
    """
    Report candidate variants that appear to be de novo mutations in
    the child.  We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development in
    the child post-conception.

    Prints a header line and then one tab-separated line per (family,
    variant) combination that satisfies the family's de novo genotype
    filter and for which every family member's depth is at least
    ``args.min_sample_depth``.

    :param args: parsed command-line options; ``db``, ``columns``,
                 ``filter`` and ``min_sample_depth`` are read here.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    # NOTE(review): columns and filter are spliced into the SQL text
    # verbatim; that is the point of these options, but they must only
    # ever come from the person running the tool.
    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = (
            "SELECT *"
            + ", gts, gt_types, gt_phases, gt_depths, \
                 gt_ref_depths, gt_alt_depths, gt_quals"
            + " FROM variants"
        )

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect family info: (id, filter, gt labels, gt columns, depth columns)
    family_info = []
    for family in subjects.get_families(gq.c):
        family_filter = family.get_de_novo_filter()
        # "False" can never match and None cannot be eval'd at all, so
        # such families are left out up front.
        if family_filter is None or family_filter == "False":
            continue
        family_info.append((family.family_id,
                            family_filter,
                            family.get_subject_genotype_labels(),
                            family.get_subject_genotype_columns(),
                            family.get_subject_depth_columns()))

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print("family_id\tfamily_members\tfamily_genotypes\tdepths\t%s"
          % (gq.header,))

    for row in gq:
        # The filters and the per-sample column expressions are eval'd
        # below and look these three locals up by name: do not rename or
        # remove them.
        gt_types = row["gt_types"]
        gts = row["gts"]
        gt_depths = row["gt_depths"]

        # test the variant for each family in the db
        for fam_id, family_genotype_mask, gt_labels, gt_cols, dp_cols \
                in family_info:
            # skip if the variant doesn't meet a de novo model for this
            # family.
            # NOTE(review): eval() runs strings produced by the family
            # object; confirm they can never carry user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage,
            # otherwise ignore.  The evaluated depths are kept so they
            # do not have to be eval'd again for the report.
            sample_depths = []
            insufficient_depth = False
            for col in dp_cols:
                depth = eval(col)
                if int(depth) < args.min_sample_depth:
                    insufficient_depth = True
                    break
                sample_depths.append(str(depth))
            if insufficient_depth:
                continue

            # A plain loop (not a comprehension) keeps eval() in this
            # function's scope, where the gt_* locals live.
            sample_genotypes = []
            for col in gt_cols:
                sample_genotypes.append(str(eval(col)))

            print("%s\t%s\t%s\t%s\t%s" % (fam_id,
                                          ",".join(str(s) for s in gt_labels),
                                          ",".join(sample_genotypes),
                                          ",".join(sample_depths),
                                          row))