def __init__(self, args):
    """Store the sample objects in genotype order on ``self.samples``.

    A ``GeminiQuery`` is opened on ``args.db`` and its
    ``idx_to_sample_object`` mapping is read for every index from 0 up to
    (but not including) the number of subjects returned by
    ``get_subjects(args)``.
    """
    query = GeminiQuery(args.db)
    n_subjects = len(get_subjects(args))

    # position i in this list is the sample stored under index i
    ordered = []
    for position in range(n_subjects):
        ordered.append(query.idx_to_sample_object[position])
    self.samples = ordered
def get_fusions(args):
    """
    Identify candidate rearrangements resulting in fusion genes.

    Queries somatic structural variants (joined to ``gene_summary`` on
    chrom and gene), ordered by ``sv_event_id``, and hands candidates to
    ``report_fusion``:

    * rows whose ``sub_type`` is not ``'complex'`` are reported on their own;
    * ``'complex'`` rows with a ``sv_mate_id`` are reported together with the
      previous such row when both share the same ``sv_event_id``.

    ``args.min_qual`` and ``args.evidence_type``, when truthy, add extra
    restrictions to the WHERE clause.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    subjects_dict = subjects.get_subjects(args)

    # optional WHERE-clause fragments built from the command line args
    qual_string = ""
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    ev_type_string = ""
    if args.evidence_type:
        ev_type_string = " AND sv_evidence_type = '%s'" % args.evidence_type

    query = """SELECT variants.chrom, start, end,
                      ref, alt,
                      qual,
                      is_somatic, somatic_score,
                      type, sub_type, variants.gene,
                      sv_strand, sv_length,
                      sv_cipos_start_left,
                      sv_cipos_start_right,
                      sv_cipos_end_left,
                      sv_cipos_end_right,
                      sv_event_id, sv_mate_id,
                      sv_tool, sv_evidence_type,
                      sv_is_precise,
                      gene_summary.strand,
                      gene_summary.transcript_min_start,
                      gene_summary.transcript_max_end,
                      gene_summary.in_cosmic_census
               FROM variants, gene_summary
               WHERE is_somatic = 1
               AND type = 'sv'
               AND variants.gene is not NULL
               AND variants.chrom = gene_summary.chrom
               AND variants.gene = gene_summary.gene
               %s
               %s
               ORDER BY sv_event_id
            """ % (qual_string, ev_type_string)

    # the most recent multi-line (BND) row; None until one has been seen
    prev = None
    gq.run(query)
    for row in gq:
        # single-line variants (DEL, DUP, INV)
        if row['sub_type'] != 'complex':
            report_fusion([row], subjects_dict, args)

        # multi-line variants (BND)
        elif row['sv_mate_id']:
            # the SV event ids match, and prev is not None
            if prev and row['sv_event_id'] == prev['sv_event_id']:
                report_fusion([prev, row], subjects_dict, args)
            # shift the previous
            prev = row
def run_query(args):
    """Run the user's query and print the resulting rows.

    The query is executed through ``GeminiQuery`` using the formatter,
    row predicates and genotype/gene requirements derived from ``args``.
    When ``args.use_header`` is set and a header exists, it is printed
    first.  With ``args.dgidb`` the query is run twice: once to collect the
    genes to look up in DGIdb, and once more to print every row with the
    DGIdb information appended after a tab.
    """
    predicates = get_row_predicates(args)
    add_required_columns_to_query(args)
    formatter = select_formatter(args)
    genotypes_needed = needs_genotypes(args)
    gene_needed = needs_gene(args)
    try:
        subjects = get_subjects(args)
    except KeyError:
        # NOTE(review): falls back to "no subjects" when the lookup raises
        # KeyError; the cause is not visible here.
        subjects = []

    # extra keyword arguments for the GeminiQuery constructor
    kwargs = {}
    if args.bcolz:
        import gemini_bcolz
        kwargs['variant_id_getter'] = gemini_bcolz.filter

    def _execute():
        """Build a fresh GeminiQuery, run the user's query, and return it."""
        runner = GeminiQuery.GeminiQuery(args.db, out_format=formatter,
                                         **kwargs)
        runner.run(args.query, args.gt_filter, args.show_variant_samples,
                   args.sample_delim, predicates, genotypes_needed,
                   gene_needed, args.show_families, subjects=subjects)
        return runner

    gq = _execute()

    if args.use_header and gq.header:
        print(gq.header)

    if not args.dgidb:
        for row in gq:
            print(row)
        return

    # collect a list of all the genes that need to be queried
    # from DGIdb
    genes = defaultdict()
    for row in gq:
        genes[row['gene']] = True

    # collect info from DGIdb
    dgidb_info = query_dgidb(genes)

    # rerun the query (the cursor is now consumed).  The constructor
    # keyword arguments (e.g. variant_id_getter) are passed to the
    # constructor again rather than to run(), so both passes are
    # configured identically.
    gq = _execute()

    # report the query results with DGIdb info added at the end.
    for row in gq:
        print(str(row) + "\t" + str(dgidb_info[row['gene']]))
def get_compound_hets(args):
    """
    Report candidate compound heterozygotes.

    Rows from ``create_query(args)`` are consumed in order.  Heterozygous
    sites are accumulated per sample and gene; whenever the ``gene`` column
    changes (and once more after the last row) the accumulated sites are
    passed through ``find_valid_het_pairs`` and ``filter_candidates``.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    # run the query applying any genotype filters provided by the user.
    gq.run(create_query(args))

    def _empty_hets():
        # sample -> gene -> list of candidate sites
        return collections.defaultdict(
            lambda: collections.defaultdict(list))

    sample_hets = _empty_hets()
    last_gene = None
    comp_het_counter = 0

    # output header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    for row in gq:
        types = row['gt_types']
        bases = row['gts']
        phases = row['gt_phases']
        gene = row['gene']

        # a new gene starts: flush what was gathered for the previous one
        if last_gene is not None and gene != last_gene:
            pairs = find_valid_het_pairs(args, sample_hets)
            comp_het_counter = filter_candidates(args, pairs,
                                                 subjects_dict,
                                                 comp_het_counter)
            sample_hets = _empty_hets()

        site = Site(row)
        for idx, gt_type in enumerate(types):
            if gt_type != HET:
                continue
            sample = idx_to_sample[idx]
            sample_site = copy(site)
            sample_site.phased = phases[idx]
            # unphased genotypes only count when phasing is ignored
            if not (sample_site.phased or args.ignore_phasing):
                continue
            sample_site.gt = bases[idx]
            sample_hets[sample][site.row['gene']].append(sample_site)

        last_gene = gene

    # flush the final gene
    pairs = find_valid_het_pairs(args, sample_hets)
    comp_het_counter = filter_candidates(args, pairs, subjects_dict,
                                         comp_het_counter)
def __init__(self, args):
    """Prepare per-value sets of subjects for the carrier summary column.

    ``args.carrier_summary`` names an attribute of the subject objects
    returned by ``get_subjects(args)``.

    Attributes set:
        carrier_summary: the attribute name taken from ``args``.
        column_types: the distinct non-None values of that attribute.
        column_counters: maps each value in ``column_types`` to the set of
            subject keys having that value, plus an empty set under
            ``None``.
    """
    subjects = get_subjects(args)
    self.carrier_summary = args.carrier_summary

    observed = set(getattr(subject, self.carrier_summary)
                   for subject in subjects.values())
    # None is left out of the column values: otherwise the loop below would
    # overwrite the empty set that is reserved under the None key.
    self.column_types = [value for value in observed if value is not None]

    self.column_counters = {None: set()}
    for ct in self.column_types:
        self.column_counters[ct] = set(
            key for (key, subject) in subjects.items()
            if getattr(subject, self.carrier_summary) == ct)
def get_compound_hets(args):
    """
    Report candidate compound heterozygotes.

    Prints a header line, then walks the rows of ``create_query(args)``,
    gathering heterozygous sites per sample and gene.  Each time the gene
    of the current row differs from the previous row's gene, and again
    after the final row, the gathered sites are evaluated with
    ``find_valid_het_pairs`` / ``filter_candidates``.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    # run the query applying any genotype filters provided by the user.
    gq.run(create_query(args))

    def _evaluate(hets, counter):
        """Evaluate the hets gathered so far; return the new counter."""
        with_pairs = find_valid_het_pairs(args, hets)
        return filter_candidates(args, with_pairs, subjects_dict, counter)

    sample_hets = collections.defaultdict(
        lambda: collections.defaultdict(list))
    prev_gene = None
    comp_het_counter = 0

    # output header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    # Collect all of the genic heterozygotes for each sample / gene
    for row in gq:
        gt_types = row['gt_types']
        gt_bases = row['gts']
        gt_phases = row['gt_phases']
        curr_gene = row['gene']

        gene_changed = curr_gene != prev_gene and prev_gene is not None
        if gene_changed:
            comp_het_counter = _evaluate(sample_hets, comp_het_counter)
            # start from scratch for the next gene
            sample_hets = collections.defaultdict(
                lambda: collections.defaultdict(list))

        site = Site(row)
        het_positions = [pos for pos, gt_type in enumerate(gt_types)
                         if gt_type == HET]
        for pos in het_positions:
            sample = idx_to_sample[pos]
            candidate = copy(site)
            candidate.phased = gt_phases[pos]
            if candidate.phased or args.ignore_phasing:
                candidate.gt = gt_bases[pos]
                # file the site under this sample and gene
                sample_hets[sample][site.row['gene']].append(candidate)

        prev_gene = curr_gene

    # the last gene has not been evaluated yet
    comp_het_counter = _evaluate(sample_hets, comp_het_counter)
def amend_sample(args):
    """Update rows of the ``samples`` table from the PED file ``args.sample``.

    For every loaded subject whose key also appears in the PED file, one
    ``update samples set ... where sample_id=...`` statement is executed
    inside a single database transaction.  The SET list pairs the PED
    header fields with the subject's values after ``quote_string``.
    """
    loaded_subjects = get_subjects(args)
    ped_dict = load_ped_file(args.sample)
    header = get_ped_fields(args.sample)

    sql_query = "update samples set {0} where sample_id={1}"
    with database_transaction(args.db) as c:
        for name, subject in loaded_subjects.items():
            if name not in ped_dict:
                continue
            quoted = [quote_string(field) for field in ped_dict[name]]
            assignments = ["=".join((str(column), str(value)))
                           for column, value in zip(header, quoted)]
            c.execute(sql_query.format(",".join(assignments),
                                       subject.sample_id))
def __init__(self, args):
    """Prepare per-value sets of subjects for the carrier summary column.

    ``args.carrier_summary`` names an attribute of the subject objects
    returned by ``get_subjects(args)``.  ``self.column_types`` holds the
    distinct non-None values of that attribute, and
    ``self.column_counters`` maps each of them to the set of subject keys
    carrying that value, with an additional empty set under ``None``.
    """
    subjects = get_subjects(args)
    column = args.carrier_summary
    self.carrier_summary = column

    # every value seen in the column; None means "unknown" and is dropped
    distinct = set([getattr(subject, column)
                    for subject in subjects.values()])
    self.column_types = [value for value in distinct if value is not None]

    counters = {None: set()}
    for value in self.column_types:
        members = set()
        for key, subject in subjects.items():
            if getattr(subject, column) == value:
                members.add(key)
        counters[value] = members
    self.column_counters = counters
def amend_sample(args):
    """Rewrite ``samples`` rows using the values found in a PED file.

    Subjects come from ``get_subjects(args)``; PED values and field names
    come from ``args.sample``.  Subjects that are absent from the PED file
    are left alone.  All updates run within one ``database_transaction``.
    """
    loaded_subjects = get_subjects(args)
    ped_dict = load_ped_file(args.sample)
    header = get_ped_fields(args.sample)

    with database_transaction(args.db) as c:
        # lazily pair each matching subject with its PED fields
        matches = ((subject, ped_dict[key])
                   for key, subject in loaded_subjects.items()
                   if key in ped_dict)
        for subject, ped_fields in matches:
            columns_and_values = zip(header, map(quote_string, ped_fields))
            set_str = ",".join(str(column) + "=" + str(value)
                               for (column, value) in columns_and_values)
            statement = "update samples set {0} where sample_id={1}".format(
                set_str, subject.sample_id)
            c.execute(statement)
def get_compound_hets(self):
    """
    Report candidate compound heterozygotes.

    Runs ``self.create_query()``, prints the header, and then handles the
    rows in runs of consecutive rows sharing the same ``gene`` value
    (via ``groupby``).  For each run, the heterozygous sites are gathered
    per sample and passed to ``self.find_valid_het_pairs`` and
    ``self.filter_candidates``.  Also sets ``self.subjects_dict``.
    """
    args = self.args
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    self.subjects_dict = subjects.get_subjects(args)

    # run the query applying any genotype filters provided by the user.
    gq.run(self.create_query())

    # genotype labels / columns of every family, keyed by family id
    family_gt_labels = {}
    family_gt_cols = {}
    for fam in subjects.get_families(args.db, args.families):
        family_gt_labels[fam.family_id] = fam.get_genotype_labels()
        family_gt_cols[fam.family_id] = fam.get_genotype_columns()

    # output header
    print(self.get_header(gq.header, is_comp_het=True))

    for _gene, gene_rows in groupby(gq, itemgetter("gene")):
        # sample -> gene -> heterozygous sites, rebuilt for each gene
        sample_hets = collections.defaultdict(
            lambda: collections.defaultdict(list))

        for row in gene_rows:
            gt_types = row["gt_types"]
            gt_bases = row["gts"]
            gt_phases = row["gt_phases"]
            site = Site(row)

            for idx, gt_type in enumerate(gt_types):
                if gt_type == HET:
                    sample = idx_to_sample[idx]
                    sample_site = copy(site)
                    sample_site.phased = gt_phases[idx]
                    # keep unphased sites only when phasing is ignored
                    if sample_site.phased or args.ignore_phasing:
                        sample_site.gt = gt_bases[idx]
                        sample_hets[sample][site.row["gene"]].append(
                            sample_site)

        # evaluate the sites gathered for this gene
        samples_w_hetpair = self.find_valid_het_pairs(sample_hets)
        self.filter_candidates(samples_w_hetpair, family_gt_labels,
                               family_gt_cols)
def get_fusions(args):
    """
    Identify candidate rearrangements resulting in fusion genes.

    Complex structural variants that have a mate id and a gene are read in
    ``sv_event_id`` order.  Consecutive rows sharing an event id form one
    event; an event is passed to ``report_fusion`` when it consists of
    exactly two rows that lie on the same ``sv_strand``.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    ##### COLBY, change "WHERE is_somatic is NULL" to "WHERE is_somatic = 1"
    query = """SELECT chrom, start, end,
                      is_somatic, somatic_score,
                      type, sub_type, gene,
                      sv_strand, sv_length,
                      sv_event_id, sv_mate_id,
                      sv_tool, sv_evidence_type
               FROM variants
               WHERE is_somatic is NULL
               AND sv_mate_id is not NULL
               AND type = 'sv'
               AND sub_type = 'complex'
               AND gene is not NULL
               ORDER BY sv_event_id
            """

    def _report_if_candidate(event_rows):
        # did both ends of the sv meet all the query criteria
        # and are both ends on the same strand?
        if len(event_rows) == 2 and \
           event_rows[0]['sv_strand'] == event_rows[1]['sv_strand']:
            report_fusion(event_rows)

    prev = None
    events = []
    gq.run(query)
    for row in gq:
        curr = row['sv_event_id']
        # the SV event id changed: close the previous event, then start a
        # new one with the current row so that its first end is not lost.
        if events and curr != prev:
            _report_if_candidate(events)
            events = []
        events.append(row)
        prev = curr

    # the last event is not followed by an id change; close it explicitly
    if events:
        _report_if_candidate(events)
def all_samples_predicate(args):
    """
    Build a variant predicate over all subjects.

    Per the original description, the predicate returns True if, for a
    variant, the only samples that have the variant have a given
    phenotype.  The predicate itself is produced by
    ``select_subjects_predicate`` from the values of
    ``get_subjects(args)``.
    """
    every_subject = get_subjects(args).values()
    predicate = select_subjects_predicate(every_subject, args)
    return predicate
def get_compound_hets(args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 gathers, per sample and gene, every heterozygous site returned
    by the query (restricted to exonic or non-LOW-impact variants, plus
    ``args.filter``).  Step 2 prints each pair of sites of the same
    sample and gene: always when ``args.ignore_phasing`` is set, otherwise
    only when the alternate alleles sit on different haplotypes.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    if args.columns is not None:
        select_list = _add_necessary_columns(args, str(args.columns))
    else:
        # report the kitchen sink
        select_list = "*" + \
            ", gts, gt_types, gt_phases, gt_depths, " \
            "gt_ref_depths, gt_alt_depths, gt_quals"
    query = "SELECT " + select_list + \
            " FROM variants " + \
            " WHERE (is_exonic = 1 or impact_severity != 'LOW') "

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " AND " + args.filter

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # step 1. sample -> gene -> heterozygous sites
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))
    for row in gq:
        gt_types = row['gt_types']
        gt_bases = row['gts']
        gt_phases = row['gt_phases']
        site = Site(row)

        for idx, gt_type in enumerate(gt_types):
            if gt_type != HET:
                continue
            sample = idx_to_sample[idx]
            if args.only_affected and not subjects_dict[sample].affected:
                continue
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]
            # require phased genotypes unless phasing is ignored
            if not (sample_site.phased or args.ignore_phasing):
                continue
            sample_site.gt = gt_bases[idx]
            comp_hets[sample][site.row['gene']].append(sample_site)

    # header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    # step 2. keep the het pairs whose alternate alleles were inherited
    # on opposite haplotypes.
    comp_het_id = 1
    for sample, hets_by_gene in comp_hets.items():
        for gene_sites in hets_by_gene.values():
            # combinations, not permutations: pair each site only with
            # the sites that come after it.
            for idx, site1 in enumerate(gene_sites):
                for site2 in gene_sites[idx + 1:]:
                    if args.ignore_phasing:
                        # split on phased (|) or unphased (/) genotypes
                        alleles_site1 = re.split(r'\||/', site1.gt)
                        alleles_site2 = re.split(r'\||/', site2.gt)
                    else:
                        # e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')

                        if "," in str(site1.row['alt']) or \
                           "," in str(site2.row['alt']):
                            sys.stderr.write("WARNING: Skipping candidate for sample"
                                             " %s b/c variants with mult. alt."
                                             " alleles are not yet supported. The sites are:"
                                             " %s and %s.\n" % (sample, site1, site2))
                            continue

                        # haplotype index carrying the alternate allele,
                        # e.g. ALT=G and ['A', 'G'] -> 1
                        alt_hap_1 = alleles_site1.index(site1.row['alt'])
                        alt_hap_2 = alleles_site2.index(site2.row['alt'])
                        if alt_hap_1 == alt_hap_2:
                            # same haplotype: not a compound het
                            continue

                    for reported in (site1, site2):
                        print("\t".join([str(subjects_dict[sample].family_id),
                                         sample,
                                         str(comp_het_id),
                                         str(reported.row)]))
                    comp_het_id += 1
def __init__(self, args):
    """Populate ``self.samples`` with sample objects in genotype order.

    The objects are taken from the ``idx_to_sample_object`` mapping of a
    ``GeminiQuery`` opened on ``args.db``, for indices 0 .. N-1 where N is
    the number of subjects reported by ``get_subjects(args)``.
    """
    gq = GeminiQuery(args.db)
    subject_count = len(get_subjects(args))
    # get samples in order of genotypes
    self.samples = list(map(lambda idx: gq.idx_to_sample_object[idx],
                            range(subject_count)))
def get_compound_hets(args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 gathers, per sample and gene, every heterozygous site returned
    by the query (restricted to exonic or non-LOW-impact variants, plus
    ``args.filter``).  Step 2 prints each unordered pair of sites of the
    same sample and gene exactly once: always when ``args.ignore_phasing``
    is set, otherwise only when the alternate alleles sit on different
    haplotypes.  In phased mode, pairs involving a site with multiple
    alternate alleles are skipped with a warning on stderr.
    """
    # local import so this block does not depend on the module's imports
    import sys

    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(gq.c)

    if args.columns is not None:
        custom_columns = _add_necessary_columns(args, str(args.columns))
        query = "SELECT " + custom_columns + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, " \
                "gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " AND " + args.filter

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # step 1. sample -> gene -> heterozygous sites
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))
    for row in gq:
        gt_types = row['gt_types']
        gt_bases = row['gts']
        gt_phases = row['gt_phases']
        site = Site(row)

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type != HET:
                continue
            sample = idx_to_sample[idx]
            if args.only_affected and not subjects_dict[sample].affected:
                continue
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]
            # require phased genotypes
            if not sample_site.phased and not args.ignore_phasing:
                continue
            sample_site.gt = gt_bases[idx]
            # add the site to the list of candidates
            # for this sample/gene
            comp_hets[sample][site.row['gene']].append(sample_site)

    # header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    # step 2. now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    comp_het_id = 1
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            sites = comp_hets[sample][gene]
            # only combinations are needed, not permutations: pairing each
            # site with the sites after it avoids reporting the same
            # comp_het a second time in the opposite order.
            for idx, site1 in enumerate(sites):
                for site2 in sites[idx + 1:]:
                    if site1 == site2:
                        continue

                    if not args.ignore_phasing:
                        # expand the genotypes for this sample at each
                        # site into its composite alleles.
                        # e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')

                        # a multi-allelic ALT (e.g. "A,G") never matches a
                        # single allele, so .index() below would raise.
                        if "," in str(site1.row['alt']) or \
                           "," in str(site2.row['alt']):
                            sys.stderr.write("WARNING: Skipping candidate for sample"
                                             " %s b/c variants with mult. alt."
                                             " alleles are not yet supported. The sites are:"
                                             " %s and %s.\n" % (sample, site1, site2))
                            continue

                        # the haplotype on which the alternate allele was
                        # observed for this sample at each candidate site.
                        # e.g., if ALT=G and alleles_site1=['A', 'G']
                        # then alt_hap_1 = 1. if ALT=A, then alt_hap_1 = 0
                        alt_hap_1 = alleles_site1.index(site1.row['alt'])
                        alt_hap_2 = alleles_site2.index(site2.row['alt'])

                        # it is only a true compound heterozygote IFF
                        # the alternates are on opposite haplotypes.
                        if alt_hap_1 == alt_hap_2:
                            continue

                    print("\t".join([str(subjects_dict[sample].family_id),
                                     sample,
                                     str(comp_het_id),
                                     str(site1.row)]))
                    print("\t".join([str(subjects_dict[sample].family_id),
                                     sample,
                                     str(comp_het_id),
                                     str(site2.row)]))
                    comp_het_id += 1
def get_compound_hets(args):
    """
    Report candidate compound heterozygous mutations.

    First collects the heterozygous sites of every sample, grouped by
    gene, from exonic or non-LOW-impact variants (further limited by
    ``args.filter``).  Then prints, for each sample and gene, every pair
    of collected sites: unconditionally when ``args.ignore_phasing`` is
    set, otherwise only if the alternate alleles are on different
    haplotypes.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(gq.c)

    if args.columns is None:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, " \
                "gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "
    else:
        custom_columns = _add_necessary_columns(args, str(args.columns))
        query = "SELECT " + custom_columns + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "

    # add any non-genotype column limits to the where clause
    if args.filter:
        query = query + " AND " + args.filter

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))
    for row in gq:
        gt_types, gt_bases, gt_phases = \
            row['gt_types'], row['gts'], row['gt_phases']
        site = Site(row)

        # remember every sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET:
                sample = idx_to_sample[idx]
                if args.only_affected and \
                   not subjects_dict[sample].affected:
                    continue
                het_site = copy(site)
                het_site.phased = gt_phases[idx]
                if het_site.phased or args.ignore_phasing:
                    het_site.gt = gt_bases[idx]
                    comp_hets[sample][site.row['gene']].append(het_site)

    # header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    def _emit(sample, pair_id, site):
        """Print one site of a reported pair as a tab-separated line."""
        print("\t".join([str(subjects_dict[sample].family_id),
                         sample,
                         str(pair_id),
                         str(site.row)]))

    # second pass: reduce the candidates of each gene/sample to the het
    # pairs whose alternate alleles lie on opposite haplotypes.
    comp_het_id = 1
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]
            # each site is paired only with later sites, so a pair is
            # visited once (site1,site2) and never again as (site2,site1)
            for position, site1 in enumerate(candidates):
                for site2 in candidates[position + 1:]:
                    use_phasing = not args.ignore_phasing
                    if use_phasing:
                        # e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')
                    else:
                        # split on phased (|) or unphased (/) genotypes
                        alleles_site1 = re.split(r'\||/', site1.gt)
                        alleles_site2 = re.split(r'\||/', site2.gt)

                    if use_phasing:
                        multi_alt = "," in str(site1.row['alt']) or \
                                    "," in str(site2.row['alt'])
                        if multi_alt:
                            sys.stderr.write("WARNING: Skipping candidate for sample"
                                             " %s b/c variants with mult. alt."
                                             " alleles are not yet supported. The sites are:"
                                             " %s and %s.\n" % (sample, site1, site2))
                            continue
                        # index of the haplotype holding the ALT allele:
                        # ALT=G with ['A', 'G'] gives 1, ALT=A gives 0
                        alt_hap_1 = alleles_site1.index(site1.row['alt'])
                        alt_hap_2 = alleles_site2.index(site2.row['alt'])

                    # report when phasing is ignored, or when the alt
                    # alleles are on different haplotypes.
                    if args.ignore_phasing or alt_hap_1 != alt_hap_2:
                        _emit(sample, comp_het_id, site1)
                        _emit(sample, comp_het_id, site2)
                        comp_het_id += 1