def do_annotate(self, aline, variant, liftover_variants):
    """Fill the effect columns of ``aline`` for ``variant``.

    A ``None`` variant is delegated to ``self._not_found`` and nothing else
    is written.  Otherwise the variant is passed to
    ``self.effect_annotator.do_annotate_variant`` and the result of
    ``self.wrap_effects`` (indexed 0..5) is stored under the output columns
    named in ``self.columns``, together with two derived colon-joined
    columns.

    ``liftover_variants`` is not used by this method.
    """
    if variant is None:
        self._not_found(aline)
        return

    # Only CNVs carry an explicit length; every other type passes None.
    length = (
        variant.end_position - variant.position
        if VariantType.is_cnv(variant.variant_type)
        else None
    )

    wrapped = self.wrap_effects(
        self.effect_annotator.do_annotate_variant(
            chrom=variant.chromosome,
            position=variant.position,
            ref=variant.reference,
            alt=variant.alternative,
            variant_type=variant.variant_type,
            length=length,
        )
    )

    columns = self.columns
    aline[columns["effect_type"]] = wrapped[0]
    aline[columns["effect_gene_genes"]] = wrapped[1]
    aline[columns["effect_gene_types"]] = wrapped[2]
    # gene:effect pairs
    aline[columns["effect_genes"]] = [
        "{}:{}".format(*pair) for pair in zip(wrapped[1], wrapped[2])
    ]
    aline[columns["effect_details_transcript_ids"]] = wrapped[3]
    aline[columns["effect_details_genes"]] = wrapped[4]
    aline[columns["effect_details_details"]] = wrapped[5]
    # transcript:gene:details triples
    aline[columns["effect_details"]] = [
        "{}:{}:{}".format(*triple)
        for triple in zip(wrapped[3], wrapped[4], wrapped[5])
    ]
def test_cnv_best_state_X(cnv_raw):
    """Both CNV variants on chromosome X must have the expected best state."""
    x_variants = [
        variant
        for variant in cnv_raw.query_variants(
            effect_types=["CNV+", "CNV-"],
            variant_type="cnv+ or cnv-",
        )
        if variant.chrom == "X"
    ]
    assert len(x_variants) == 2

    for variant in x_variants:
        assert variant.alt_alleles
        for allele in variant.alt_alleles:
            assert VariantType.is_cnv(allele.variant_type)

    # Both variants are expected to share the same best-state matrix.
    expected = np.asarray([
        [2, 1, 0, 2],
        [0, 0, 1, 0],
    ])
    first, second = x_variants
    assert np.array_equal(first.best_state, expected)
    assert np.array_equal(second.best_state, expected)
def do_annotate(self, aline, variant, liftover_variants):
    """Annotate ``aline`` with aggregated NP position scores for ``variant``.

    ``self._scores_not_found(aline)`` is called, and nothing else is
    written, when:

    * ``variant`` is ``None``;
    * the variant is a CNV;
    * ``self.liftover`` is set but ``liftover_variants`` has no entry for it;
    * ``self._fetch_scores`` returns nothing;
    * the variant type is neither a substitution nor an indel.

    Otherwise the fetched scores are converted with
    ``self.score_file.scores_to_dataframe`` and ``aline`` is updated with the
    result of the matching ``_aggregate_*`` helper.
    """
    if variant is None:
        # Guard before touching ``variant.variant_type``: a missing variant
        # is reported as "no scores" instead of raising AttributeError.
        self._scores_not_found(aline)
        return

    if VariantType.is_cnv(variant.variant_type):
        logger.info(
            f"skip trying to add NP position score for CNV variant "
            f"{variant}")
        self._scores_not_found(aline)
        return

    if self.liftover:
        # Scores are looked up for the lifted-over variant, if there is one.
        variant = liftover_variants.get(self.liftover)
        if variant is None:
            self._scores_not_found(aline)
            return

    scores = self._fetch_scores(variant)
    if not scores:
        self._scores_not_found(aline)
        return

    scores_df = self.score_file.scores_to_dataframe(scores)

    if variant.variant_type & VariantType.substitution:
        aline.update(self._aggregate_substitution(variant, scores_df))
    elif variant.variant_type & VariantType.indel:
        aline.update(self._aggregate_indel(variant, scores_df))
    else:
        logger.warning(
            f"unexpected variant type: {variant}, {variant.variant_type}"
        )
        self._scores_not_found(aline)
def _do_annotate_cnv(self, variant):
    """Return the list of effects for a CNV ``variant``.

    The effect type is ``"CNV+"`` or ``"CNV-"`` depending on which CNV flag
    is set in ``variant.variant_type``.  One effect is created for every
    transcript model whose ``(start, stop)`` key in
    ``self.gene_models.utr_models`` intersects the region
    ``[position, position + length]`` on the variant's chromosome.  If no
    transcript is hit, a single effect without a transcript is returned.

    Raises:
        ValueError: if the variant type carries neither CNV flag.
    """
    assert VariantType.is_cnv(variant.variant_type)

    if variant.variant_type & VariantType.cnv_p:
        effect_type = "CNV+"
    elif variant.variant_type & VariantType.cnv_m:
        effect_type = "CNV-"
    else:
        raise ValueError(
            f"unexpected variant type: {variant.variant_type}")

    cnv_region = Region(
        variant.chromosome,
        variant.position,
        variant.position + variant.length)

    # A chromosome without gene models must not raise KeyError; it falls
    # through to the transcript-less effect below, mirroring the chromosome
    # guard used for non-CNV variants.
    chrom_models = self.gene_models.utr_models.get(variant.chromosome, {})

    effects = []
    for (start, stop), tms in chrom_models.items():
        if cnv_region.intersection(
                Region(variant.chromosome, start, stop)):
            effects.extend(
                EffectFactory.create_effect_with_tm(effect_type, tm)
                for tm in tms
            )

    if not effects:
        effects.append(EffectFactory.create_effect(effect_type))
    return effects
def from_cnv(variant):
    """Build a ``VariantDetails`` for a CNV ``variant``.

    The description carries the variant type, position and end position;
    the chromosome is taken from ``variant.chrom``.
    """
    cnv_type = variant._variant_type
    assert VariantType.is_cnv(cnv_type)

    description = VariantDesc(
        variant_type=cnv_type,
        position=variant.position,
        end_position=variant.end_position,
    )
    return VariantDetails(variant.chrom, description)
def annotate(self, variant):
    """Return the list of effects of ``variant`` on the known transcripts.

    CNVs are delegated to ``self._do_annotate_cnv``.  For other variants,
    every transcript whose ``(start, stop)`` key, widened by
    ``self.promoter_len`` on both sides, overlaps the variant's reference
    span is evaluated with ``self.get_effect_for_transcript``.  A single
    ``"intergenic"`` effect is returned when the chromosome has no models or
    no transcript yields an effect.
    """
    logger = logging.getLogger(__name__)

    if VariantType.is_cnv(variant.variant_type):
        return self._do_annotate_cnv(variant)

    utr_models = self.gene_models.utr_models
    if variant.chromosome not in utr_models:
        return [EffectFactory.create_effect("intergenic")]

    effects = []
    for (start, stop), transcripts in utr_models[variant.chromosome].items():
        # Skip transcript groups whose promoter-extended window lies
        # entirely before or after the variant.
        if variant.position > stop + self.promoter_len \
                or variant.ref_position_last < start - self.promoter_len:
            continue

        for tm in transcripts:
            logger.debug(
                "========: %s-%s :====================",
                tm.gene,
                tm.tr_id,
            )
            effect = self.get_effect_for_transcript(variant, tm)

            logger.debug("")
            logger.debug("Result: %s", effect)
            logger.debug("")

            if effect is not None:
                effects.append(effect)

    if not effects:
        effects.append(EffectFactory.create_effect("intergenic"))
    return effects
def __init__(self, chrom=None, position=None, loc=None, var=None,
             ref=None, alt=None, length=None, seq=None,
             variant_type=None):
    """Initialise the variant from a location plus either CNV or ref/alt data.

    The location is resolved by ``self.set_position(chrom, position, loc)``.

    * For a CNV ``variant_type`` the chromosome and position must be set,
      and ``length`` is required unless ``self.length`` has already been
      filled in.
    * For any other type the alleles are resolved by ``self.set_ref_alt`` and
      the last reference/alternate positions are derived from them.
    """
    self.variant_type = None
    self.length = None
    self.set_position(chrom, position, loc)

    if not VariantType.is_cnv(variant_type):
        self.set_ref_alt(var, ref, alt, length, seq, variant_type)

        self.ref_position_last = self.position + len(self.reference)
        self.alt_position_last = self.position + len(self.alternate)
        # Never lets the corrected end fall before the start position.
        self.corrected_ref_position_last = max(
            self.position, self.ref_position_last - 1)
        return

    # CNV: only location, length and type are recorded.
    assert self.chromosome is not None
    assert self.position is not None
    if self.length is None:
        assert length is not None
        self.length = length
    self.variant_type = variant_type
def __repr__(self) -> str:
    """Return a short textual form of the allele.

    CNVs are shown as ``chrom:start-end``, alleles without an alternative as
    ``chrom:pos REF(ref)``, and everything else as ``chrom:pos REF->ALT``.
    """
    if VariantType.is_cnv(self._variant_type):
        return f"{self.chromosome}:{self.position}-{self.end_position}"

    if not self.alternative:
        return f"{self.chrom}:{self.position} {self.reference}(ref)"

    return (f"{self.chrom}:{self.position}"
            f" {self.reference}->{self.alternative}")
def do_annotate(self, aline, variant, liftover_variants):
    """Annotate ``aline`` with DAE frequency columns for ``variant``.

    ``self._scores_not_found(aline)`` is called when the variant is
    ``None``, is a CNV, has no liftover counterpart while ``self.liftover``
    is set, or when the score file has no record at its CSHL position.

    Otherwise the record whose ``self.variant_col_name`` value equals the
    variant's CSHL description is selected (the first one, with a warning,
    if there are several).  Each configured column is copied into ``aline``
    as a float; blank values become ``self.score_file.no_score_value``.  If
    no record matches, ``aline`` is left untouched.

    Raises:
        ValueError: if a non-blank score value cannot be converted to float.
    """
    if variant is None:
        # Guard before touching ``variant.variant_type``: a missing variant
        # is reported as "no scores" instead of raising AttributeError.
        self._scores_not_found(aline)
        return

    if VariantType.is_cnv(variant.variant_type):
        logger.info(
            f"skip trying to add frequency for CNV variant {variant}")
        self._scores_not_found(aline)
        return

    if self.liftover:
        # A single lookup is enough; the original repeated it redundantly.
        variant = liftover_variants.get(self.liftover)
        if variant is None:
            self._scores_not_found(aline)
            return

    chrom = variant.chromosome
    pos = variant.details.cshl_position
    logger.debug(
        f"{self.score_filename_base}: looking for DAE frequency of "
        f"{variant}; {chrom}:{pos};")

    scores = self.score_file.fetch_scores(chrom, pos, pos)
    if not scores:
        self._scores_not_found(aline)
        return

    variant_detail = variant.details.cshl_variant
    variant_column = scores[self.variant_col_name]
    variant_occurrences = variant_column.count(variant_detail)
    if variant_occurrences == 0:
        return

    if variant_occurrences > 1:
        logger.warning(
            f"WARNING {self.score_filename_base}: "
            f"multiple variant occurrences of {chrom}:{pos} {variant}")

    variant_index = variant_column.index(variant_detail)
    blank_values = {"", " "}  # built once instead of once per column
    for native, output in self.config.columns.items():
        # FIXME: this conversion should come from schema
        val = scores[native][variant_index]
        try:
            if val in blank_values:
                aline[output] = self.score_file.no_score_value
            else:
                aline[output] = float(val)
            logger.debug(
                f"DAE frequency: aline[{output}]={aline[output]}")
        except ValueError as ex:
            logger.error(
                f"problem with: {output}: {chrom}:{pos} - {val}")
            logger.error(ex)
            raise
def __init__(self, chrom: str, variant_desc: VariantDesc):
    """Store the chromosome and description and derive the CSHL fields.

    ``cshl_location`` is ``chrom:start-end`` for CNVs and ``chrom:position``
    otherwise; ``cshl_variant`` is ``str(variant_desc)`` and
    ``cshl_variant_full`` comes from ``variant_desc.to_cshl_full()``.
    """
    self.chrom = chrom
    self.variant_desc = variant_desc
    self.cshl_position = self.variant_desc.position

    if not VariantType.is_cnv(self.variant_desc.variant_type):
        self.cshl_location = f"{self.chrom}:{self.cshl_position}"
    else:
        self.cshl_location = (
            f"{self.chrom}:"
            f"{self.variant_desc.position}-"
            f"{self.variant_desc.end_position}"
        )

    self.cshl_variant = str(variant_desc)
    self.cshl_variant_full = variant_desc.to_cshl_full()
def set_ref_alt(self, var, ref, alt, length, seq, typ):
    """Set ``self.reference`` / ``self.alternate`` from one input form.

    The alleles come either from an explicit ``ref``/``alt`` pair or from a
    variant description ``var`` (resolved by
    ``self.set_ref_alt_from_variant``).  The two forms are mutually
    exclusive, and neither accepts ``length``, ``seq`` or a CNV ``typ``;
    these constraints are checked with assertions.  Afterwards
    ``self.trim_equal_ref_alt_parts()`` is called and both alleles must be
    set.
    """
    if ref is not None:
        assert alt is not None
        for conflicting in (var, length, seq):
            assert conflicting is None
        assert not VariantType.is_cnv(typ)

        self.reference, self.alternate = ref, alt

    if var is not None:
        for conflicting in (ref, alt, length, seq):
            assert conflicting is None
        assert not VariantType.is_cnv(typ)

        self.set_ref_alt_from_variant(var)

    self.trim_equal_ref_alt_parts()

    assert self.reference is not None
    assert self.alternate is not None
def details(self) -> Optional[VariantDetails]:
    """Return the lazily built ``VariantDetails`` of this allele.

    The value is cached in ``self._details`` after the first successful
    build.  CNVs are built with ``VariantDetails.from_cnv``; other alleles
    with ``VariantDetails.from_vcf``.  An allele that is not a CNV and has
    no alternative yields ``None`` and caches nothing.
    """
    if self._details is not None:
        return self._details

    if VariantType.is_cnv(self._variant_type):
        self._details = VariantDetails.from_cnv(self)
    elif self.alternative is None:
        return None
    else:
        self._details = VariantDetails.from_vcf(
            self.chromosome,
            self.position,
            self.reference,
            self.alternative,
        )
    return self._details
def test_cnv_impala(cnv_impala):
    """The de novo CNV query must return 12 variants, all with CNV alleles."""
    variants = list(
        cnv_impala.query_variants(
            effect_types=["CNV+", "CNV-"],
            variant_type="cnv+ or cnv-",
            inheritance="denovo"
        )
    )
    print(variants)

    for variant in variants:
        assert variant.alt_alleles
        for allele in variant.alt_alleles:
            print(allele)
            assert VariantType.is_cnv(allele.variant_type)

    assert len(variants) == 12
def do_annotate(self, aline, variant, liftover_variants):
    """Annotate ``aline`` with position scores for ``variant``.

    ``self._scores_not_found(aline)`` is called when the variant is
    ``None``, is a CNV, has no liftover counterpart while ``self.liftover``
    is set, or when ``self._fetch_scores`` returns nothing.

    For every name in ``self.score_names`` the fetched values are converted
    with ``self._convert_score``.  A single value is stored as is.  Several
    values are combined into a ``COUNT``-weighted sum divided by the total
    of all counts; a zero or empty sum yields
    ``self.score_file.no_score_value``.
    """
    if variant is None:
        # Guard before touching ``variant.variant_type``: a missing variant
        # is reported as "no scores" instead of raising AttributeError.
        self._scores_not_found(aline)
        return

    if VariantType.is_cnv(variant.variant_type):
        logger.info(
            f"skip trying to add position score for CNV variant {variant}")
        self._scores_not_found(aline)
        return

    if self.liftover:
        variant = liftover_variants.get(self.liftover)
        if variant is None:
            self._scores_not_found(aline)
            return

    scores = self._fetch_scores(variant)

    logger.debug(
        f"{self.score_file.score_filename} looking for score of {variant}")
    if not scores:
        logger.debug(
            f"{self.score_file.score_filename} score not found"
        )
        self._scores_not_found(aline)
        return

    counts = scores["COUNT"]
    total_count = sum(counts)

    for score_name in self.score_names:
        column_name = getattr(self.config.columns, score_name)
        values = [self._convert_score(raw) for raw in scores[score_name]]
        assert len(values) > 0
        if len(values) == 1:
            aline[column_name] = values[0]
        else:
            # Drop missing values *after* pairing them with their counts.
            # Filtering the values first (as before) shifted the remaining
            # values onto the wrong counts.
            total_sum = sum(
                count * value
                for count, value in zip(counts, values)
                if value
            )
            # NOTE(review): the denominator still includes the counts of
            # rows without a value, as in the original - confirm intent.
            aline[column_name] = \
                (total_sum / total_count) if total_sum \
                else self.score_file.no_score_value

        logger.debug(
            f"aline[{column_name}]={aline[column_name]}")
def liftover_variant(self, variant):
    """Lift ``variant`` over to the target genome.

    Returns a new ``SummaryAllele`` built from the lifted coordinates, or
    ``None`` when the variant is a CNV, when the liftover yields nothing, or
    when any exception is raised along the way (the problem is logged as a
    warning).
    """
    assert isinstance(variant, SummaryAllele)
    if VariantType.is_cnv(variant.variant_type):
        return None

    try:
        # NOTE(review): this resolves to the module-level
        # ``liftover_variant`` function when this is a class method -
        # confirm against the file's imports.
        lifted = liftover_variant(
            variant.chrom, variant.position,
            variant.reference, variant.alternative,
            self.liftover, self.target_genome)
        if lifted is None:
            return None

        lifted_chrom, lifted_pos, lifted_ref, lifted_alt = lifted
        lifted_allele = SummaryAllele(
            lifted_chrom, lifted_pos, lifted_ref, lifted_alt)
        # Bare attribute access kept from the original; presumably it
        # forces the variant type to be computed inside the ``try`` so a
        # failure is caught here - TODO confirm.
        lifted_allele.variant_type
        return lifted_allele
    except Exception as ex:
        logger.warning(f"problem in variant {variant} liftover: {ex}")

    return None
def do_annotate(self, aline, variant, liftover_variants):
    """Annotate ``aline`` with VCF INFO scores for ``variant``.

    ``self._scores_not_found(aline)`` is called when the variant is
    ``None``, is a CNV, has no liftover counterpart while ``self.liftover``
    is set, or when the score file has no record at its position.

    Otherwise the first fetched record whose ``REF``/``ALT`` equal the
    variant's reference/alternative supplies the configured columns.  If no
    record matches, ``aline`` is left untouched.
    """
    if variant is None:
        # Guard before touching ``variant.variant_type``: a missing variant
        # is reported as "no scores" instead of raising AttributeError.
        self._scores_not_found(aline)
        return

    if VariantType.is_cnv(variant.variant_type):
        logger.info(
            f"skip trying to add VCF info score for CNV variant {variant}")
        self._scores_not_found(aline)
        return

    if self.liftover:
        variant = liftover_variants.get(self.liftover)
        if variant is None:
            self._scores_not_found(aline)
            return

    chrom = variant.chromosome
    pos = variant.position
    logger.debug(
        f"{self.score_file.score_filename}: looking for VCF frequency of "
        f"{variant}; {chrom}:{pos};")

    scores = self.score_file.fetch_scores(chrom, pos, pos)
    if not scores:
        self._scores_not_found(aline)
        return

    logger.debug(
        f"scores found: {scores}")

    assert len(scores["REF"]) == len(scores["ALT"])
    refs = scores["REF"]
    alts = scores["ALT"]

    for index, (ref, alt) in enumerate(zip(refs, alts)):
        if variant.reference == ref and variant.alternative == alt:
            for name, output in self.config.columns.items():
                aline[output] = scores[name][index]
                logger.debug(
                    f"VCF frequency: aline[{output}]={aline[output]}")
            # Only the first matching record is used.
            return
def read_variant_type(stream):
    """Read one value from ``stream`` with ``read_int8`` as a ``VariantType``."""
    raw_value = read_int8(stream)
    return VariantType(raw_value)
def _summary_variant_from_dae_record(self, summary_index, rec):
    """Build a summary variant (reference + one alternative) from ``rec``.

    ``rec`` is updated in place: the CSHL position and the ``all.*``
    statistics are converted to numbers, and the VCF-style ``position``,
    ``reference`` and ``alternative`` (from ``dae2vcf_variant``) plus
    ``summary_variant_index`` are added.  Two allele records are then
    assembled and passed to
    ``SummaryVariantFactory.summary_variant_from_records`` together with
    ``self.transmission_type``.
    """
    rec["cshl_position"] = int(rec["cshl_position"])

    vcf_position, vcf_reference, vcf_alternative = dae2vcf_variant(
        self._adjust_chrom_prefix(rec["chrom"]),
        rec["cshl_position"],
        rec["cshl_variant"],
        self.genome.get_genomic_sequence(),
    )
    rec["position"] = vcf_position
    rec["reference"] = vcf_reference
    rec["alternative"] = vcf_alternative

    # Normalise the textual statistics to numbers, in the original order.
    for key, cast in (
            ("all.nParCalled", int),
            ("all.nAltAlls", int),
            ("all.prcntParCalled", float),
            ("all.altFreq", float)):
        rec[key] = cast(rec[key])
    rec["summary_variant_index"] = summary_index

    parents_called = int(rec.get("all.nParCalled", 0))
    alt_allele_count = int(rec.get("all.nAltAlls", 0))
    parents_called_percent = float(rec.get("all.prcntParCalled", 0.0))
    alt_allele_freq = float(rec.get("all.altFreq", 0.0))

    # The reference count is two alleles per called parent minus the
    # alternative alleles.
    ref_allele_count = 2 * parents_called - alt_allele_count
    if parents_called > 0:
        ref_allele_prcnt = ref_allele_count / 2.0 / parents_called
    else:
        ref_allele_prcnt = 0.0

    # Fields shared by both allele records, split so key order is kept.
    location = {
        "chrom": rec["chrom"],
        "position": rec["position"],
        "reference": rec["reference"],
    }
    identity = {
        "cshl_position": rec["cshl_position"],
        "cshl_variant": rec["cshl_variant"],
        "summary_variant_index": rec["summary_variant_index"],
    }

    ref = {
        **location,
        "alternative": None,
        "variant_type": None,
        **identity,
        "allele_index": 0,
        "af_parents_called_count": parents_called,
        "af_parents_called_percent": parents_called_percent,
        "af_allele_count": ref_allele_count,
        "af_allele_freq": ref_allele_prcnt,
    }
    alt = {
        **location,
        "alternative": rec["alternative"],
        "variant_type": VariantType.from_cshl_variant(rec["cshl_variant"]),
        **identity,
        "allele_index": 1,
        "af_parents_called_count": parents_called,
        "af_parents_called_percent": parents_called_percent,
        "af_allele_count": alt_allele_count,
        "af_allele_freq": alt_allele_freq,
    }

    return SummaryVariantFactory.summary_variant_from_records(
        [ref, alt], transmission_type=self.transmission_type
    )
def count_variant(v, dataset_id, agps, config, person_ids, denovo_flag):
    """Add ``v`` to every statistic of ``dataset_id`` that it satisfies.

    For each person set and each statistic configured for the dataset, the
    variant is counted (via ``add_variant_count``) only if all of the
    following hold:

    * the statistic category matches ``denovo_flag`` ("denovo" statistics
      are skipped when the flag is false, "rare" ones when it is true);
    * at least one person of the person set carries the variant;
    * the configured effects, score ranges, variant types and roles, if
      any, match the variant;
    * for "rare" statistics, a truthy ``af_allele_freq`` of the first
      alternative allele is at most ``1.0``.

    Score bounds are applied whenever they are not ``None``, so an explicit
    bound of ``0`` is honoured.

    Args:
        v: family variant to count.
        dataset_id: key into ``config.datasets`` and ``person_ids``.
        agps: collection updated by ``add_variant_count``.
        config: configuration exposing ``datasets``.
        person_ids: ``{dataset_id: {person_set_name: person ids}}``.
        denovo_flag: whether ``v`` came from the de novo query.
    """
    filters = config.datasets[dataset_id]

    members = {
        member
        for aa in v.alt_alleles
        for member in aa.variant_in_members
        if member is not None
    }

    for ps in filters.person_sets:
        pids = set(person_ids[dataset_id][ps.set_name])
        # Does not depend on the statistic, so compute it once per set.
        in_members = not pids.isdisjoint(members)

        for statistic in filters.statistics:
            if statistic.category == "denovo" and not denovo_flag:
                continue
            if statistic.category == "rare" and denovo_flag:
                continue

            do_count = in_members

            if statistic.get("effects"):
                ets = set(expand_effect_types(statistic.effects))
                do_count = do_count and not ets.isdisjoint(v.effect_types)

            if statistic.get("scores"):
                for score in statistic.scores:
                    score_min = score.get("min")
                    score_max = score.get("max")
                    score_value = v.get_attribute(score["name"])[0]

                    if score_value is None:
                        do_count = False

                    # ``is not None`` rather than truthiness: a bound of 0
                    # is a real bound.  ``do_count and ...`` short-circuits,
                    # so a missing value is never compared.
                    if score_min is not None:
                        do_count = do_count and score_value >= score_min
                    if score_max is not None:
                        do_count = do_count and score_value <= score_max

            if statistic.get("category") == "rare":
                freq = v.alt_alleles[0].get_attribute("af_allele_freq")
                if freq:
                    do_count = do_count and freq <= 1.0

            if statistic.get("variant_types"):
                variant_types = {
                    VariantType.from_name(t)
                    for t in statistic.variant_types
                }
                do_count = do_count and \
                    not variant_types.isdisjoint(v.variant_types)

            if statistic.get("roles"):
                roles = {Role.from_name(r) for r in statistic.roles}
                v_roles = set(v.alt_alleles[0].variant_in_roles)
                do_count = do_count and not v_roles.isdisjoint(roles)

            if do_count:
                add_variant_count(
                    v, agps, dataset_id, ps.set_name, statistic.id)
def _rare_query_kwargs(statistic):
    """Translate one non-denovo statistic config into query keyword args."""
    kwargs = {"roles": "prb or sib"}

    if statistic.effects is not None:
        kwargs["effect_types"] = expand_effect_types(statistic.effects)

    if statistic.variant_types:
        # One query term per configured variant type name.
        kwargs["variant_type"] = " or ".join(
            VariantType.from_name(name).repr()
            for name in statistic.variant_types
        )

    if statistic.scores:
        kwargs["real_attr_filter"] = [
            (score.name, (score.min, score.max))
            for score in statistic.scores
        ]

    # The role filter is guarded by the roles themselves; it used to be
    # guarded by ``variant_types`` through a copy-paste slip.
    if statistic.roles:
        kwargs["roles"] = " or ".join(
            Role.from_name(name).repr() for name in statistic.roles
        )

    return kwargs


def main(gpf_instance=None, argv=None):
    """Generate autism gene profile (AGP) statistics and store them in a DB.

    Steps, as performed below:

    1. Parse the command line (``argv``) and configure logging verbosity.
    2. Collect the configured gene sets and the gene symbols to process:
       ``--genes``, else the genes of the configured gene sets
       (``--gene-sets-genes``), else all genes of the gene models.
    3. Resolve the person ids of every configured person set per dataset.
    4. Generate an AGP per gene symbol with ``generate_agp``.
    5. Query de novo and/or rare variants, depending on which statistic
       categories are configured, and count them into the AGPs.
    6. Calculate rates and write everything to the AGP database at
       ``--dbfile``.

    Args:
        gpf_instance: GPF instance to use; a new ``GPFInstance`` is created
            when ``None``.
        argv: argument list for ``argparse``; ``None`` lets argparse use its
            default argument source.
    """
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    # NOTE(review): --drop is parsed but not read anywhere in this function.
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    collections_gene_sets = []
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]
            collections_gene_sets.append((
                collection_id,
                gpf_instance.gene_sets_db.get_gene_set(collection_id, gs_id)
            ))

    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = \
            gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")

    has_denovo = False
    has_rare = False
    person_ids = {}
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = {}
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = {}
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(
                f"Generated {idx}/{gs_count} AGP statistics "
                f"{elapsed:.2f} secs")

    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    # Restrict variant queries to the selected genes only when a selection
    # was requested; otherwise query all genes.
    genes = gene_symbols if (args.gene_sets_genes or args.genes) else None

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = {}
        for dataset_id, _ in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id

            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(
                    genes=genes, inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = {}
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id

            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = _rare_query_kwargs(statistic)

                rare_variants[dataset_id].extend(
                    genotype_data.query_variants(
                        genes=genes,
                        inheritance=[
                            "not denovo and "
                            "not possible_denovo and not possible_omission",
                            "mendelian or missing"
                        ],
                        frequency_filter=[("af_allele_freq", (None, 1.0))],
                        **kwargs))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True
    )

    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
def variant_type_converter(a):
    """Return ``a`` as a ``VariantType``.

    ``VariantType`` instances are returned unchanged; anything else is
    converted with ``VariantType.from_name``.
    """
    return a if isinstance(a, VariantType) else VariantType.from_name(a)