def _format_criterias(cls, standard_criterias):
    """Post-process standard criterias parsed from a TOML config.

    Replicates the extra formatting previously performed by the denovo
    gene set config parser: effect-type shorthands are expanded via
    ``expand_effect_types`` and sex names are converted to ``Sex``
    enum values.

    Returns a tuple ``(effect_type_criterias, sex_criterias)``, each a
    list of ``{"property", "name", "value"}`` dicts.
    """
    effect_type_criterias = [
        {
            "property": "effect_types",
            "name": segment_name,
            "value": expand_effect_types(segment),
        }
        for segment_name, segment
        in standard_criterias.effect_types.segments.items()
    ]
    sex_criterias = [
        {
            "property": "sexes",
            "name": segment_name,
            # Sex values are wrapped in a single-element list downstream.
            "value": [Sex.from_name(segment)],
        }
        for segment_name, segment
        in standard_criterias.sexes.segments.items()
    ]
    return (effect_type_criterias, sex_criterias)
def calc(self, effect_types, gene_syms, variants, children_by_sex):
    """Count enrichment events and update background statistics.

    Expands the requested effect types, counts matching events among
    *variants* via the configured event counter, then lets the
    background model compute statistics over the events.

    Returns the enrichment events produced by the event counter.
    """
    from dae.utils.effect_utils import expand_effect_types

    expanded_effect_types = expand_effect_types(effect_types)
    enrichment_events = self.event_counter.events(
        variants, children_by_sex, expanded_effect_types)
    # NOTE: the background receives the *unexpanded* effect types,
    # matching the original behavior.
    self.background.calc_stats(
        effect_types, enrichment_events, gene_syms, children_by_sex)
    return enrichment_events
def __init__(self, dataset, enrichment_tool, gene_syms):
    """Set up an enrichment computation over *dataset*.

    Reads the enrichment tool's config to determine the effect types
    and the person set collection to use, and prepares a
    ``GenotypeHelper`` bound to them. ``self.results`` starts as
    ``None`` until a computation is run.
    """
    self.dataset = dataset
    self.gene_syms = gene_syms
    self.tool = enrichment_tool
    self.results = None

    enrichment_config = self.tool.config
    assert enrichment_config is not None

    expanded_effects = expand_effect_types(enrichment_config.effect_types)
    # Only the first selected person set collection is used.
    collection_id = enrichment_config.selected_person_set_collections[0]
    self.person_set_collection = \
        self.dataset.get_person_set_collection(collection_id)
    self.gh = GenotypeHelper(
        self.dataset, self.person_set_collection,
        effect_types=expanded_effects)
def get_variant_count(
        gene_symbol, person_set, effect, variants, person_ids, dataset_id):
    """Count variants in *dataset_id* matching a gene, person set and effect.

    A variant matches when at least one of its alt-allele carriers is in
    the requested person set, at least one of its effect types is in the
    expanded *effect* set, and *gene_symbol* is among its effect gene
    symbols.
    """
    wanted_person_ids = set(person_ids[dataset_id][person_set])
    wanted_effects = set(expand_effect_types(effect))

    def matches(fv):
        # Collect all non-None members carrying any alt allele.
        carriers = {
            member
            for aa in fv.alt_alleles
            for member in aa.variant_in_members
            if member is not None
        }
        if not wanted_person_ids.intersection(carriers):
            return False
        if not wanted_effects.intersection(fv.effect_types):
            return False
        return gene_symbol in fv.effect_gene_symbols

    return sum(1 for fv in variants[dataset_id] if matches(fv))
def query_variants(self, regions=None, genes=None, effect_types=None,
                   family_ids=None, person_ids=None,
                   person_set_collection=None, inheritance=None,
                   roles=None, sexes=None, variant_type=None,
                   real_attr_filter=None, frequency_filter=None,
                   ultra_rare=None, return_reference=None,
                   return_unknown=None, limit=None, study_filters=None,
                   affected_status=None, **kwargs):
    """Yield family variants from the backend matching the given filters.

    Generator. Transforms the person set collection query into concrete
    person ids, expands effect-type shorthands, delegates the actual
    query to ``self._backend.query_variants`` and tags every yielded
    allele with this study's name and phenotype (only where those
    attributes are not already set).

    Yields nothing at all when *study_filters* is given and does not
    include this study, or when the person-set query resolves to an
    empty person id set.
    """
    if len(kwargs):
        # FIXME This will remain so it can be used for discovering
        # when excess kwargs are passed in order to fix such cases.
        logger.warning(
            "received excess keyword arguments when querying variants!")
        logger.warning("kwargs received: {}".format(list(kwargs.keys())))

    logger.info(f"study_filters: {study_filters}")
    # Bail out early if this study is excluded by the study filter.
    if study_filters is not None and self.study_id not in study_filters:
        return
    person_ids = self._transform_person_set_collection_query(
        person_set_collection, person_ids)
    # An explicitly empty person id set can match no variants.
    if person_ids is not None and len(person_ids) == 0:
        return

    if effect_types:
        effect_types = expand_effect_types(effect_types)

    # NOTE: study_filters and person_set_collection are consumed above
    # and deliberately not forwarded to the backend.
    for variant in self._backend.query_variants(
            regions=regions,
            genes=genes,
            effect_types=effect_types,
            family_ids=family_ids,
            person_ids=person_ids,
            inheritance=inheritance,
            roles=roles,
            sexes=sexes,
            variant_type=variant_type,
            real_attr_filter=real_attr_filter,
            ultra_rare=ultra_rare,
            frequency_filter=frequency_filter,
            return_reference=return_reference,
            return_unknown=return_unknown,
            limit=limit,
            affected_status=affected_status):
        for allele in variant.alleles:
            # Tag alleles with provenance, without clobbering values
            # set by an underlying (possibly composite) study.
            if allele.get_attribute("study_name") is None:
                allele.update_attributes({"study_name": self.name})
            if allele.get_attribute("study_phenotype") is None:
                allele.update_attributes(
                    {"study_phenotype": self.study_phenotype})
        yield variant
def main(gpf_instance=None, argv=None):
    """Generate autism gene profile (AGP) statistics and store them in a DB.

    Collects the gene symbols to process (from ``--genes``, the config's
    gene sets with ``--gene-sets-genes``, or the whole genome's gene
    models), generates per-gene AGP statistics, counts denovo and rare
    variants per configured dataset/person-set/statistic, calculates
    rates, and writes everything into an ``AutismGeneProfileDB``.

    :param gpf_instance: existing ``GPFInstance``; created when ``None``.
    :param argv: argument list for ``argparse``; ``None`` uses sys.argv.
    """
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")
    args = parser.parse_args(argv)

    # -v maps to WARNING, -vv to INFO, -vvv+ to DEBUG; default ERROR.
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    config = gpf_instance._autism_gene_profile_config

    # Resolve every (collection_id, set_id) pair from the config into an
    # actual gene set object.
    collections_gene_sets = []
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]
            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(
                     collection_id, gs_id)))
    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    # Determine which gene symbols to generate statistics for.
    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = \
            gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")

    # Pre-resolve person ids per dataset/person-set and detect which
    # statistic categories are configured at all.
    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(f"Generated {idx}/{gs_count} AGP statistics "
                        f"{elapsed:.2f} secs")
    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(
                    genes=genes, inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"
                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)
                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(
                            statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)
                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores
                # BUGFIX: this guard previously re-tested
                # ``statistic.variant_types`` (a copy-paste of the block
                # above) while reading ``statistic.roles``, so a
                # statistic with variant types but no roles would crash
                # and one with roles but no variant types was ignored.
                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)
                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and "
                                "not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[
                                ("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)
    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
def count_variant(v, dataset_id, agps, config, person_ids, denovo_flag):
    """Apply each configured statistic's criteria to variant *v* and,
    when all criteria pass, record a count via ``add_variant_count``.

    Runs once per (person set, statistic) pair of the dataset's filters.
    *denovo_flag* selects which statistic category applies: denovo
    statistics are skipped when it is False, rare ones when it is True.

    ``dump`` collects the running do_count value after each criterion,
    keyed by criterion index — kept only for the debug block below.
    """
    filters = config.datasets[dataset_id]
    # All non-None members carrying any alt allele of the variant.
    members = set()
    for aa in v.alt_alleles:
        for member in aa.variant_in_members:
            if member is not None:
                members.add(member)
    for ps in filters.person_sets:
        pids = set(person_ids[dataset_id][ps.set_name])
        for statistic in filters.statistics:
            dump = {}
            if statistic.category == "denovo" and not denovo_flag:
                continue
            if statistic.category == "rare" and denovo_flag:
                continue
            stat_id = statistic.id
            do_count = True
            # Criterion 1: at least one carrier is in this person set.
            in_members = len(pids.intersection(members)) > 0
            do_count = do_count and in_members
            dump[1] = do_count
            # Criterion 2: effect types overlap (only if configured).
            if statistic.get("effects"):
                ets = set(expand_effect_types(statistic.effects))
                in_effect_types = len(ets.intersection(v.effect_types)) > 0
                do_count = do_count and in_effect_types
                dump[2] = do_count
            # Criterion 3: every configured score within [min, max].
            if statistic.get("scores"):
                for score in statistic.scores:
                    score_name = score["name"]
                    score_min = score.get("min")
                    score_max = score.get("max")
                    # NOTE: only the first attribute value is checked.
                    score_value = v.get_attribute(score_name)[0]
                    if score_value is None:
                        do_count = False
                    # The ``and`` short-circuit below also protects the
                    # comparisons when score_value is None (do_count is
                    # already False then). A falsy min/max (e.g. 0) is
                    # treated as "no bound".
                    if score_min:
                        do_count = do_count and score_value >= score_min
                    if score_max:
                        do_count = do_count and score_value <= score_max
                dump[3] = do_count
            # Criterion 4: rare statistics additionally require allele
            # frequency <= 1%; a missing/falsy frequency passes.
            if statistic.get("category") == "rare":
                aa = v.alt_alleles[0]
                freq = aa.get_attribute("af_allele_freq")
                if freq:
                    do_count = do_count and freq <= 1.0
                dump[4] = do_count
            # Criterion 5: variant types overlap (only if configured).
            if statistic.get("variant_types"):
                variant_types = {
                    VariantType.from_name(t)
                    for t in statistic.variant_types
                }
                do_count = do_count and \
                    len(variant_types.intersection(v.variant_types))
                dump[5] = do_count
            # Criterion 6: roles of the first alt allele overlap.
            if statistic.get("roles"):
                roles = {Role.from_name(r) for r in statistic.roles}
                v_roles = set(v.alt_alleles[0].variant_in_roles)
                do_count = do_count and \
                    len(v_roles.intersection(roles))
                dump[6] = do_count
            # if v.position == 152171343:
            #     from pprint import pprint
            #     print(100*"+")
            #     print(ps.set_name, stat_id, do_count, v)
            #     # for aa in v.alt_alleles:
            #     #     print(aa.attributes)
            #     pprint(dump)
            #     print(100*"+")
            if do_count:
                add_variant_count(
                    v, agps, dataset_id, ps.set_name, stat_id)
def transform_kwargs(self, **kwargs):
    """Translate front-end (camelCase) query kwargs into backend filters.

    Mutates and returns *kwargs*: renames keys, parses regions, converts
    presentInChild/presentInParent into roles/inheritance/frequency
    filters, builds score and gene filters, resolves person/family
    filters into id sets, and finally applies ``FILTER_RENAMES_MAP``.
    Order of the transformations matters — several later steps read
    keys produced by earlier ones (e.g. ``real_attr_filter``).
    """
    logger.debug(f"kwargs in study wrapper: {kwargs}")
    # Baseline inheritance restriction applied to every query.
    self._add_inheritance_to_query(
        "not possible_denovo and not possible_omission", kwargs)
    kwargs = self._add_people_with_people_group(kwargs)

    if "querySummary" in kwargs:
        kwargs["query_summary"] = kwargs["querySummary"]
        del kwargs["querySummary"]
    if "uniqueFamilyVariants" in kwargs:
        kwargs["unique_family_variants"] = kwargs["uniqueFamilyVariants"]
        del kwargs["uniqueFamilyVariants"]
    if "regions" in kwargs:
        kwargs["regions"] = list(map(Region.from_str, kwargs["regions"]))

    # presentInChild/presentInParent collapse into roles, inheritance
    # and (optionally) a frequency filter.
    if "presentInChild" in kwargs or "presentInParent" in kwargs:
        if "presentInChild" in kwargs:
            present_in_child = set(kwargs["presentInChild"])
            kwargs.pop("presentInChild")
        else:
            present_in_child = set()
        if "presentInParent" in kwargs:
            present_in_parent = \
                set(kwargs["presentInParent"]["presentInParent"])
            rarity = kwargs["presentInParent"].get("rarity", None)
            kwargs.pop("presentInParent")
        else:
            present_in_parent = set()
            rarity = None
        roles_query = self._transform_present_in_child_and_parent_roles(
            present_in_child, present_in_parent)
        self._add_roles_to_query(roles_query, kwargs)
        inheritance = \
            self._transform_present_in_child_and_parent_inheritance(
                present_in_child, present_in_parent
            )
        self._add_inheritance_to_query(inheritance, kwargs)
        # Rarity is meaningless when the variant must be absent from
        # all parents ("neither").
        if present_in_parent != {"neither"} and rarity is not None:
            frequency_filter = kwargs.get("frequency_filter", [])
            arg, val = \
                self._transform_present_in_child_and_parent_frequency(
                    present_in_child, present_in_parent, rarity,
                    frequency_filter
                )
            if arg is not None:
                kwargs[arg] = val

    if ("minAltFrequencyPercent" in kwargs
            or "maxAltFrequencyPercent" in kwargs):
        min_value = kwargs.pop("minAltFrequencyPercent", None)
        max_value = kwargs.pop("maxAltFrequencyPercent", None)
        if "real_attr_filter" not in kwargs:
            kwargs["real_attr_filter"] = []
        value_range = self._transform_min_max_alt_frequency(
            min_value, max_value)
        if value_range is not None:
            kwargs["real_attr_filter"].append(value_range)
    if "genomicScores" in kwargs:
        genomic_scores = kwargs.pop("genomicScores", [])
        if "real_attr_filter" not in kwargs:
            kwargs["real_attr_filter"] = []
        kwargs["real_attr_filter"] += self._transform_genomic_scores(
            genomic_scores)
    if "geneWeights" in kwargs:
        gene_weights = kwargs.pop("geneWeights", {})
        genes = self._transform_gene_weights(gene_weights)
        if genes is not None:
            if "genes" not in kwargs:
                kwargs["genes"] = []
            kwargs["genes"] += genes

    if "gender" in kwargs:
        sexes = set(kwargs["gender"])
        # All sexes selected means no filtering at all.
        if sexes != set(["female", "male", "unspecified"]):
            sexes = [ContainsNode(sex_converter(sex)) for sex in sexes]
            kwargs["gender"] = OrNode(sexes)
        else:
            kwargs["gender"] = None
    if "variantTypes" in kwargs:
        variant_types = set(kwargs["variantTypes"])
        # All types selected means no filtering; drop the key entirely.
        if variant_types != {"ins", "del", "sub", "CNV"}:
            # Front-end "CNV" covers both CNV+ and CNV- backend types.
            if "CNV" in variant_types:
                variant_types.remove("CNV")
                variant_types.add("CNV+")
                variant_types.add("CNV-")
            variant_types = [
                ContainsNode(variant_type_converter(t))
                for t in variant_types
            ]
            kwargs["variantTypes"] = OrNode(variant_types)
        else:
            del kwargs["variantTypes"]
    if "effectTypes" in kwargs:
        kwargs["effectTypes"] = expand_effect_types(kwargs["effectTypes"])

    # Study filters: intersect the request with allowed studies if both
    # are present; otherwise whichever is available wins.
    if kwargs.get("studyFilters"):
        request = set([sf["studyId"] for sf in kwargs["studyFilters"]])
        if kwargs.get("allowed_studies"):
            request = request & set(kwargs.pop("allowed_studies"))
        kwargs["study_filters"] = request
        del kwargs["studyFilters"]
    elif kwargs.get("allowed_studies"):
        kwargs["study_filters"] = set(kwargs.pop("allowed_studies"))

    if "personFilters" in kwargs:
        person_filters = kwargs.pop("personFilters")
        if person_filters:
            matching_person_ids = self._transform_filters_to_ids(
                person_filters)
            if matching_person_ids is not None and kwargs.get("personIds"):
                kwargs["personIds"] = set.intersection(
                    matching_person_ids, set(kwargs.pop("personIds")))
            else:
                kwargs["personIds"] = matching_person_ids
    if "familyFilters" in kwargs:
        family_filters = kwargs.pop("familyFilters")
        if family_filters:
            matching_family_ids = self._transform_filters_to_ids(
                family_filters)
            if matching_family_ids is not None and kwargs.get("familyIds"):
                kwargs["familyIds"] = set.intersection(
                    matching_family_ids, set(kwargs.pop("familyIds")))
            else:
                kwargs["familyIds"] = matching_family_ids
    if "personIds" in kwargs:
        kwargs["personIds"] = list(kwargs["personIds"])
    if "familyTypes" in kwargs:
        # Union of family ids across all requested family types,
        # then intersected with any explicit familyIds filter.
        family_ids_with_types = set()
        for family_type in kwargs["familyTypes"]:
            family_type = FamilyType.from_name(family_type)
            family_ids_with_types = set.union(
                family_ids_with_types,
                self.study_wrapper.families.families_by_type.get(
                    family_type, set()))
        if "familyIds" in kwargs:
            family_ids_with_types = set.intersection(
                family_ids_with_types, set(kwargs.pop("familyIds")))
        kwargs["familyIds"] = family_ids_with_types
    if kwargs.get("inheritanceTypeFilter"):
        inheritance = kwargs.get("inheritance", [])
        inheritance.append("any({})".format(",".join(
            kwargs["inheritanceTypeFilter"])))
        kwargs["inheritance"] = inheritance
        kwargs.pop("inheritanceTypeFilter")
    if "affectedStatus" in kwargs:
        statuses = kwargs.pop("affectedStatus")
        kwargs["affected_status"] = [status.lower() for status in statuses]

    # Final pass: apply the declarative key-rename map.
    for key in list(kwargs.keys()):
        if key in self.FILTER_RENAMES_MAP:
            kwargs[self.FILTER_RENAMES_MAP[key]] = kwargs[key]
            kwargs.pop(key)
    return kwargs