    def _format_criterias(cls, standard_criterias):
        """
        Replicates functionality from the denovo gene set config parser.

        Given a TOML config's standard criterias, it applies the additional
        formatting that was previously done in the parser.
        """

        effect_type_criterias = []
        for name, criteria in standard_criterias.effect_types.segments.items():
            effect_type_criterias.append({
                "property": "effect_types",
                "name": name,
                "value": expand_effect_types(criteria),
            })
        sex_criterias = []
        for name, criteria in standard_criterias.sexes.segments.items():
            sex_criterias.append({
                "property": "sexes",
                "name": name,
                "value": [Sex.from_name(criteria)],
            })
        return (effect_type_criterias, sex_criterias)
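
For orientation, here is a minimal sketch of the input and output shapes this helper works with. The segment names are made up, and the literal comments stand in for the real expand_effect_types(...) and Sex.from_name(...) results:

# Illustration only -- made-up segment names; the real values come from
# the TOML config and from expand_effect_types(...) / Sex.from_name(...).
from types import SimpleNamespace

standard_criterias = SimpleNamespace(
    effect_types=SimpleNamespace(segments={"LGDs": "LGDs"}),
    sexes=SimpleNamespace(segments={"male": "male"}),
)
# Calling _format_criterias(standard_criterias) on the owning class would
# then return something shaped like:
# ([{"property": "effect_types", "name": "LGDs",
#    "value": [...expanded effect types...]}],
#  [{"property": "sexes", "name": "male", "value": [<Sex enum member>]}])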
Example #2
    def calc(self, effect_types, gene_syms, variants, children_by_sex):
        from dae.utils.effect_utils import expand_effect_types

        requested_effect_types = expand_effect_types(effect_types)
        enrichment_events = self.event_counter.events(
            variants, children_by_sex, requested_effect_types
        )
        self.background.calc_stats(
            effect_types, enrichment_events, gene_syms, children_by_sex
        )
        return enrichment_events
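
expand_effect_types appears in every example here: it expands effect group names into the concrete effect types they cover. A hypothetical stand-in sketch of that behaviour (the real mapping lives in dae.utils.effect_utils; the group contents below are only indicative):

# Hypothetical stand-in for expand_effect_types, for illustration only.
EFFECT_GROUPS = {
    "LGDs": ["frameshift", "nonsense", "splice-site"],
    "nonsynonymous": ["missense", "nonsense", "frameshift", "splice-site"],
}

def expand_effect_types_sketch(effect_types):
    expanded = []
    for effect in effect_types:
        # Group names expand to their member effects; plain effect
        # types pass through unchanged.
        expanded.extend(EFFECT_GROUPS.get(effect, [effect]))
    return expanded

print(expand_effect_types_sketch(["LGDs", "synonymous"]))
# ['frameshift', 'nonsense', 'splice-site', 'synonymous']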
Example #3
    def __init__(self, dataset, enrichment_tool, gene_syms):
        self.dataset = dataset
        self.gene_syms = gene_syms
        self.tool = enrichment_tool
        self.results = None
        enrichment_config = self.tool.config
        assert enrichment_config is not None
        effect_types = expand_effect_types(enrichment_config.effect_types)

        self.person_set_collection = self.dataset.get_person_set_collection(
            enrichment_config.selected_person_set_collections[0]
        )

        self.gh = GenotypeHelper(
            self.dataset, self.person_set_collection,
            effect_types=effect_types)
Example #4
def get_variant_count(gene_symbol, person_set, effect, variants, person_ids,
                      dataset_id):
    pids = set(person_ids[dataset_id][person_set])
    ets = set(expand_effect_types(effect))

    def filter_variant(fv):
        members = set()
        for aa in fv.alt_alleles:
            for member in aa.variant_in_members:
                if member is not None:
                    members.add(member)
        in_members = len(pids.intersection(members)) > 0
        in_effect_types = len(ets.intersection(fv.effect_types)) > 0
        in_gene_syms = gene_symbol in fv.effect_gene_symbols

        return in_members and in_effect_types and in_gene_syms

    return len(list(filter(filter_variant, variants[dataset_id])))
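
A self-contained sketch of the same counting logic, with a plain namedtuple standing in for DAE's family-variant objects (all names here are hypothetical):

# Sketch only -- FakeVariant is a stand-in, not DAE's variant class.
from collections import namedtuple

FakeVariant = namedtuple(
    "FakeVariant", ["members", "effect_types", "effect_gene_symbols"])

def count_matching(variants, person_ids, effect_types, gene_symbol):
    pids, ets = set(person_ids), set(effect_types)
    return sum(
        1 for v in variants
        if pids & set(v.members)
        and ets & set(v.effect_types)
        and gene_symbol in v.effect_gene_symbols)

variants = [
    FakeVariant({"p1"}, {"missense"}, {"CHD8"}),
    FakeVariant({"p2"}, {"synonymous"}, {"CHD8"}),
]
print(count_matching(variants, {"p1"}, ["missense"], "CHD8"))  # 1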
Example #5
    def query_variants(self,
                       regions=None,
                       genes=None,
                       effect_types=None,
                       family_ids=None,
                       person_ids=None,
                       person_set_collection=None,
                       inheritance=None,
                       roles=None,
                       sexes=None,
                       variant_type=None,
                       real_attr_filter=None,
                       frequency_filter=None,
                       ultra_rare=None,
                       return_reference=None,
                       return_unknown=None,
                       limit=None,
                       study_filters=None,
                       affected_status=None,
                       **kwargs):

        if len(kwargs):
            # FIXME: Kept so that callers passing excess kwargs can be
            # discovered and fixed.
            logger.warning(
                "received excess keyword arguments when querying variants!")
            logger.warning("kwargs received: {}".format(list(kwargs.keys())))

        logger.info(f"study_filters: {study_filters}")

        if study_filters is not None and self.study_id not in study_filters:
            return

        person_ids = self._transform_person_set_collection_query(
            person_set_collection, person_ids)

        if person_ids is not None and len(person_ids) == 0:
            return

        if effect_types:
            effect_types = expand_effect_types(effect_types)

        for variant in self._backend.query_variants(
                regions=regions,
                genes=genes,
                effect_types=effect_types,
                family_ids=family_ids,
                person_ids=person_ids,
                inheritance=inheritance,
                roles=roles,
                sexes=sexes,
                variant_type=variant_type,
                real_attr_filter=real_attr_filter,
                ultra_rare=ultra_rare,
                frequency_filter=frequency_filter,
                return_reference=return_reference,
                return_unknown=return_unknown,
                limit=limit,
                affected_status=affected_status):

            for allele in variant.alleles:
                if allele.get_attribute("study_name") is None:
                    allele.update_attributes({"study_name": self.name})
                if allele.get_attribute("study_phenotype") is None:
                    allele.update_attributes(
                        {"study_phenotype": self.study_phenotype})
            yield variant
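
The method above mostly delegates to the backend and stamps per-study defaults onto each allele before yielding. A minimal self-contained sketch of that pattern, using hypothetical stand-in classes rather than DAE's API:

# Sketch of the delegate-and-annotate generator pattern (stand-ins only).
class FakeAllele:
    def __init__(self):
        self.attributes = {}

    def get_attribute(self, key):
        return self.attributes.get(key)

    def update_attributes(self, attrs):
        self.attributes.update(attrs)

class FakeVariant:
    def __init__(self):
        self.alleles = [FakeAllele()]

def query_variants_sketch(backend_variants, study_name):
    for variant in backend_variants:
        for allele in variant.alleles:
            # Only fill in defaults the backend did not already set.
            if allele.get_attribute("study_name") is None:
                allele.update_attributes({"study_name": study_name})
        yield variant

for v in query_variants_sketch([FakeVariant()], "my_study"):
    print(v.alleles[0].get_attribute("study_name"))  # my_study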
Example #6
def main(gpf_instance=None, argv=None):
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    # gpf_instance.gene_sets_db.get_all_gene_sets("main")

    collections_gene_sets = []

    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]

            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(collection_id, gs_id)))

    # collections_gene_sets = []
    # for name in config.gene_sets:
    #     gene_set = gpf_instance.gene_sets_db.get_gene_set("main", name)
    #     collections_gene_sets.append(gene_set)
    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    # gene_sets = list(
    #     filter(lambda gs: gs["name"] in config.gene_sets, gene_sets)
    # )
    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")
    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(f"Generated {idx}/{gs_count} AGP statistics "
                        f"{elapsed:.2f} secs")

    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(genes=genes,
                                             inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)

    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
Example #7
def count_variant(v, dataset_id, agps, config, person_ids, denovo_flag):
    filters = config.datasets[dataset_id]
    members = set()

    for aa in v.alt_alleles:
        for member in aa.variant_in_members:
            if member is not None:
                members.add(member)

    for ps in filters.person_sets:
        pids = set(person_ids[dataset_id][ps.set_name])
        for statistic in filters.statistics:
            dump = {}
            if statistic.category == "denovo" and not denovo_flag:
                continue
            if statistic.category == "rare" and denovo_flag:
                continue

            stat_id = statistic.id
            do_count = True

            in_members = len(pids.intersection(members)) > 0

            do_count = do_count and in_members
            dump[1] = do_count

            if statistic.get("effects"):
                ets = set(expand_effect_types(statistic.effects))
                in_effect_types = len(ets.intersection(v.effect_types)) > 0
                do_count = do_count and in_effect_types
                dump[2] = do_count

            if statistic.get("scores"):
                for score in statistic.scores:
                    score_name = score["name"]
                    score_min = score.get("min")
                    score_max = score.get("max")
                    score_value = v.get_attribute(score_name)[0]

                    if score_value is None:
                        do_count = False

                    if score_min:
                        do_count = do_count and score_value >= score_min
                    if score_max:
                        do_count = do_count and score_value <= score_max

                dump[3] = do_count

            if statistic.get("category") == "rare":
                aa = v.alt_alleles[0]
                freq = aa.get_attribute("af_allele_freq")

                if freq:
                    do_count = do_count and freq <= 1.0
                dump[4] = do_count

            if statistic.get("variant_types"):
                variant_types = {
                    VariantType.from_name(t)
                    for t in statistic.variant_types
                }
                do_count = do_count and \
                    len(variant_types.intersection(v.variant_types)) > 0
                dump[5] = do_count

            if statistic.get("roles"):
                roles = {Role.from_name(r) for r in statistic.roles}
                v_roles = set(v.alt_alleles[0].variant_in_roles)
                do_count = do_count and \
                    len(v_roles.intersection(roles)) > 0
                dump[6] = do_count

            # if v.position == 152171343:
            #     from pprint import pprint
            #     print(100*"+")
            #     print(ps.set_name, stat_id, do_count, v)
            #     # for aa in v.alt_alleles:
            #     #     print(aa.attributes)
            #     pprint(dump)
            #     print(100*"+")
            if do_count:
                add_variant_count(v, agps, dataset_id, ps.set_name, stat_id)
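
The score check in count_variant applies optional min/max bounds per score. A self-contained sketch of that predicate (note that, as in the original, a falsy bound such as None or 0 is treated as "no bound"):

# Sketch of the optional min/max score filter (illustration only).
def passes_score_filter(score_value, score_min=None, score_max=None):
    if score_value is None:
        return False
    if score_min and score_value < score_min:
        return False
    if score_max and score_value > score_max:
        return False
    return True

print(passes_score_filter(3.5, score_min=1.0, score_max=10.0))  # True
print(passes_score_filter(None, score_min=1.0))                 # False
print(passes_score_filter(0.5, score_max=1.0))                  # True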
Example #8
    def transform_kwargs(self, **kwargs):
        logger.debug(f"kwargs in study wrapper: {kwargs}")
        self._add_inheritance_to_query(
            "not possible_denovo and not possible_omission", kwargs)

        kwargs = self._add_people_with_people_group(kwargs)

        if "querySummary" in kwargs:
            kwargs["query_summary"] = kwargs["querySummary"]
            del kwargs["querySummary"]

        if "uniqueFamilyVariants" in kwargs:
            kwargs["unique_family_variants"] = kwargs["uniqueFamilyVariants"]
            del kwargs["uniqueFamilyVariants"]

        if "regions" in kwargs:
            kwargs["regions"] = list(map(Region.from_str, kwargs["regions"]))

        if "presentInChild" in kwargs or "presentInParent" in kwargs:
            if "presentInChild" in kwargs:
                present_in_child = set(kwargs["presentInChild"])
                kwargs.pop("presentInChild")
            else:
                present_in_child = set()

            if "presentInParent" in kwargs:
                present_in_parent = \
                    set(kwargs["presentInParent"]["presentInParent"])
                rarity = kwargs["presentInParent"].get("rarity", None)
                kwargs.pop("presentInParent")
            else:
                present_in_parent = set()
                rarity = None

            roles_query = self._transform_present_in_child_and_parent_roles(
                present_in_child, present_in_parent)
            self._add_roles_to_query(roles_query, kwargs)

            inheritance = \
                self._transform_present_in_child_and_parent_inheritance(
                    present_in_child, present_in_parent
                )
            self._add_inheritance_to_query(inheritance, kwargs)

            if present_in_parent != {"neither"} and rarity is not None:
                frequency_filter = kwargs.get("frequency_filter", [])
                arg, val = \
                    self._transform_present_in_child_and_parent_frequency(
                        present_in_child, present_in_parent,
                        rarity, frequency_filter
                    )
                if arg is not None:
                    kwargs[arg] = val

        if ("minAltFrequencyPercent" in kwargs
                or "maxAltFrequencyPercent" in kwargs):
            min_value = kwargs.pop("minAltFrequencyPercent", None)
            max_value = kwargs.pop("maxAltFrequencyPercent", None)
            if "real_attr_filter" not in kwargs:
                kwargs["real_attr_filter"] = []
            value_range = self._transform_min_max_alt_frequency(
                min_value, max_value)
            if value_range is not None:
                kwargs["real_attr_filter"].append(value_range)

        if "genomicScores" in kwargs:
            genomic_scores = kwargs.pop("genomicScores", [])
            if "real_attr_filter" not in kwargs:
                kwargs["real_attr_filter"] = []
            kwargs["real_attr_filter"] += self._transform_genomic_scores(
                genomic_scores)

        if "geneWeights" in kwargs:
            gene_weights = kwargs.pop("geneWeights", {})
            genes = self._transform_gene_weights(gene_weights)
            if genes is not None:
                if "genes" not in kwargs:
                    kwargs["genes"] = []
                kwargs["genes"] += genes

        if "gender" in kwargs:
            sexes = set(kwargs["gender"])
            if sexes != {"female", "male", "unspecified"}:
                sexes = [ContainsNode(sex_converter(sex)) for sex in sexes]
                kwargs["gender"] = OrNode(sexes)
            else:
                kwargs["gender"] = None

        if "variantTypes" in kwargs:
            variant_types = set(kwargs["variantTypes"])

            if variant_types != {"ins", "del", "sub", "CNV"}:
                if "CNV" in variant_types:
                    variant_types.remove("CNV")
                    variant_types.add("CNV+")
                    variant_types.add("CNV-")

                variant_types = [
                    ContainsNode(variant_type_converter(t))
                    for t in variant_types
                ]
                kwargs["variantTypes"] = OrNode(variant_types)
            else:
                del kwargs["variantTypes"]

        if "effectTypes" in kwargs:
            kwargs["effectTypes"] = expand_effect_types(kwargs["effectTypes"])

        if kwargs.get("studyFilters"):
            request = {sf["studyId"] for sf in kwargs["studyFilters"]}
            if kwargs.get("allowed_studies"):
                request = request & set(kwargs.pop("allowed_studies"))
            kwargs["study_filters"] = request

            del kwargs["studyFilters"]
        elif kwargs.get("allowed_studies"):
            kwargs["study_filters"] = set(kwargs.pop("allowed_studies"))

        if "personFilters" in kwargs:
            person_filters = kwargs.pop("personFilters")
            if person_filters:
                matching_person_ids = self._transform_filters_to_ids(
                    person_filters)
                if matching_person_ids is not None and kwargs.get("personIds"):
                    kwargs["personIds"] = set.intersection(
                        matching_person_ids, set(kwargs.pop("personIds")))
                else:
                    kwargs["personIds"] = matching_person_ids

        if "familyFilters" in kwargs:
            family_filters = kwargs.pop("familyFilters")
            if family_filters:
                matching_family_ids = self._transform_filters_to_ids(
                    family_filters)
                if matching_family_ids is not None and kwargs.get("familyIds"):
                    kwargs["familyIds"] = set.intersection(
                        matching_family_ids, set(kwargs.pop("familyIds")))
                else:
                    kwargs["familyIds"] = matching_family_ids

        if "personIds" in kwargs:
            kwargs["personIds"] = list(kwargs["personIds"])

        if "familyTypes" in kwargs:
            family_ids_with_types = set()
            for family_type in kwargs["familyTypes"]:
                family_type = FamilyType.from_name(family_type)
                family_ids_with_types = set.union(
                    family_ids_with_types,
                    self.study_wrapper.families.families_by_type.get(
                        family_type, set()))
            if "familyIds" in kwargs:
                family_ids_with_types = set.intersection(
                    family_ids_with_types, set(kwargs.pop("familyIds")))
            kwargs["familyIds"] = family_ids_with_types

        if kwargs.get("inheritanceTypeFilter"):
            inheritance = kwargs.get("inheritance", [])
            inheritance.append("any({})".format(",".join(
                kwargs["inheritanceTypeFilter"])))
            kwargs["inheritance"] = inheritance

            kwargs.pop("inheritanceTypeFilter")
        if "affectedStatus" in kwargs:
            statuses = kwargs.pop("affectedStatus")
            kwargs["affected_status"] = [status.lower() for status in statuses]

        for key in list(kwargs.keys()):
            if key in self.FILTER_RENAMES_MAP:
                kwargs[self.FILTER_RENAMES_MAP[key]] = kwargs[key]
                kwargs.pop(key)

        return kwargs
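
The final loop in transform_kwargs renames any remaining camelCase keys to the backend's snake_case names via FILTER_RENAMES_MAP. A sketch of that rename pass with a hypothetical map (the real FILTER_RENAMES_MAP is defined on the wrapper class):

# Sketch of the key-rename pass; the map below is hypothetical.
FILTER_RENAMES_MAP = {
    "personIds": "person_ids",
    "familyIds": "family_ids",
}

def rename_filters(kwargs, renames=FILTER_RENAMES_MAP):
    for key in list(kwargs.keys()):
        if key in renames:
            kwargs[renames[key]] = kwargs.pop(key)
    return kwargs

print(rename_filters({"personIds": ["p1"], "limit": 10}))
# {'limit': 10, 'person_ids': ['p1']}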