Example #1
def main(gpf_instance=None, argv=None):
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)
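    # Map repeated -v flags to log levels: default ERROR, -v WARNING,
    # -vv INFO, -vvv (or more) DEBUG.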
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

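    # The AGP configuration drives everything below: which gene sets,
    # datasets and statistics to collect.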
    config = gpf_instance._autism_gene_profile_config

    collections_gene_sets = []

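    # Resolve every configured (collection_id, set_id) pair into the
    # actual gene set object from the gene sets DB.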
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]

            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(collection_id, gs_id)))

    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

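    # Pick the gene symbols to process: an explicit --genes list, the union
    # of the configured gene sets, or every gene in the reference gene models.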
    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")
    has_denovo = False
    has_rare = False
    person_ids = dict()
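    # Resolve each dataset's person set queries to concrete person ids and
    # record which statistic categories (denovo/rare) will be needed later.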
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
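    # Build one AGP record per gene symbol, logging progress every 25 genes.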
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(f"Generated {idx}/{gs_count} AGP statistics "
                        f"{elapsed:.2f} secs")

    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

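    # De novo pass: a single inheritance="denovo" query per dataset,
    # restricted to the selected genes when --genes/--gene-sets-genes is used.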
    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(genes=genes,
                                             inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

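    # Rare pass: one query per configured non-denovo statistic, translating
    # its effects, variant types, scores and roles into query_variants kwargs;
    # transmitted variants only, with af_allele_freq capped at 1.0.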
    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

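    # Persist the results: recreate the DB file, insert the AGP records and
    # build the output view and cache table.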
    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)

    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
Example #2
def main(argv=None, gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    if argv is None:
        argv = sys.argv[1:]
    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

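    # Default to all non-group genotype studies unless --studies was given.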
    if argv.studies is None:
        study_ids = [
            gd.study_id for gd in gpf_instance.get_all_genotype_data()
            if not gd.is_group
        ]
    else:
        study_ids = [sid.strip() for sid in argv.studies.split(",")]

    logger.info(f"building summary variants tables for studies: {study_ids}")

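    # Rebuild the summary variants table for every selected study, skipping
    # anything that is not backed by Impala or has no variants table.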
    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        assert study.study_id == study_id

        study_backend = study._backend
        if not isinstance(study_backend, ImpalaVariants):
            logger.warning(f"not an impala study: {study_id}; skipping...")
            continue

        if study_backend.variants_table is None:
            logger.warning(f"study {study_id} has no variants; skipping...")
            continue

        drop_summary_table(study_id, study_backend)
        partitions = create_summary_table(study_id, study_backend)

        summary_schema = collect_summary_schema(study_backend)
        summary_table = summary_table_name_temp(study_id, study_backend)
        pedigree_table = f"{study_backend.db}.{study_backend.pedigree_table}"
        variants_table = f"{study_backend.db}.{study_backend.variants_table}"

        partition_bins = {}

        logger.info(f"collecting partitions {partitions} from "
                    f"variants table {variants_table}")

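        # Enumerate the distinct bin values present for each partition column.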
        for partition in partitions:
            partition_bins[partition] = variants_parition_bins(
                study_backend, partition)

        logger.info(f"variant table partitions: {partition_bins}")

        impala = study_backend._impala_helpers
        started = time.time()

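        # Region bins are computed from the table properties and the reference
        # genome; they are used to sanity-check the table's region_bin
        # partitions and to generate the insert queries below.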
        region_bin_helpers = RegionBinsHelper(study_backend.table_properties,
                                              gpf_instance.get_genome())
        region_bin_helpers._build_region_bins()

        logger.info(
            f"region bins calculated: {region_bin_helpers.region_bins}")

        assert set(partition_bins["region_bin"]).issubset(
            set(region_bin_helpers.region_bins.keys()))

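        # The cartesian product of all partition bins yields one insert job
        # per concrete partition of the variants table.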
        all_partitions = list(itertools.product(*partition_bins.values()))
        for index, partition in enumerate(all_partitions, 1):
            partition = dict(zip(partition_bins.keys(), partition))
            logger.info(f"building summary table for partition: "
                        f"{index}/{len(all_partitions)}; "
                        f"{partition} of {study_id}")

            part_started = time.time()
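            # A partition may be split into several insert queries; retry each
            # query up to 10 times to ride out transient Impala errors.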
            for q in insert_into_summary_table(pedigree_table, variants_table,
                                               summary_table, summary_schema,
                                               partition,
                                               region_bin_helpers.region_bins,
                                               argv.split_size):
                repeat = 10
                while repeat > 0:
                    try:
                        with closing(impala.connection()) as connection:
                            with connection.cursor() as cursor:
                                logger.debug(
                                    f"going to run summary query: {q}")
                                cursor.execute(q)
                                break
                    except Exception:
                        logger.exception(f"error executing {q}")
                        time.sleep(6)
                        repeat -= 1
                        if repeat == 0:
                            raise

            part_elapsed = time.time() - part_started

            logger.info(f"processing partition "
                        f"{index}/{len(all_partitions)} of {study_id} "
                        f"took {part_elapsed:.2f} secs; "
                        f"{partition} ")
            elapsed = time.time() - started
            logger.info(f"processing partition "
                        f"{index}/{len(all_partitions)} of {study_id}; "
                        f"total time {elapsed:.2f} secs")

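        # Finally, rename the temporary summary table to its final name.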
        rename_summary_table(study_id, study_backend)