import argparse
import itertools
import logging
import os
import sys
import time
from contextlib import closing

# The GPF/DAE-specific names used below (GPFInstance, AutismGeneProfileDB,
# ImpalaVariants, RegionBinsHelper, Role, VariantType, expand_effect_types,
# and the AGP/summary-table helper functions) are assumed to be imported
# from the GPF (DAE) packages and this tool's own modules; the exact import
# paths are omitted here.
logger = logging.getLogger(__name__)


# Tool: generate AGP (autism gene profile) statistics and store them
# in an AGP database file.
def main(gpf_instance=None, argv=None):
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--verbose", "-V", "-v", action="count", default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's "
        "gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    # note: --drop is parsed but not used below
    parser.add_argument("--drop", action="store_true")
    args = parser.parse_args(argv)

    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    # Collect the gene sets listed in the AGP configuration.
    collections_gene_sets = []
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]
            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(
                     collection_id, gs_id)))
    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    # Determine which gene symbols to generate statistics for: an explicit
    # list, the genes of the configured gene sets, or every gene in the
    # reference gene models.
    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = \
            gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")

    # Resolve the person set queries of each configured dataset and note
    # which statistic categories (denovo, rare) have to be computed.
    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(
                f"Generated {idx}/{gs_count} AGP statistics "
                f"{elapsed:.2f} secs")
    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(
                    genes=genes, inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue

                # Translate the configured statistic into query_variants
                # keyword arguments.
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(
                            statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and "
                                "not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[
                                ("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    # Store the generated statistics in the AGP database.
    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)
    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
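
# A minimal usage sketch for the tool above. The script name is
# hypothetical; the flags (--genes, --dbfile, -V/--verbose) come from the
# argument parser defined in main():
#
#   python generate_autism_gene_profile.py -VV \
#       --genes CHD8,SCN2A --dbfile /tmp/agpdb
#
# Passing --gene-sets-genes instead of --genes restricts the run to the
# genes of the configured gene sets; with neither flag, statistics are
# generated for every gene in the reference gene models.
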
# Tool: rebuild the Impala summary variants table of each study,
# one partition at a time.
def main(argv=None, gpf_instance=None):
    # Avoid binding sys.argv at import time via a default argument value.
    if argv is None:
        argv = sys.argv[1:]
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    # Either the explicitly requested studies or all non-group
    # (single-study) genotype data of the instance.
    if argv.studies is None:
        study_ids = [
            gd.study_id
            for gd in gpf_instance.get_all_genotype_data()
            if not gd.is_group
        ]
    else:
        study_ids = [sid.strip() for sid in argv.studies.split(",")]

    logger.info(
        f"building summary variants tables for studies: {study_ids}")

    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        assert study.study_id == study_id

        study_backend = study._backend
        if not isinstance(study_backend, ImpalaVariants):
            logger.warning(f"not an impala study: {study_id}; skipping...")
            continue
        if study_backend.variants_table is None:
            logger.warning(
                f"study {study_id} has no variants; skipping...")
            continue

        drop_summary_table(study_id, study_backend)
        partitions = create_summary_table(study_id, study_backend)
        summary_schema = collect_summary_schema(study_backend)
        summary_table = summary_table_name_temp(study_id, study_backend)
        pedigree_table = \
            f"{study_backend.db}.{study_backend.pedigree_table}"
        variants_table = \
            f"{study_backend.db}.{study_backend.variants_table}"

        # Collect the distinct values of each partition column present in
        # the source variants table.
        partition_bins = {}
        logger.info(
            f"collecting partitions {partitions} from "
            f"variants table {variants_table}")
        for partition in partitions:
            partition_bins[partition] = variants_parition_bins(
                study_backend, partition)
        logger.info(f"variant table partitions: {partition_bins}")

        impala = study_backend._impala_helpers

        started = time.time()

        region_bin_helpers = RegionBinsHelper(
            study_backend.table_properties,
            gpf_instance.get_genome())
        region_bin_helpers._build_region_bins()
        logger.info(
            f"region bins calculated: {region_bin_helpers.region_bins}")

        assert set(partition_bins["region_bin"]).issubset(
            set(region_bin_helpers.region_bins.keys()))

        # Cartesian product of all partition column values; the summary
        # table is filled one partition at a time.
        all_partitions = list(itertools.product(*partition_bins.values()))
        for index, partition in enumerate(all_partitions):
            partition = {
                key: value
                for key, value in zip(partition_bins.keys(), partition)
            }
            logger.info(
                f"building summary table for partition: "
                f"{index}/{len(all_partitions)}; "
                f"{partition} of {study_id}")
            part_started = time.time()
            for q in insert_into_summary_table(
                    pedigree_table, variants_table, summary_table,
                    summary_schema, partition,
                    region_bin_helpers.region_bins,
                    argv.split_size):
                # Impala queries occasionally fail transiently; retry each
                # INSERT up to ten times before giving up.
                repeat = 10
                while repeat > 0:
                    try:
                        with closing(impala.connection()) as connection:
                            with connection.cursor() as cursor:
                                logger.debug(
                                    f"going to run summary query: {q}")
                                cursor.execute(q)
                        break
                    except Exception:
                        logger.exception(f"error executing {q}")
                        time.sleep(6)
                        repeat -= 1
                        if repeat == 0:
                            raise

            part_elapsed = time.time() - part_started
            logger.info(
                f"processing partition "
                f"{index}/{len(all_partitions)} of {study_id} "
                f"took {part_elapsed:.2f} secs; "
                f"{partition}")

            elapsed = time.time() - started
            logger.info(
                f"processing partition "
                f"{index}/{len(all_partitions)} of {study_id}; "
                f"total time {elapsed:.2f} secs")

        # Swap the freshly built temporary table into place.
        rename_summary_table(study_id, study_backend)
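
# A minimal usage sketch for the summary-table builder above. The script
# name is hypothetical, and the exact flags are defined by
# parse_cli_arguments() (not shown here); --studies is assumed from the
# argv.studies attribute used in main():
#
#   python build_summary_variants_tables.py -VV --studies study1,study2
#
# With no --studies argument, the tool rebuilds the summary tables of all
# non-group Impala studies in the GPF instance.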