def upload_nodes(client_registry, cohort_def, cohort_tree, timer=None):
    """
    Upload the nodes of a populated cohort taxonomy tree.

    Takes a cohort definition and a taxonomy tree that has been populated
    with analytics results, and uploads the individual nodes with those
    attributes that are part of the api.

    client_registry (dict): maps collection names to CRUD clients; must
        contain an entry for cohort_phylo_tree_nodes.COLLECTION_NAME
    cohort_def (tle): the cohort the tree belongs to
    cohort_tree: taxonomy tree populated with analytics results
    timer (CheckpointTimer, optional): shared timer; a fresh one is
        created when None

    Raises RuntimeError if the bulk upload reports no result.
    """
    if timer is None:
        timer = CheckpointTimer("Cohort_Tree_Upload")
    cohort_name = cohort_def.get_value('display_name_short')
    timer.checkpoint('begin cohort tree upload: ' + cohort_name)
    nodes_client = client_registry[cohort_phylo_tree_nodes.COLLECTION_NAME]
    cohort_id = cohort_def.get_nest_id().get_value()
    node_tles = _extract_cohort_node_tles(cohort_tree, cohort_id)
    num_uploaded = nodes_client.bulk_create_entries_async(node_tles,
                                                          batch_size=3000)
    # An explicit raise instead of assert: asserts are stripped under -O,
    # and a silent failed upload here would corrupt the seeded data.
    if num_uploaded is None:
        raise RuntimeError('cohort tree node upload failed for: ' +
                           cohort_name)
    timer.checkpoint('tree upload complete for: ' + cohort_name)
def run(http_client, db_engine, data_dir, subsample, data_flavor_key):
    """
    Seed the mmbdb collections from biom data plus analytics results.

    http_client (NestHttpClient): api client; used when DB_VS_API is False
    db_engine (sqlalchemy.Engine): a postgres hook to the db we will use.
        Tables must already exist in the db. Used when DB_VS_API is True.
    data_dir (string): location to write data files
    subsample (bool): if true, only load a 100 samples from the biom_table
        and process. results will not be valid but all api endpoints will
        be populated with data
    data_flavor_key: selects which seed-data flavor get_data_flavor loads

    Returns an integer exit code (currently always 0).
    """
    timer = CheckpointTimer('mmbdb_seed')
    timer.checkpoint("mmbdb_seed_job: Begin")
    exit_code = 0

    clients = _connect_crud_clients(http_client, db_engine)

    # Download raw data
    timer.checkpoint("Downloading biom data if necessary")
    host_user = container_users.make_host_user_container_user()
    seed_data = get_data_flavor(data_flavor_key, data_dir, host_user,
                                subsample)
    biom_table = seed_data.get_biom_table()
    timer.checkpoint("Download complete.")
    timer.checkpoint("Downloaded/Loaded All Patient Metadata")

    tornado_run_id, otu_defs, geno_samples = _upload_primitives(
        clients, biom_table, timer)

    all_cohort_tles = _upload_cohorts(
        clients, seed_data, geno_samples, tornado_run_id, timer)

    all_comparisons = _upload_comparisons(
        clients, seed_data, all_cohort_tles, otu_defs, geno_samples,
        data_dir, host_user, timer)

    if DO_TREE_ANALYTICS:
        _run_tree_analytics(clients, all_cohort_tles, all_comparisons,
                            geno_samples, otu_defs, timer)

    timer.checkpoint("mmbdb_seed_job: Done")
    return exit_code


def _connect_crud_clients(http_client, db_engine):
    """Build the CRUD client registry, either direct-to-db or via the api."""
    if DB_VS_API:
        sqla_md = nest_db.get_global_sqlalchemy_metadata()
        clients = client_registry.make_db_client_registry(db_engine, sqla_md)
        for client in clients.values():
            jobs_auth.set_db_user(client)
    else:
        jobs_auth.login_jobs_user(http_client)
        clients = client_registry.make_api_client_registry(http_client)
    return clients


def _upload_primitives(clients, biom_table, timer):
    """Upload tornado_run, otu_defs and geno_samples; return their handles."""
    timer.checkpoint('uploading tornado_run: Begin')
    tornado_run_tle = biom_etl.upload_tornado_run(clients, biom_table)
    tornado_run_id = tornado_run_tle.get_nest_id().get_value()
    timer.checkpoint('uploading tornado_run: End')

    timer.checkpoint('uploading otu_defs: Begin')
    otu_defs = biom_etl.upload_otu_defs(clients, biom_table, tornado_run_id)
    timer.checkpoint('uploading otu_defs: End')

    timer.checkpoint('uploading geno_samples: Begin')
    geno_samples = biom_etl.upload_geno_samples(clients, biom_table,
                                                tornado_run_id, otu_defs)
    timer.checkpoint('uploading geno_samples: End')
    return tornado_run_id, otu_defs, geno_samples


def _upload_cohorts(clients, seed_data, geno_samples, tornado_run_id, timer):
    """Upload one cohort per config; return {display_name_short: tle}."""
    all_cohort_tles = dict()
    for cohort_config in seed_data.get_cohort_configs():
        cohort_key = cohort_config['display_name_short']
        timer.checkpoint('uploading cohort: ' + str(cohort_key))
        tornado_sample_keys = seed_data.get_tornado_sample_keys(cohort_key)
        sample_ids = cohort_etl.tornado_sample_keys_to_nest_ids(
            tornado_sample_keys, geno_samples)
        all_cohort_tles[cohort_key] = cohort_etl.upload_cohort(
            clients, cohort_config, sample_ids, tornado_run_id)
    return all_cohort_tles


def _upload_comparisons(clients, seed_data, all_cohort_tles, otu_defs,
                        geno_samples, data_dir, host_user, timer):
    """Compute fst and upload one comparison per config; return the tles."""
    all_comparisons = list()
    for comparison_config in seed_data.get_comparison_configs():
        comparison_key = comparison_config['comparison_key']
        baseline_cohort_tle = all_cohort_tles[
            comparison_config['baseline_cohort_key']]
        variant_cohort_tle = all_cohort_tles[
            comparison_config['variant_cohort_key']]
        patient_cohort_tle = all_cohort_tles[
            comparison_config['patient_cohort_key']]
        timer.checkpoint('fst begin for: ' + comparison_key)
        fst_results = fst_etl.get_fst_of_comparison(
            comparison_key, seed_data, baseline_cohort_tle,
            variant_cohort_tle, otu_defs, geno_samples, data_dir, host_user)
        timer.checkpoint('api upload begin: ' + comparison_key)
        comparison_tle = comparison_etl.upload_comparison(
            clients, comparison_config, baseline_cohort_tle,
            variant_cohort_tle, patient_cohort_tle, fst_results)
        timer.checkpoint('api upload done')
        all_comparisons.append(comparison_tle)
    return all_comparisons


def _run_tree_analytics(clients, all_cohort_tles, all_comparisons,
                        geno_samples, otu_defs, timer):
    """Compute and upload per-node analytics for cohorts and comparisons."""
    # cohort analytics also does the upload asap to reduce memory footprint
    cohort_analysis.compute_all_for_cohorts(clients,
                                            all_cohort_tles.values(),
                                            geno_samples,
                                            otu_defs,
                                            num_quantiles=NUM_QUANTILES,
                                            num_bins=NUM_BINS,
                                            timer=timer)
    taxonomy_empty_tree = otu_analysis.compute_taxonomy_tree(otu_defs)
    for comp in all_comparisons:
        tree = taxonomy_empty_tree.copy()
        log('begin analytics for comparison: ' + comp.get_value('display_name'))
        timer.checkpoint('computing comparison analytics:begin')
        comparison_analysis.compute_all(comp, otu_defs, tree)
        timer.checkpoint('computing comparison analytics:end')
        timer.checkpoint('uploading comparison analytics nodes:begin')
        comparison_tree_etl.upload_nodes(clients, comp, tree)
        timer.checkpoint('uploading comparison analytics nodes:end')
def run(http_client, db_engine, data_dir, subsample, flavor_name):
    """
    Seed the knoweng collections from the network data directory.

    http_client (NestHttpClient): an http client configured for a
        particular api server (NOT USED, db_utils reads direct from CONFIG)
    db_engine: (NOT USED, db_utils reads direct from CONFIG)
    data_dir (str): location to write data files
    subsample (bool): ignored
    flavor_name (str): ignored

    Returns an integer exit code (always 0).
    """
    # TODO fix this
    data_dir = '/'
    timer = CheckpointTimer('knoweng_seed')
    timer.checkpoint("knoweng_seed_job: Begin")
    exit_code = 0
    # will read from nest_config parameters shared with flask
    db_utils.init_crud_clients()

    # Gather everything to load from the networks directory.
    network_base_dir = os.path.join(data_dir, 'networks')
    merged_networks = networks.get_merged_network_info(network_base_dir)
    all_species = networks.get_species(network_base_dir)
    collections = networks.get_collections(all_species, merged_networks)
    analysis_networks = networks.get_analysis_networks(all_species,
                                                       merged_networks)

    # Run each loader in order, checkpointing before each one.
    load_steps = [
        ("Loading public gene sets",
         load_public_gene_sets, (merged_networks, network_base_dir)),
        ("Loading species", load_species, (all_species,)),
        ("Loading collections", load_collections, (collections,)),
        ("Loading analysis networks",
         load_analysis_networks, (analysis_networks,)),
    ]
    for message, loader, loader_args in load_steps:
        timer.checkpoint(message)
        loader(*loader_args)

    timer.checkpoint("knoweng_seed_job: Done")
    return exit_code