Example #1
def upload_nodes(client_registry, cohort_def, cohort_tree, timer=None):
    """
    Takes a cohort definition and a taxonomy tree that has been populated
    with analytics results. Uploads the individual nodes with those
    attributes that are part of the api.
    """
    if timer is None:
        timer = CheckpointTimer("Cohort_Tree_Upload")
    cohort_name = cohort_def.get_value('display_name_short')
    timer.checkpoint('begin cohort tree upload: ' + cohort_name)
    nodes_client = client_registry[cohort_phylo_tree_nodes.COLLECTION_NAME]
    cohort_id = cohort_def.get_nest_id().get_value()
    node_tles = _extract_cohort_node_tles(cohort_tree, cohort_id)
    num_uploaded = nodes_client.bulk_create_entries_async(node_tles, batch_size=3000)
    assert num_uploaded is not None
    timer.checkpoint('tree upload complete for: ' + cohort_name)
    return
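
All three jobs lean on CheckpointTimer for coarse progress logging. The class is not shown here; below is a minimal sketch of what it might look like, assuming checkpoint() simply prints the message with the elapsed time since construction (the real implementation may differ):

import time

class CheckpointTimer:
    """Sketch only; assumed behavior, not the real implementation."""

    def __init__(self, name):
        self.name = name
        self.start = time.time()

    def checkpoint(self, message):
        # log the job name, seconds elapsed since start, and the message
        elapsed = time.time() - self.start
        print('[%s +%.1fs] %s' % (self.name, elapsed, message))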
Example #2
def run(http_client, db_engine, data_dir, subsample, data_flavor_key):
    """
    http_client (NestHttpClient): http client configured for the target
        api server (used when DB_VS_API is false)
    db_engine (sqlalchemy.Engine): a postgres hook to the db we will use.
        Tables must already exist in the db.
    data_dir (string): location to write data files
    subsample (bool): if true, only load 100 samples from the biom_table
        and process. Results will not be valid, but all api endpoints
        will be populated with data.
    data_flavor_key (string): selects which flavor of seed data to load
        (see get_data_flavor)
    """
    timer = CheckpointTimer('mmbdb_seed')
    timer.checkpoint("mmbdb_seed_job: Begin")
    exit_code = 0

    ###############
    # Connect CRUD clients
    ###############
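    # DB_VS_API is assumed to be a module-level flag: when true, CRUD clients
    # talk to the database directly via sqlalchemy; otherwise they go through
    # the http api as the logged-in jobs user.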
    if DB_VS_API:
        sqla_md = nest_db.get_global_sqlalchemy_metadata()
        clients = client_registry.make_db_client_registry(db_engine, sqla_md)
        for client in clients.values():
            jobs_auth.set_db_user(client)
    else:
        jobs_auth.login_jobs_user(http_client)
        clients = client_registry.make_api_client_registry(http_client)

    ###################
    # Download Raw Data
    ###################

    timer.checkpoint("Downloading biom data if necessary")
    host_user = container_users.make_host_user_container_user()
    seed_data = get_data_flavor(data_flavor_key, data_dir, host_user,
                                subsample)
    biom_table = seed_data.get_biom_table()
    timer.checkpoint("Download complete.")
    timer.checkpoint("Downloaded/Loaded All Patient Metadata")

    ####################
    # Upload Primitive Data
    ####################
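    # Upload order matters here: otu_defs and geno_samples both reference the
    # tornado_run id, and geno_samples additionally needs the uploaded otu_defs.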

    timer.checkpoint('uploading tornado_run: Begin')
    tornado_run_tle = biom_etl.upload_tornado_run(clients, biom_table)
    tornado_run_nest_id = tornado_run_tle.get_nest_id()
    tornado_run_id = tornado_run_nest_id.get_value()
    timer.checkpoint('uploading tornado_run: End')

    timer.checkpoint('uploading otu_defs: Begin')
    otu_defs = biom_etl.upload_otu_defs(clients, biom_table, tornado_run_id)
    timer.checkpoint('uploading otu_defs: End')

    timer.checkpoint('uploading geno_samples: Begin')
    geno_samples = biom_etl.upload_geno_samples(clients, biom_table,
                                                tornado_run_id, otu_defs)
    timer.checkpoint('uploading geno_samples: End')

    ####################
    # Define Cohorts
    ####################
    all_cohort_tles = dict()

    cohort_configs = seed_data.get_cohort_configs()

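    # For each configured cohort: resolve its tornado sample keys to nest ids,
    # upload the cohort, and keep the returned TLE for the comparison and
    # analytics steps below.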
    for cohort_config in cohort_configs:
        cohort_key = cohort_config['display_name_short']
        timer.checkpoint('uploading cohort: ' + str(cohort_key))
        tornado_sample_keys = seed_data.get_tornado_sample_keys(cohort_key)
        sample_ids = cohort_etl.tornado_sample_keys_to_nest_ids(
            tornado_sample_keys, geno_samples)
        cohort_tle = cohort_etl.upload_cohort(clients, cohort_config,
                                              sample_ids, tornado_run_id)
        all_cohort_tles[cohort_key] = cohort_tle

    ####################
    # Define Comparisons
    ####################
    all_comparisons = list()

    comparison_configs = seed_data.get_comparison_configs()

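    # Each comparison pairs a baseline cohort against a variant cohort (plus an
    # associated patient cohort). FST results are computed first, then the
    # comparison entry is uploaded referencing all three cohort TLEs.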
    for comparison_config in comparison_configs:
        comparison_key = comparison_config['comparison_key']
        baseline_key = comparison_config['baseline_cohort_key']
        baseline_cohort_tle = all_cohort_tles[baseline_key]
        variant_key = comparison_config['variant_cohort_key']
        variant_cohort_tle = all_cohort_tles[variant_key]
        patient_key = comparison_config['patient_cohort_key']
        patient_cohort_tle = all_cohort_tles[patient_key]

        timer.checkpoint('fst begin for: ' + comparison_key)
        fst_results = fst_etl.get_fst_of_comparison(comparison_key, seed_data,
                                                    baseline_cohort_tle,
                                                    variant_cohort_tle,
                                                    otu_defs, geno_samples,
                                                    data_dir, host_user)

        timer.checkpoint('api upload begin: ' + comparison_key)
        comparison_tle = comparison_etl.upload_comparison(
            clients, comparison_config, baseline_cohort_tle,
            variant_cohort_tle, patient_cohort_tle, fst_results)
        timer.checkpoint('api upload done')

        all_comparisons.append(comparison_tle)

    if DO_TREE_ANALYTICS:

        ###############
        # Cohort Node Analytics
        ###############
        # this also uploads each cohort's results as soon as they are computed,
        # to reduce the memory footprint
        cohort_analysis.compute_all_for_cohorts(clients,
                                                all_cohort_tles.values(),
                                                geno_samples,
                                                otu_defs,
                                                num_quantiles=NUM_QUANTILES,
                                                num_bins=NUM_BINS,
                                                timer=timer)

        ###############
        # Comparison Node Analytics
        ###############
        taxonomy_empty_tree = otu_analysis.compute_taxonomy_tree(otu_defs)
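        # give each comparison its own copy of the empty taxonomy tree so
        # results from one comparison never leak into the next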
        for comp in all_comparisons:
            tree = taxonomy_empty_tree.copy()
            log('begin analytics for comparison: ' +
                comp.get_value('display_name'))
            timer.checkpoint('computing comparison analytics:begin')
            comparison_analysis.compute_all(comp, otu_defs, tree)
            timer.checkpoint('computing comparison analytics:end')

            timer.checkpoint('uploading comparison analytics nodes:begin')
            comparison_tree_etl.upload_nodes(clients, comp, tree)
            timer.checkpoint('uploading comparison analytics nodes:end')

    timer.checkpoint("mmbdb_seed_job: Done")
    return exit_code
Example #3
def run(http_client, db_engine, data_dir, subsample, flavor_name):
    """
    http_client (NestHttpClient): an http client configured for a particular
        api server (NOT USED; db_utils reads directly from CONFIG)
    db_engine: (NOT USED; db_utils reads directly from CONFIG)
    data_dir (str): location to write data files
    subsample (bool): ignored
    flavor_name (str): ignored
    """

    # TODO: fix this hard-coded override of data_dir
    data_dir = '/'

    timer = CheckpointTimer('knoweng_seed')
    timer.checkpoint("knoweng_seed_job: Begin")
    exit_code = 0

    # will read connection parameters from nest_config (shared with flask)
    db_utils.init_crud_clients()

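    # discover network metadata under <data_dir>/networks, then load each
    # derived piece (gene sets, species, collections, analysis networks) below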
    network_base_dir = os.path.join(data_dir, 'networks')
    merged_networks = networks.get_merged_network_info(network_base_dir)
    all_species = networks.get_species(network_base_dir)
    collections = networks.get_collections(all_species, merged_networks)
    analysis_networks = networks.get_analysis_networks(all_species,
                                                       merged_networks)

    timer.checkpoint("Loading public gene sets")
    load_public_gene_sets(merged_networks, network_base_dir)
    timer.checkpoint("Loading species")
    load_species(all_species)
    timer.checkpoint("Loading collections")
    load_collections(collections)
    timer.checkpoint("Loading analysis networks")
    load_analysis_networks(analysis_networks)
    timer.checkpoint("knoweng_seed_job: Done")

    return exit_code
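
Because both client arguments are unused (db_utils reads CONFIG directly), subsample and flavor_name are ignored, and data_dir is currently overridden to '/', a minimal invocation sketch reduces to:

# hypothetical driver; all arguments are effectively ignored at present
exit_code = run(None, None, data_dir='/', subsample=False, flavor_name='')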