Example #1
def checkpoint(self, msg):
    """
    The final logged message will be:
    [<time_since_start>, <time_since_last_checkpoint>] msg
    """
    checkpoint_msg = self._make_checkpoint_message(msg)
    log(checkpoint_msg)
    return
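checkpoint() is a method of the CheckpointTimer class that Example #5 instantiates as CheckpointTimer('mmbdb_seed'). The rest of the class is not shown, so the following is only a minimal sketch under assumptions: the attribute names, the time arithmetic inside _make_checkpoint_message, and the log = print stand-in are guesses; only checkpoint() itself and the "[<time_since_start>, <time_since_last_checkpoint>] msg" format come from the example above.

import time

log = print  # stand-in for the project's log() helper used throughout these examples

class CheckpointTimer(object):

    def __init__(self, name):
        # assumed attributes: the real class presumably records comparable timestamps
        self.name = name
        self.start_time = time.time()
        self.last_checkpoint_time = self.start_time

    def _make_checkpoint_message(self, msg):
        # assumed helper: formats "[<time_since_start>, <time_since_last_checkpoint>] msg"
        now = time.time()
        since_start = now - self.start_time
        since_last = now - self.last_checkpoint_time
        self.last_checkpoint_time = now
        return '[%.2f, %.2f] %s' % (since_start, since_last, msg)

    def checkpoint(self, msg):
        checkpoint_msg = self._make_checkpoint_message(msg)
        log(checkpoint_msg)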
Example #2
def load_biom_table(filename, subsample=False):
    """
    Loads a biom-format table from filename. If subsample is True, only
    100 randomly chosen sample ids are kept so downstream steps stay small.
    """
    biom_table = biom.load_table(filename)

    if subsample:
        # keep only 100 randomly chosen sample ids; an observation-axis
        # alternative is left commented out for reference
        #biom_table = biom_table.subsample(500, axis="observation", by_id=False)
        biom_table = biom_table.subsample(100, axis="sample", by_id=True)

    log('biom_table shape: ' + str(biom_table.shape))
    return biom_table
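A hypothetical call, for illustration only (the filename is made up): with subsample=True just 100 randomly chosen sample ids survive, which is the quick test-data path described in Example #5's docstring.

biom_table = load_biom_table('table.biom', subsample=True)  # hypothetical filename
tornado_sample_keys = biom_table.ids('sample')  # same accessor Example #4 uses
log('loaded ' + str(len(tornado_sample_keys)) + ' sample ids')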
Example #3
def run(http_client, hello_world_data_dir):
    """
    hello_world_data_dir when inside a container is probably:
     /code_live/data/project/hello_world
    """
    log("hello_world_seed_job: Begin")
    exit_code = 0
    jobs_auth.login_jobs_user(http_client)

    x0 = hello_tablelike.HelloTablelikeDTO(1.0, 2.0, 'x',
        ['a','b'], [1.1, 2.2], NestId(0), [NestId(1), NestId(2)], 
        {'x':'innerx'}, 5, [6,7])

    client_makers = hw_api_clients.get_api_client_makers()
    client = client_makers['hello_tablelike'].get_crud_client(http_client)

    eve_atts0 = client.create_entry(x0.to_tablelike_entry())
    if eve_atts0 is None:
        exit_code = 1 

    log('downloading test file from box')
    file_owner = container_users.make_host_user_container_user()
    abs_filename_0 = hello_world_data_dir + '/hello/box_test_0.txt'
    box_url_0 = "https://uofi.box.com/shared/static/y8a7qmgskm73rpovf16j96yr3st7ea96.txt"
    box_downloads.download_from_box_no_auth(box_url_0, abs_filename_0, 
        file_owner=file_owner, force=True)
    box_downloads.download_from_box_no_auth(box_url_0, abs_filename_0, 
        file_owner=file_owner, force=False)
    assert os.path.isfile(abs_filename_0)
    log('verifying test download contents')
    verify_test_file_0(abs_filename_0)

    log("hello_world_seed_job: Done")
    return exit_code
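verify_test_file_0 is called above but not shown. The expected contents of box_test_0.txt aren't given in this excerpt, so the sketch below is only a placeholder that checks a non-empty file was downloaded; the real helper presumably compares against known text.

def verify_test_file_0(abs_filename):
    # placeholder check only; the real implementation presumably verifies
    # the downloaded text matches the known contents of box_test_0.txt
    with open(abs_filename) as f:
        contents = f.read()
    assert len(contents) > 0, 'downloaded box test file is empty'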
Example #4
def upload_geno_samples(client_registry, biom_table, tornado_run_id, otu_defs):
    """
    uploads basic description of genome sample data from the biom_table
    (one entry per sample). 

    Note: currently does not upload the otu values themselves, as we don't
    support sparse data well yet. (that data is just kept in memory in this
    seed_job)

    returns a dict of tornado_sample_key to TablelikeEntry's that conform to
    the geno_samples schema (with eve_attributes returned by the upload)
    """
    geno_samples_schema = geno_samples.generate_schema()
    geno_samples_client = client_registry[geno_samples.COLLECTION_NAME]

    tornado_sample_keys = biom_table.ids('sample')
    log('num geno_sample ids to upload: ' + str(len(tornado_sample_keys)))
    geno_samples_list = list()
    otu_counts_lookup = dict()  #k: sample_key, v: otu_counts_of_sample
    for tornado_sample_key in tornado_sample_keys:
        bt_sample_metadata = biom_table.metadata(tornado_sample_key, 'sample')
        study_key = bt_sample_metadata['Source_mapping_file']
        #study_sample_key = bt_sample_metadata['external']
        #it seems like the mappings file doesn't tell us anything unique... the sample_ids
        #in the biom_table are universally unique, and they exist in the SampleId column of the
        #patient_metadata files
        study_sample_key = tornado_sample_key
        tle = tablelike_entry.TablelikeEntry(geno_samples_schema)
        tle.set_value('tornado_sample_key', tornado_sample_key)
        tle.set_value('study_sample_key', study_sample_key)
        tle.set_value('tornado_run_id', tornado_run_id)
        tle.set_value('study_key', study_key)
        geno_samples_list.append(tle)

    geno_samples_list = geno_samples_client.bulk_create_entries(
        geno_samples_list)
    assert geno_samples_list is not None

    log('geno_sample defs uploaded')
    log('extracting otu_counts for each geno_sample')
    #for now, add the gross_counts field after we've uploaded to the
    #server. the big data is then available to other analytics but
    #we don't have a good way to POST it yet (TablelikeSchema doesn't handle
    #sparse arrays yet)
    otu_counts_of_samples = _extract_otu_counts_for_samples(
        biom_table, otu_defs)
    for tle in geno_samples_list:
        sample_tornado_key = tle.get_value('tornado_sample_key')
        otu_counts_of_sample = otu_counts_of_samples[sample_tornado_key]
        tle.set_value('otu_counts', otu_counts_of_sample)

    geno_samples_lookup = dict()
    for tle in geno_samples_list:
        key = tle.get_value('tornado_sample_key')
        geno_samples_lookup[key] = tle

    return geno_samples_lookup
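The returned geno_samples_lookup is what Example #5 hands to cohort_etl.tornado_sample_keys_to_nest_ids. That helper isn't shown; below is a plausible sketch, assuming the entries returned by bulk_create_entries expose get_nest_id() the same way the tornado_run entry does in Example #5.

def tornado_sample_keys_to_nest_ids(tornado_sample_keys, geno_samples_lookup):
    # resolve each sample key to the NestId the upload assigned to its entry
    nest_ids = list()
    for sample_key in tornado_sample_keys:
        tle = geno_samples_lookup[sample_key]
        nest_ids.append(tle.get_nest_id())
    return nest_ids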
Example #5
def run(http_client, db_engine, data_dir, subsample, data_flavor_key):
    """
    db_engine, sqla_md (sqlalchemy.Engine): a postgres hook to the
        db we will use. Tables must already exist in the db.
    data_dir (string): location to write data files 
    subsample (bool) if true, only load a 100 samples from the biom_table 
        and process. results will not be valid but all api endpoints will 
        be populated with data
    """
    timer = CheckpointTimer('mmbdb_seed')
    timer.checkpoint("mmbdb_seed_job: Begin")
    exit_code = 0
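    # DB_VS_API, DO_TREE_ANALYTICS, NUM_QUANTILES and NUM_BINS are presumably
    # module-level constants of this seed job; they are not defined in this excerpt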

    ###############
    ##Connect CRUD clients
    ###############
    if DB_VS_API:
        sqla_md = nest_db.get_global_sqlalchemy_metadata()
        clients = client_registry.make_db_client_registry(db_engine, sqla_md)
        for client in clients.values():
            jobs_auth.set_db_user(client)
    else:
        jobs_auth.login_jobs_user(http_client)
        clients = client_registry.make_api_client_registry(http_client)

    ###################
    ##Download Raw Data
    ###################

    timer.checkpoint("Downloading biom data if necessary")
    host_user = container_users.make_host_user_container_user()
    seed_data = get_data_flavor(data_flavor_key, data_dir, host_user,
                                subsample)
    biom_table = seed_data.get_biom_table()
    timer.checkpoint("Download complete.")
    timer.checkpoint("Downloaded/Loaded All Patient Metadata")

    ####################
    ##Upload Primitive Data
    ####################

    timer.checkpoint('uploading tornado_run: Begin')
    tornado_run_tle = biom_etl.upload_tornado_run(clients, biom_table)
    tornado_run_nest_id = tornado_run_tle.get_nest_id()
    tornado_run_id = tornado_run_nest_id.get_value()
    timer.checkpoint('uploading tornado_run: End')

    timer.checkpoint('uploading otu_defs: Begin')
    otu_defs = biom_etl.upload_otu_defs(clients, biom_table, tornado_run_id)
    timer.checkpoint('uploading otu_defs: End')

    timer.checkpoint('uploading geno_samples: Begin')
    geno_samples = biom_etl.upload_geno_samples(clients, biom_table,
                                                tornado_run_id, otu_defs)
    timer.checkpoint('uploading geno_samples: End')

    ####################
    ##Define Cohorts
    ####################
    all_cohort_tles = dict()

    cohort_configs = seed_data.get_cohort_configs()

    for cohort_config in cohort_configs:
        cohort_key = cohort_config['display_name_short']
        timer.checkpoint('uploading cohort: ' + str(cohort_key))
        tornado_sample_keys = seed_data.get_tornado_sample_keys(cohort_key)
        sample_ids = cohort_etl.tornado_sample_keys_to_nest_ids(
            tornado_sample_keys, geno_samples)
        cohort_tle = cohort_etl.upload_cohort(clients, cohort_config,
                                              sample_ids, tornado_run_id)
        all_cohort_tles[cohort_key] = cohort_tle

    ####################
    ##Define Comparisons
    ####################
    all_comparisons = list()

    comparison_configs = seed_data.get_comparison_configs()

    for comparison_config in comparison_configs:
        comparison_key = comparison_config['comparison_key']
        baseline_key = comparison_config['baseline_cohort_key']
        baseline_cohort_tle = all_cohort_tles[baseline_key]
        variant_key = comparison_config['variant_cohort_key']
        variant_cohort_tle = all_cohort_tles[variant_key]
        patient_key = comparison_config['patient_cohort_key']
        patient_cohort_tle = all_cohort_tles[patient_key]

        timer.checkpoint('fst begin for: ' + comparison_key)
        fst_results = fst_etl.get_fst_of_comparison(comparison_key, seed_data,
                                                    baseline_cohort_tle,
                                                    variant_cohort_tle,
                                                    otu_defs, geno_samples,
                                                    data_dir, host_user)

        timer.checkpoint('api upload begin: ' + comparison_key)
        comparison_tle = comparison_etl.upload_comparison(
            clients, comparison_config, baseline_cohort_tle,
            variant_cohort_tle, patient_cohort_tle, fst_results)
        timer.checkpoint('api upload done')

        all_comparisons.append(comparison_tle)

    if DO_TREE_ANALYTICS:

        ###############
        ##Cohort Node Analytics
        ###############
        #this also does the upload asap to reduce memory footprint
        cohort_analysis.compute_all_for_cohorts(clients,
                                                all_cohort_tles.values(),
                                                geno_samples,
                                                otu_defs,
                                                num_quantiles=NUM_QUANTILES,
                                                num_bins=NUM_BINS,
                                                timer=timer)

        ###############
        ##Comparison Node Analytics
        ###############
        taxonomy_empty_tree = otu_analysis.compute_taxonomy_tree(otu_defs)
        for comp in all_comparisons:
            tree = taxonomy_empty_tree.copy()
            log('begin analytics for comparison: ' +
                comp.get_value('display_name'))
            timer.checkpoint('computing comparison analytics:begin')
            comparison_analysis.compute_all(comp, otu_defs, tree)
            timer.checkpoint('computing comparison analytics:end')

            timer.checkpoint('uploading comparison analytics nodes:begin')
            comparison_tree_etl.upload_nodes(clients, comp, tree)
            timer.checkpoint('uploading comparison analytics nodes:end')

    timer.checkpoint("mmbdb_seed_job: Done")
    return exit_code
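A hypothetical invocation, only to show the argument shapes; make_http_client, make_postgres_engine, the data_dir path, and the 'default' data_flavor_key are all made up, and in practice the surrounding seed-job framework presumably supplies them.

if __name__ == '__main__':
    import sys
    http_client = make_http_client()    # hypothetical helper
    db_engine = make_postgres_engine()  # hypothetical helper
    sys.exit(run(http_client, db_engine,
                 data_dir='/tmp/mmbdb_seed',   # made-up location
                 subsample=True,
                 data_flavor_key='default'))   # made-up flavor key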