Example #1
def _prof_main():
    start_time = CheckpointTimer.current_time()
    exit_code = main(sys.argv[1:])
    if exit_code == 0:
        status = 'SUCCESS'
    else:
        status = 'FAILURE'

    end_time = CheckpointTimer.current_time()
    elapsed_secs = end_time - start_time
    formatted_secs = CheckpointTimer.format_elapsed_secs(elapsed_secs)

    print('nest_ops exit_code: ' + str(exit_code) + ' (' + status +
          '). Took: ' + formatted_secs)
    sys.exit(exit_code)
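
CheckpointTimer itself is not shown in any of these examples. The following is a hypothetical minimal sketch, written only to illustrate the calls used above and below (current_time, format_elapsed_secs, checkpoint, _make_checkpoint_message); the real class in the repository may differ.

import time


class CheckpointTimer(object):
    """Hypothetical sketch of a checkpoint timer; not the repository's actual class."""

    def __init__(self, timer_name):
        self.timer_name = timer_name
        self.start_time = CheckpointTimer.current_time()
        self.last_time = self.start_time

    @staticmethod
    def current_time():
        # seconds since the epoch, as subtracted in _prof_main above
        return time.time()

    @staticmethod
    def format_elapsed_secs(elapsed_secs):
        # human-readable duration; the exact format string is a guess
        return '%.1f secs' % elapsed_secs

    def _make_checkpoint_message(self, message):
        now = CheckpointTimer.current_time()
        since_last = CheckpointTimer.format_elapsed_secs(now - self.last_time)
        since_start = CheckpointTimer.format_elapsed_secs(now - self.start_time)
        self.last_time = now
        return ('[' + self.timer_name + '] ' + message +
                ' (since last checkpoint: ' + since_last +
                ', since start: ' + since_start + ')')

    def checkpoint(self, message):
        print(self._make_checkpoint_message(message))
        return
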
Example #2
    def __init__(self,
                 job_key,
                 container_user,
                 job_run_file_space,
                 config_jdata,
                 log_stdout=True):

        self.job_key = job_key
        self.file_space = job_run_file_space
        self.container_user = container_user
        self.logger = _setup_logger(job_key, job_run_file_space, log_stdout)
        self.success = None
        self.config_jdata = config_jdata
        self.runtime_objects = dict()
        self.timer = CheckpointTimer(job_key)
        return
Example #3
def compute_cohort_aggregates(cohort_def, taxonomy_tree,
        num_quantiles=8, num_bins=20, unique_otus_bin_max=20000, 
        timer=CheckpointTimer('cohort_aggregates')):
    """
    takes a cohort def (that includes the geno_sample ids in the cohort)
    and a copy of an AttributeTree that represents just the taxonomy,
    already populated with per-sample metrics.
    computes aggregates (quantiles, medians, means, histogram bins, and
    density scatterplots) for every node on the tree and attaches the
    outputs as attributes of the taxonomy tree nodes.
    the per-sample metrics being aggregated are assumed to have already
    been computed by compute_cohort_metrics.
    """
    cohort_name = cohort_def.get_value('display_name_short')
    timer.checkpoint('begin cohort aggregates: ' + cohort_name)

    compute_quantiles(taxonomy_tree, 'abundance_frac_per_sample', 
        'abundance_frac_quantiles', num_quantiles)

    compute_medians(taxonomy_tree, 'abundance_frac_quantiles',
        'abundance_frac_median')

    compute_means(taxonomy_tree, 'abundance_frac_per_sample',
        'abundance_frac_mean')

    compute_zero_separated_histograms(taxonomy_tree, 'abundance_frac_per_sample',
        'abundance_frac_histo', 0.0, 1.0, num_bins)

    compute_quantiles(taxonomy_tree, 'num_unique_otus_per_sample', 
        'num_unique_otus_quantiles', num_quantiles)

    compute_medians(taxonomy_tree, 'num_unique_otus_quantiles',
        'num_unique_otus_median')

    compute_means(taxonomy_tree, 'num_unique_otus_per_sample',
        'num_unique_otus_mean')

    compute_means(taxonomy_tree, 'normalized_entropy_per_sample',
        'normalized_entropy_mean')

    compute_zero_separated_histograms(taxonomy_tree, 'normalized_entropy_per_sample',
        'normalized_entropy_histo', 0.0, 1.0, num_bins)

    compute_pdf_scatterplots(taxonomy_tree, 
        'abundance_frac_quantiles', 'abundance_frac_density_plot')

    compute_pdf_scatterplots(taxonomy_tree, 
        'num_unique_otus_quantiles', 'num_unique_otus_density_plot')

    compute_zero_separated_histograms(taxonomy_tree, 'num_unique_otus_per_sample',
        'num_unique_otus_histo', 0.0, unique_otus_bin_max, num_bins)

    timer.checkpoint('end cohort aggregates: ' + cohort_name)
    #pretty_print_jdata(taxonomy_tree.to_jdata())
    return taxonomy_tree
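
Note that the timer=CheckpointTimer('cohort_aggregates') default above is evaluated once, when the module is imported, so every caller that omits timer shares a single timer started at import time. Example #6 below uses a timer=None default instead; a sketch of that idiom applied to this signature (assuming the shared timer is not intentional) would be:

def compute_cohort_aggregates(cohort_def, taxonomy_tree,
        num_quantiles=8, num_bins=20, unique_otus_bin_max=20000,
        timer=None):
    # create a fresh timer per call instead of one shared timer
    # that the default argument creates once at import time
    if timer is None:
        timer = CheckpointTimer('cohort_aggregates')
    # ... rest of the body unchanged from the example above ...
    return taxonomy_tree
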
Example #4
def run(job_key, config_abs_filename, wix_data_dir):
    """
    entry into the wix environment to trigger a job

    job_key (string) is the name of the job to run; must be a key
        in COMMAND_LOOKUP

    config_abs_filename (string) is a filename or None

    wix_data_dir (string) is a directory name that all data
        from all jobs will be put into, organized by
        job name and run
    """
    start_time = CheckpointTimer.current_time()
    jcx = _make_job_context(job_key, config_abs_filename, wix_data_dir)
    _symlink_current_run(jcx)
    _write_config_copy_to_rundir(jcx)
    jcx.log("WIX RUNNER: Running job '" + job_key + "', run at: '" +
            jcx.get_run_dir() + ', start time = ' + str(start_time))

    if job_key in COMMAND_LOOKUP:
        try:
            COMMAND_LOOKUP[job_key](jcx)
            if jcx.succeeded():
                exit_code = 0  #success exit code
            else:
                exit_code = 1
        except Exception as e:
            stacktrace = traceback.format_exc()
            jcx.log('Exception = ' + str(e))
            jcx.log(str(stacktrace))
            exit_code = 1
    else:
        jcx.log('not a valid wix command')
        exit_code = 1
    if exit_code == 0:
        status = 'SUCCESS'
    else:
        status = 'FAILURE'
    elapsed_secs = CheckpointTimer.current_time() - start_time
    formatted_secs = CheckpointTimer.format_elapsed_secs(elapsed_secs)
    jcx.log("WIX RUNNER complete: (" + status + ") Took: " + formatted_secs)
    return exit_code
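
COMMAND_LOOKUP is referenced but not defined in this excerpt; from the dispatch above it maps a job_key string to a callable that takes the JobContext. A hypothetical registration, using the hello_world job name that appears in the JobContext docstrings, might look like:

def _hello_world_job(jcx):
    # hypothetical job body: log, checkpoint, then declare the outcome
    jcx.checkpoint('hello_world: begin')
    jcx.log('running in ' + jcx.get_run_local_data_dir())
    jcx.declare_success()
    return


# job_key -> callable taking a JobContext, as dispatched by run() above
COMMAND_LOOKUP = {
    'hello_world': _hello_world_job,
}
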
Example #5
def compute_cohort_metrics(cohort_def, all_geno_samples, otu_defs, taxonomy_tree,
    timer=CheckpointTimer('cohort_metrics')):
    """
    takes a cohort def (that includes the geno_sample ids in the cohort),
    and the master list of all geno_samples that were part of the tornado run.
    Also takes the otu defs and a copy of an AttributeTree that represents just
    the taxonomy.

    this computes all of the core metrics that we need at every node, at every sample
    """
    cohort_name = cohort_def.get_value('display_name_short')
    timer.checkpoint('begin cohort basic metrics: ' + cohort_name)
    geno_sample_lst = subset_samples_by_cohort(all_geno_samples, cohort_def)
    sum_gross_counts_per_sample(taxonomy_tree, otu_defs, geno_sample_lst)
    compute_abundance_frac_per_sample(taxonomy_tree)
    compute_unique_otus_per_sample(taxonomy_tree)
    compute_normalized_entropy_per_sample(taxonomy_tree)
    timer.checkpoint('end cohort basic metrics: ' + cohort_name)
    return
Example #6
def upload_nodes(client_registry, cohort_def, cohort_tree, timer=None):
    """
    takes a cohort definition and a taxonomy tree that has been populated
    with analytics results. uploads the individual nodes with those attributes
    that are part of the api
    """
    if timer is None:
        timer = CheckpointTimer("Cohort_Tree_Upload")
    cohort_name = cohort_def.get_value('display_name_short')
    timer.checkpoint('begin cohort tree upload: ' + cohort_name)
    nodes_client = client_registry[cohort_phylo_tree_nodes.COLLECTION_NAME]
    cohort_id = cohort_def.get_nest_id().get_value()
    node_tles = _extract_cohort_node_tles(cohort_tree, cohort_id)
    num_uploaded = nodes_client.bulk_create_entries_async(node_tles, batch_size=3000)
    assert num_uploaded is not None
    timer.checkpoint('tree upload complete for: ' + cohort_name)
    return
Example #7
def compute_all_for_cohorts(client_registry, cohort_tles, all_geno_samples, otu_defs,
    num_quantiles=8, num_bins=20, timer=CheckpointTimer("cohort_analysis")):
    """
    compute all analytics for each cohort in a list. some scaling
    factors (e.g. the global max unique otus used for histogram
    binning) are shared across all cohorts.

    """
    #deduce the structure of the tree, which we will copy each time
    taxonomy_empty_tree = otu_analysis.compute_taxonomy_tree(otu_defs)

    global_max_unique_otus = compute_global_max_unique_otus(cohort_tles,
        all_geno_samples, otu_defs, taxonomy_empty_tree)

    for cohort in cohort_tles:
        tree = taxonomy_empty_tree.copy()
        compute_cohort_metrics(cohort, all_geno_samples, otu_defs, tree, timer=timer)
        compute_cohort_aggregates(cohort, tree, 
            num_quantiles=num_quantiles,
            num_bins=num_bins,
            unique_otus_bin_max=global_max_unique_otus,
            timer=timer)

        cohort_tree_etl.upload_nodes(client_registry, cohort, tree, timer=timer)
    return 
Example #8
def run(http_client, db_engine, data_dir, subsample, data_flavor_key):
    """
    http_client (NestHttpClient): http client used to build the api
        CRUD clients when DB_VS_API is False
    db_engine (sqlalchemy.Engine): a postgres hook to the
        db we will use. Tables must already exist in the db.
    data_dir (string): location to write data files
    subsample (bool): if true, only load 100 samples from the biom_table
        and process. results will not be valid but all api endpoints will
        be populated with data
    data_flavor_key (string): selects which flavor of seed data to
        download and load
    """
    timer = CheckpointTimer('mmbdb_seed')
    timer.checkpoint("mmbdb_seed_job: Begin")
    exit_code = 0

    ###############
    ##Connect CRUD clients
    ################
    if DB_VS_API:
        sqla_md = nest_db.get_global_sqlalchemy_metadata()
        clients = client_registry.make_db_client_registry(db_engine, sqla_md)
        for client in clients.values():
            jobs_auth.set_db_user(client)
    else:
        jobs_auth.login_jobs_user(http_client)
        clients = client_registry.make_api_client_registry(http_client)

    ###################
    ##Download Raw Data
    ###################

    timer.checkpoint("Downloading biom data if necessary")
    host_user = container_users.make_host_user_container_user()
    seed_data = get_data_flavor(data_flavor_key, data_dir, host_user,
                                subsample)
    biom_table = seed_data.get_biom_table()
    timer.checkpoint("Download complete.")
    timer.checkpoint("Downloaded/Loaded All Patient Metadata")

    ####################
    ##Upload Primitive Data
    ####################

    timer.checkpoint('uploading tornado_run: Begin')
    tornado_run_tle = biom_etl.upload_tornado_run(clients, biom_table)
    tornado_run_nest_id = tornado_run_tle.get_nest_id()
    tornado_run_id = tornado_run_nest_id.get_value()
    timer.checkpoint('uploading tornado_run: End')

    timer.checkpoint('uploading otu_defs: Begin')
    otu_defs = biom_etl.upload_otu_defs(clients, biom_table, tornado_run_id)
    timer.checkpoint('uploading otu_defs: End')

    timer.checkpoint('uploading geno_samples: Begin')
    geno_samples = biom_etl.upload_geno_samples(clients, biom_table,
                                                tornado_run_id, otu_defs)
    timer.checkpoint('uploading geno_samples: End')

    ####################
    #Define Cohorts
    ####################
    all_cohort_tles = dict()

    cohort_configs = seed_data.get_cohort_configs()

    for cohort_config in cohort_configs:
        cohort_key = cohort_config['display_name_short']
        timer.checkpoint('uploading cohort: ' + str(cohort_key))
        tornado_sample_keys = seed_data.get_tornado_sample_keys(cohort_key)
        sample_ids = cohort_etl.tornado_sample_keys_to_nest_ids(
            tornado_sample_keys, geno_samples)
        cohort_tle = cohort_etl.upload_cohort(clients, cohort_config,
                                              sample_ids, tornado_run_id)
        all_cohort_tles[cohort_key] = cohort_tle

    ####################
    ##Define Comparisons
    ####################
    all_comparisons = list()

    comparison_configs = seed_data.get_comparison_configs()

    for comparison_config in comparison_configs:
        comparison_key = comparison_config['comparison_key']
        baseline_key = comparison_config['baseline_cohort_key']
        baseline_cohort_tle = all_cohort_tles[baseline_key]
        variant_key = comparison_config['variant_cohort_key']
        variant_cohort_tle = all_cohort_tles[variant_key]
        patient_key = comparison_config['patient_cohort_key']
        patient_cohort_tle = all_cohort_tles[patient_key]

        timer.checkpoint('fst begin for: ' + comparison_key)
        fst_results = fst_etl.get_fst_of_comparison(comparison_key, seed_data,
                                                    baseline_cohort_tle,
                                                    variant_cohort_tle,
                                                    otu_defs, geno_samples,
                                                    data_dir, host_user)

        timer.checkpoint('api upload begin: ' + comparison_key)
        comparison_tle = comparison_etl.upload_comparison(
            clients, comparison_config, baseline_cohort_tle,
            variant_cohort_tle, patient_cohort_tle, fst_results)
        timer.checkpoint('api upload done')

        all_comparisons.append(comparison_tle)

    if DO_TREE_ANALYTICS:

        ###############
        ###Cohort Node Analytics
        ###############
        #this also does the upload asap to reduce memory footprint
        cohort_analysis.compute_all_for_cohorts(clients,
                                                all_cohort_tles.values(),
                                                geno_samples,
                                                otu_defs,
                                                num_quantiles=NUM_QUANTILES,
                                                num_bins=NUM_BINS,
                                                timer=timer)

        ###############
        ###Comparison Node Analytics
        ###############
        taxonomy_empty_tree = otu_analysis.compute_taxonomy_tree(otu_defs)
        for comp in all_comparisons:
            tree = taxonomy_empty_tree.copy()
            log('begin analytics for comparison: ' +
                comp.get_value('display_name'))
            timer.checkpoint('computing comparison analytics:begin')
            comparison_analysis.compute_all(comp, otu_defs, tree)
            timer.checkpoint('computing comparison analytics:end')

            timer.checkpoint('uploading comparison analytics nodes:begin')
            comparison_tree_etl.upload_nodes(clients, comp, tree)
            timer.checkpoint('uploading comparison analytics nodes:end')

    timer.checkpoint("mmbdb_seed_job: Done")
    return exit_code
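
The cohort and comparison configs consumed above are plain dicts read from the seed data; the only keys this excerpt actually looks up are shown below, with illustrative values (the real configs likely carry more fields).

# illustrative only: keys are the ones read in the loops above,
# names and values are made up
example_cohort_config = {
    'display_name_short': 'healthy_baseline',
}

example_comparison_config = {
    'comparison_key': 'healthy_vs_patient',
    'baseline_cohort_key': 'healthy_baseline',
    'variant_cohort_key': 'patient_variant',
    'patient_cohort_key': 'patient_variant',
}
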
Example #9
def run(http_client, db_engine, data_dir, subsample, flavor_name):
    """
    http_client (NestHttpClient): an http client configured for a
        particular api server (NOT USED, db_utils reads directly from CONFIG)
    db_engine: (NOT USED, db_utils reads directly from CONFIG)
    data_dir (str): location to write data files
    subsample (bool): ignored
    flavor_name (str): ignored
    """

    # TODO fix this
    data_dir = '/'

    timer = CheckpointTimer('knoweng_seed')
    timer.checkpoint("knoweng_seed_job: Begin")
    exit_code = 0

    #will read from nest_config parameters shared with flask
    db_utils.init_crud_clients()

    network_base_dir = os.path.join(data_dir, 'networks')
    merged_networks = networks.get_merged_network_info(network_base_dir)
    all_species = networks.get_species(network_base_dir)
    collections = networks.get_collections(all_species, merged_networks)
    analysis_networks = networks.get_analysis_networks(all_species,
                                                       merged_networks)

    timer.checkpoint("Loading public gene sets")
    load_public_gene_sets(merged_networks, network_base_dir)
    timer.checkpoint("Loading species")
    load_species(all_species)
    timer.checkpoint("Loading collections")
    load_collections(collections)
    timer.checkpoint("Loading analysis networks")
    load_analysis_networks(analysis_networks)
    timer.checkpoint("knoweng_seed_job: Done")

    return exit_code
Example #10
class JobContext(object):
    """
    A context container for a job run. Provides logging, a
    tmp disk directory, a system user, etc., to a single
    run of a job.

    Here we call the code that is configured to run
    as a single standalone instance a 'job', and each
    invocation of that code a 'run'.
    """
    def __init__(self,
                 job_key,
                 container_user,
                 job_run_file_space,
                 config_jdata,
                 log_stdout=True):

        self.job_key = job_key
        self.file_space = job_run_file_space
        self.container_user = container_user
        self.logger = _setup_logger(job_key, job_run_file_space, log_stdout)
        self.success = None
        self.config_jdata = config_jdata
        self.runtime_objects = dict()
        self.timer = CheckpointTimer(job_key)
        return

    def get_job_key(self):
        return self.job_key

    def get_config_jdata(self):
        """
        Returns the JSON-style object (nested dicts, lists,
        strings, ints, and floats) associated with the job run.
        The particular data inside is not validated; it's
        whatever was in the config file or input.
        """
        return self.config_jdata

    def set_runtime_object(self, name, obj):
        self.runtime_objects[name] = obj
        return

    def runtime(self):
        return self.runtime_objects

    def get_parent_job_dir(self):
        """
        the root directory that all jobs of this run's type
        share. Normally, job runs do not write to this directory
        directly. Instead, use get_job_global_data_dir to write
        data that has a scope across more than one run.
        """
        d = self.file_space.get_parent_job_dir()
        return d

    def get_job_global_data_dir(self):
        """
        Returns a (string) filename of the directory on
        the local filesystem shared by all jobs of the
        same type. Appropriate for raw data files that
        are loaded each run and for results cached 
        between runs.
        """
        d = self.file_space.get_job_global_data_dir()
        return d

    def get_run_dir(self):
        """
        returns the root dir of the current RUN. Prefer
        to put data in the get_run_local_data_dir, which
        is inside of here.
        """
        return self.file_space.get_run_dir()

    def get_run_id(self):
        """
        a relative identifier of the run, unique per
        (job_key, vm it's running on) combination.
        """
        return self.file_space.get_run_idx()

    def get_run_local_data_dir(self):
        """
        Returns a (string) filename of the data directory
        that the current run should use for results and
        temporary disk space during the run. A subdir
        of get_run_dir().
        """
        d = self.file_space.get_run_local_data_dir()
        return d

    def get_external_run_dir(self, job_key, run_index):
        """
        get the "run_local_data_dir" of a run of any type of
        job that shares the same root directory as this job
        e.g. in wix, if you want run 23 of a job named hello_world,
        this would return:
            dn = jcx.get_external_run_dir('hello_world', 23)
            dn == 'data/wix/hello_world/run_023'

        """
        dn = self.file_space.get_external_run_dir(job_key, run_index)
        return dn

    def log(self, message):
        """
        logs a message to either a log file or stdout, 
        depending on how this JobContext is configured.
        """
        self.logger.debug(message)
        return

    def checkpoint(self, message):
        """
        logs a message, but includes the time since
        the last checkpoint and the time since the
        job was triggered.
        """
        checkpoint_msg = self.timer._make_checkpoint_message(message)
        self.log(checkpoint_msg)
        return

    def get_container_user(self):
        """
        Returns the ContainerUser that should do any system
        level (linux files and processes) work for the job.
        """
        return self.container_user

    def declare_success(self):
        """
        To be called by the job when it completes. The job
        runner will return a success exit-code to the commandline
        """
        self.success = True
        return

    def declare_failure(self):
        """
        To be called by the job when it completes. The job
        runner will return a failure exit-code to the commandline.
        """
        self.success = False
        return

    def succeeded(self):
        if self.success is None:
            self.log("WARNING: success/failure of the job run was "
                     "not declared, treating it as succeeded=False.")
        r = (self.success is True)
        return r
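
Putting Examples #4 and #10 together: a job receives a JobContext from the runner, uses it for logging, checkpoints, config, and scratch space, and declares its outcome before returning so that run() can map it to an exit code. A minimal hypothetical job body:

import os


def example_job(jcx):
    # hypothetical job: jcx is the JobContext built by the wix runner
    jcx.checkpoint('example_job: begin')
    config = jcx.get_config_jdata()
    out_fn = os.path.join(jcx.get_run_local_data_dir(), 'result.txt')
    with open(out_fn, 'w') as f:
        f.write(str(config))
    jcx.log('wrote ' + out_fn)
    if os.path.exists(out_fn):
        jcx.declare_success()
    else:
        jcx.declare_failure()
    jcx.checkpoint('example_job: end')
    return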