class DataDrivenRelationProcess(object):

    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run, 
            ddr_workers_production,
            ddr_workers_score,
            ddr_queue_production_score,
            ddr_queue_score_result):
        start_time = time.time()

        target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

        self.logger.info('retrieved all the associations data in %i s', time.time() - start_time)
        self.logger.info('target data length: %s, size in memory: %f KB', len(target_data), sys.getsizeof(target_data) / 1024.)
        self.logger.info('disease data length: %s, size in memory: %f KB', len(disease_data), sys.getsizeof(disease_data) / 1024.)
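        #note: sys.getsizeof measures only the dict object itself, not the
        #vectors it references, so these figures understate real memory use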

        #sort the key lists once and use the same ordering in every later step
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
        disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]
        self.logger.info('getting target labels')
        target_id_to_label = self.es_query.get_target_labels(target_keys)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        #setup elasticsearch
        self.loader = Loader(self.es, dry_run=dry_run)
        if not dry_run:
            #need to directly get the versioned index name for this function
            self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            self.loader.prepare_for_bulk_indexing(self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))
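            #prepare_for_bulk_indexing presumably relaxes index settings
            #(e.g. refresh interval) for faster writes; they are restored by
            #restore_after_bulk_indexing() at the end of this method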


        #calculate and store disease-to-disease relations in multiple processes
        self.logger.info('handling disease-to-disease')
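        #the positional arguments 0.19 and 1024 below are presumably a minimum
        #similarity threshold and a processing chunk size (the same applies to
        #the target-to-target call further down); conceptually, a hypothetical
        #sketch of what the stage computes:
        #
        #   for i, key_a in enumerate(disease_keys):
        #       for key_b in disease_keys[i + 1:]:
        #           score = similarity(disease_data[key_a], disease_data[key_b])
        #           if score >= 0.19:
        #               store_relation(key_a, key_b, score)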
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data, disease_keys, 
            target_keys, 0.19, 1024, self.loader, dry_run, 
            ddr_workers_production, ddr_workers_score, 
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled disease-to-disease')

        #calculate and store target-to-target relations in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data, target_keys, 
            disease_keys, 0.19, 1024, self.loader, dry_run, 
            ddr_workers_production, ddr_workers_score, 
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled target-to-target')

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
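
# A hypothetical invocation of the class above (argument values are
# illustrative only, not taken from this code):
#
#   es = new_es_client(["http://localhost:9200"])
#   ddr = DataDrivenRelationProcess(es)
#   ddr.process_all(dry_run=True,
#                   ddr_workers_production=4, ddr_workers_score=4,
#                   ddr_queue_production_score=1000,
#                   ddr_queue_score_result=1000)
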
class ScoringProcess(object):
    def __init__(self, redis_host, redis_port, es_hosts):

        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run, num_workers_produce,
                    num_workers_score, max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es,
            self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)
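        #functools.partial pre-binds the leading arguments, so the pipeline's
        #on_start hooks can be invoked without re-passing configuration, e.g.:
        #
        #   init = functools.partial(f, a, b)
        #   init()  # equivalent to f(a, b)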

        #this doesn't need to be in the external config: each queued item is so
        #lightweight that the exact value here barely matters
        max_queued_score_out = 10000
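        #maxsize bounds a stage's input queue, providing backpressure: when
        #consumers lag, producers block instead of filling up memory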

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence,
            targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(score_producer,
                                 pipeline_stage,
                                 workers=num_workers_score,
                                 maxsize=max_queued_score_out,
                                 on_start=score_producer_local_init_baked,
                                 on_done=score_producer_local_shutdown)
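        #assuming `pr` is pypeln's process module, the shape of this pipeline
        #is (toy example with hypothetical payloads):
        #
        #   stage = pr.flat_map(lambda x: range(x), [1, 2, 3], workers=2)
        #   stage = pr.each(print, stage, workers=2)
        #   pr.run(stage)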

        #consume the end of the pipeline so that every stage runs to completion
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('finished scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    """
    Run a series of QC tests on EFO elasticsearch index. Returns a dictionary
    of string test names and result objects
    """

    def qc(self, esquery):

        #number of eco entries
        association_count = 0
        #Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        #put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics
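
# A hypothetical QC invocation (names and values assumed for illustration):
#
#   sp = ScoringProcess(redis_host="localhost", redis_port=6379,
#                       es_hosts=["http://localhost:9200"])
#   metrics = sp.qc(ESQuery(sp.es))
#   assert metrics["association.count"] > 0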