# NOTE: stdlib imports for this excerpt; the project-internal names used below
# (ESQuery, Loader, Const, RelationType, handle_pairs, new_es_client,
# new_redis_client, LookUpDataRetriever, LookUpDataType, pr and the
# produce_evidence*/score_producer* helpers) are assumed to be imported
# elsewhere in this module.
import functools
import logging
import sys
import time


class DataDrivenRelationProcess(object):

    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run,
            ddr_workers_production, ddr_workers_score,
            ddr_queue_production_score, ddr_queue_score_result):
        start_time = time.time()

        target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

        self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
        self.logger.info('target data length: %s size in memory: %f Kb' % (len(target_data), sys.getsizeof(target_data) / 1024.))
        self.logger.info('disease data length: %s size in memory: %f Kb' % (len(disease_data), sys.getsizeof(disease_data) / 1024.))

        # sort the lists and always keep the same order in all the steps
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
        disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]

        self.logger.info('getting target labels')
        target_id_to_label = self.es_query.get_target_labels(target_keys)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        # setup elasticsearch
        self.loader = Loader(self.es, dry_run=dry_run)
        if not dry_run:
            # need to directly get the versioned index name for this function
            self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))

        # calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
            disease_keys, target_keys, 0.19, 1024, self.loader, dry_run,
            ddr_workers_production, ddr_workers_score,
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled disease-to-disease')

        # calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
            target_keys, disease_keys, 0.19, 1024, self.loader, dry_run,
            ddr_workers_production, ddr_workers_score,
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled target-to-target')

        # cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            # restore old pre-load settings
            # note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
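
# Usage sketch (illustrative only, not part of the original module): the host
# URL, worker counts and queue sizes are placeholder values; in the real
# pipeline they come from its configuration, and new_es_client is the same
# helper ScoringProcess below uses. dry_run=True keeps the run read-only.
#
#   es = new_es_client(['http://localhost:9200'])
#   ddr = DataDrivenRelationProcess(es)
#   ddr.process_all(dry_run=True,
#       ddr_workers_production=4,
#       ddr_workers_score=4,
#       ddr_queue_production_score=1000,
#       ddr_queue_score_result=1000)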
class ScoringProcess(object):

    def __init__(self, redis_host, redis_port, es_hosts):
        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
            datasources_to_datatypes, dry_run,
            num_workers_produce, num_workers_score,
            max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(self.es, self.r_server,
            targets=[],
            data_types=(
                LookUpDataType.DISEASE,
                LookUpDataType.TARGET,
                LookUpDataType.ECO,
                LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        # setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        # bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(produce_evidence_local_init,
            self.es_hosts, scoring_weights, is_direct_do_not_propagate,
            datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(score_producer_local_init,
            self.es_hosts, self.redis_host, self.redis_port,
            lookup_data, datasources_to_datatypes, dry_run)

        # this doesn't need to be in the external config, since it is so
        # content-light as to be meaningless
        max_queued_score_out = 10000

        # pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(produce_evidence, targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        # pipeline stage for scoring the evidence sets
        # includes writing to elasticsearch
        pipeline_stage = pr.each(score_producer, pipeline_stage,
            workers=num_workers_score,
            maxsize=max_queued_score_out,
            on_start=score_producer_local_init_baked,
            on_done=score_producer_local_shutdown)

        # loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        # cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            # restore old pre-load settings
            # note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    def qc(self, esquery):
        """
        Run a series of QC tests on the association Elasticsearch index.
        Returns a dictionary of string test names and result objects.
        """
        # number of association entries
        association_count = 0
        # Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        # put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count
        return metrics
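
# Minimal sketch of the two-stage producer/consumer pattern process_all builds
# above, assuming `pr` is pypeln's process module (the exact import is not
# shown in this excerpt, so `import pypeln.process as pr` is an assumption).
# The stage functions are illustrative stand-ins for the module's real
# produce_evidence/score_producer workers; the keyword arguments mirror the
# calls made in process_all.
#
#   import pypeln.process as pr
#
#   def produce(item):
#       # first stage: fan each input out into several work units
#       for i in range(3):
#           yield (item, i)
#
#   def consume(unit):
#       # second stage: side-effecting sink; the real score_producer
#       # writes scored associations to Elasticsearch
#       print(unit)
#
#   stage = pr.flat_map(produce, range(10), workers=2, maxsize=100)
#   stage = pr.each(consume, stage, workers=2, maxsize=100)
#   pr.run(stage)  # drain the pipeline and block until all work is done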