def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
        datasources_to_datatypes, es_hosts, es_index_gene, es_index_eco,
        es_index_efo, cache_target, cache_target_u2e, cache_target_contains,
        cache_eco, cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(
        schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=cache_target,
        gene_cache_u2e_size=cache_target_u2e,
        gene_cache_contains_size=cache_target_contains,
        eco_index=es_index_eco,
        eco_cache_size=cache_eco,
        efo_index=es_index_efo,
        efo_cache_size=cache_efo,
        efo_cache_contains_size=cache_efo_contains).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
        excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
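
# A minimal sketch of how an on_start hook like validation_on_start gets wired
# into the pipeline: the configuration arguments are pre-baked with
# functools.partial so each worker process can build its own validator and
# lookup tables (the same pattern appears in process_evidences_pipeline below).
# Every literal value here is a hypothetical placeholder, not real configuration.
import functools

validation_on_start_baked = functools.partial(
    validation_on_start,
    "https://example.org/eco_scores.tsv",        # eco_scores_uri (placeholder)
    "https://example.org/evidence.schema.json",  # schema_uri (placeholder)
    ("processed_pseudogene",),                   # excluded_biotypes (placeholder)
    {"chembl": "known_drug"},                    # datasources_to_datatypes (placeholder)
    ["http://localhost:9200"],                   # es_hosts (placeholder)
    "gene-index", "eco-index", "efo-index",      # index names (placeholders)
    1024, 1024, 1024, 1024, 1024, 1024)          # cache sizes (placeholders)
# later handed to a worker stage, e.g. pr.map(..., on_start=validation_on_start_baked)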
def score_producer_local_init(datasources_to_datatypes, dry_run,
        es_hosts, es_index_gene, es_index_eco, es_index_hpa, es_index_efo):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
        (
            LookUpDataType.DISEASE,
            LookUpDataType.TARGET,
            LookUpDataType.ECO,
            LookUpDataType.HPA
        ),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        hpa_index=es_index_hpa,
        efo_index=es_index_efo).lookup

    return scorer, lookup_data, datasources_to_datatypes, dry_run
def score_producer_local_init(
        datasources_to_datatypes,
        dry_run,
        es_hosts,
        es_index_gene,
        es_index_hpa,
        es_index_efo,
        gene_cache_size,
        hpa_cache_size,
        efo_cache_size,
        ):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=gene_cache_size,
        hpa_index=es_index_hpa,
        hpa_cache_size=hpa_cache_size,
        efo_index=es_index_efo,
        efo_cache_size=efo_cache_size).lookup

    return scorer, lookup_data, datasources_to_datatypes, dry_run
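
# Context for the *_local_init functions above: the pipeline library used in
# this module (imported as `pr`) calls on_start once per worker and feeds the
# returned tuple back into every invocation of that worker's function. A toy,
# dependency-free emulation of that contract, so the shape of the returned
# (scorer, lookup_data, datasources_to_datatypes, dry_run) tuple makes sense:
def run_stage(items, worker, on_start):
    state = on_start()                      # built once per worker
    return [worker(item, *state) for item in items]

# hypothetical worker consuming a two-element state tuple
print(run_stage([1, 2, 3], lambda x, base, step: base + x * step,
                lambda: (100, 10)))         # -> [110, 120, 130]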
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
        datasources_to_datatypes, es_hosts, es_index_gene, es_index_eco,
        es_index_efo):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(
        schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        (LookUpDataType.DISEASE, LookUpDataType.TARGET, LookUpDataType.ECO),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        efo_index=es_index_efo).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
        excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
def generate(self, es):

    # pre-load into indexed shelf dicts
    self.logger.info("Starting pre-loading")

    # create lookup tables
    self.lookup_data = LookUpDataRetriever(
        es,
        (LookUpDataType.TARGET, LookUpDataType.DISEASE),
        gene_index=self.es_index_gene,
        efo_index=self.es_index_efo).lookup

    # these are all separate files: intentional, partly because it's what the
    # ChEMBL API gives us, and partly because it is easier for partners to add
    # information to existing ChEMBL records
    # TODO potentially load these in separate processes?
    self.logger.debug("Loading molecules")
    mols = self.create_shelf_multi(self.chembl_molecule_uris, get_parent_id)
    self.logger.debug("Loaded %d molecules", len(mols))

    self.logger.debug("Loading indications")
    indications = self.create_shelf_multi(
        self.chembl_indication_uris, lambda x: x["molecule_chembl_id"])
    self.logger.debug("Loaded %d indications", len(indications))

    self.logger.debug("Loading mechanisms")
    mechanisms = self.create_shelf_multi(
        self.chembl_mechanism_uris, lambda x: x["molecule_chembl_id"])
    self.logger.debug("Loaded %d mechanisms", len(mechanisms))

    self.logger.debug("Loading targets")
    targets = self.create_shelf(
        self.chembl_target_uris, lambda x: x["target_chembl_id"])
    self.logger.debug("Loaded %d targets", len(targets))

    self.logger.info("Completed pre-loading")

    drugs = {}
    # TODO finish
    for ident in mols:
        parent_mol = None
        child_mols = []
        for mol in mols[ident]:
            if mol["molecule_chembl_id"] == ident:
                # this is the parent
                assert parent_mol is None
                parent_mol = mol
            else:
                # this is a child
                assert mol not in child_mols
                child_mols.append(mol)

        assert parent_mol is not None, ident
        # TODO make sure there is no grandparenting
        child_mols = sorted(child_mols)

        drug = self.handle_drug(ident, parent_mol,
            indications, mechanisms, targets)

        # append information from children
        for child_mol in child_mols:
            self.handle_drug_child(drug, child_mol["molecule_chembl_id"],
                child_mol, indications, mechanisms, targets)

        if "indications" in drug:
            drug["number_of_indications"] = len(drug["indications"])
        else:
            drug["number_of_indications"] = 0

        if "mechanisms_of_action" in drug:
            drug["number_of_mechanisms_of_action"] = len(
                drug["mechanisms_of_action"])
        else:
            drug["number_of_mechanisms_of_action"] = 0

        # only keep those with indications or mechanisms
        if drug["number_of_indications"] > 0 \
                or drug["number_of_mechanisms_of_action"] > 0:
            drugs[ident] = drug

    return drugs
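
# Toy illustration of the parent/child split performed in the loop above,
# with hypothetical ChEMBL ids: mols maps a parent id to every record that
# shares that parent, including the parent record itself.
mols = {"CHEMBL_PARENT": [
    {"molecule_chembl_id": "CHEMBL_PARENT"},   # the parent form
    {"molecule_chembl_id": "CHEMBL_CHILD_1"},  # e.g. a salt of the parent
]}
for ident, records in mols.items():
    parent = next(m for m in records if m["molecule_chembl_id"] == ident)
    children = [m for m in records if m["molecule_chembl_id"] != ident]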
def process_all(self, scoring_weights, is_direct_do_not_propagate,
        datasources_to_datatypes, dry_run,
        num_workers_produce, num_workers_score,
        max_queued_produce_to_score):

    lookup_data = LookUpDataRetriever(
        self.es, self.r_server,
        targets=[],
        data_types=(
            LookUpDataType.DISEASE,
            LookUpDataType.TARGET,
            LookUpDataType.ECO,
            LookUpDataType.HPA)).lookup

    targets = list(self.es_query.get_all_target_ids_with_evidence_data())

    # setup elasticsearch
    if not dry_run:
        self.es_loader.create_new_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
        self.es_loader.prepare_for_bulk_indexing(
            self.es_loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

    self.logger.info('setting up stages')

    # bake the arguments for the setup into function objects
    produce_evidence_local_init_baked = functools.partial(
        produce_evidence_local_init,
        self.es_hosts, scoring_weights,
        is_direct_do_not_propagate, datasources_to_datatypes)
    score_producer_local_init_baked = functools.partial(
        score_producer_local_init,
        self.es_hosts,
        self.redis_host, self.redis_port,
        lookup_data, datasources_to_datatypes, dry_run)

    # this doesn't need to be in the external config, since it's so
    # inconsequential as to be meaningless to tune
    max_queued_score_out = 10000

    # pipeline stage for making the lists of the target/disease pairs and evidence
    pipeline_stage = pr.flat_map(produce_evidence, targets,
        workers=num_workers_produce,
        maxsize=max_queued_produce_to_score,
        on_start=produce_evidence_local_init_baked,
        on_done=produce_evidence_local_shutdown)

    # pipeline stage for scoring the evidence sets
    # includes writing to elasticsearch
    pipeline_stage = pr.each(score_producer, pipeline_stage,
        workers=num_workers_score,
        maxsize=max_queued_score_out,
        on_start=score_producer_local_init_baked,
        on_done=score_producer_local_shutdown)

    # loop over the end of the pipeline to make sure everything is finished
    self.logger.info('stages created, running scoring and writing')
    pr.run(pipeline_stage)
    self.logger.info('stages created, ran scoring and writing')

    # cleanup elasticsearch
    if not dry_run:
        self.logger.info('flushing data to index')
        self.es_loader.flush_all_and_wait(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
        # restore old pre-load settings
        # note this automatically does all prepared indexes
        self.es_loader.restore_after_bulk_indexing()
        self.logger.info('flushed data to index')

    self.logger.info("DONE")
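
# What prepare_for_bulk_indexing/restore_after_bulk_indexing conventionally
# amount to: a hedged sketch using the stock elasticsearch-py client, not this
# project's Loader class. The usual trick is to disable refresh and replicas
# while bulk loading, then put the settings back and force a refresh. Host and
# index names below are placeholders.
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host
index = "association-data"                     # placeholder index name

es.indices.put_settings(index=index,
    body={"index": {"refresh_interval": "-1", "number_of_replicas": 0}})
# ... bulk writes happen here ...
es.indices.put_settings(index=index,
    body={"index": {"refresh_interval": "1s", "number_of_replicas": 1}})
es.indices.refresh(index=index)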
def process_evidences_pipeline(filenames, first_n, es_client, redis_client,
        dry_run, output_folder, num_workers, num_writers, max_queued_events,
        eco_scores_uri, schema_uri, es_hosts, excluded_biotypes,
        datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open, filenames))
    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    # sort the list for consistent behaviour
    checked_filenames = sorted(set(filenames) - set(failed_filenames))

    logger.info('start evidence processing pipeline')

    # load lookup tables
    lookup_data = LookUpDataRetriever(
        es_client, redis_client, [],
        (LookUpDataType.TARGET, LookUpDataType.DISEASE, LookUpDataType.ECO)).lookup

    # create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    # create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
        lookup_data, eco_scores_uri, schema_uri, excluded_biotypes,
        datasources_to_datatypes)

    writer_global_init, writer_local_init, writer_main, \
        writer_local_shutdown, writer_global_shutdown = setup_writers(
            dry_run, es_hosts, output_folder)
    if writer_global_init:
        writer_global_init()

    # here is the pipeline definition
    pl_stage = pr.map(process_evidence, evs,
        workers=num_workers,
        maxsize=max_queued_events,
        on_start=validation_on_start_baked)
    pl_stage = pr.map(writer_main, pl_stage,
        workers=num_writers,
        maxsize=max_queued_events,
        on_start=writer_local_init,
        on_done=writer_local_shutdown)

    logger.info('run evidence processing pipeline')
    results = reduce_tuple_with_sum(pr.to_iterable(pl_stage))

    # perform any single-thread cleanup
    if writer_global_shutdown:
        writer_global_shutdown()

    logger.info("results (failed: %s, succeeded: %s)", results[0], results[1])

    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))

    if not results[1]:
        raise RuntimeError("No evidence was successful!")
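
# reduce_tuple_with_sum above folds the per-evidence (failed, succeeded)
# counter tuples into overall totals. A minimal element-wise tuple sum
# consistent with that usage; the real helper is defined elsewhere in this
# codebase, so treat this as an illustrative sketch:
import functools

def reduce_tuple_with_sum(iterable):
    return functools.reduce(
        lambda a, b: tuple(x + y for x, y in zip(a, b)),
        iterable, (0, 0))

assert reduce_tuple_with_sum([(1, 0), (0, 1), (0, 1)]) == (1, 2)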
def generate(self, es):

    # pre-load into indexed shelf dicts
    self.logger.info("Starting pre-loading")

    # create lookup tables
    self.lookup_data = LookUpDataRetriever(
        es,
        gene_index=self.es_index_gene,
        gene_cache_size=self.cache_target,
        gene_cache_u2e_size=self.cache_target_u2e,
        gene_cache_contains_size=self.cache_target_contains,
        efo_index=self.es_index_efo,
        efo_cache_size=self.cache_efo,
        efo_cache_contains_size=self.cache_efo_contains).lookup

    # these are all separate files: intentional, partly because it's what the
    # ChEMBL API gives us, and partly because it is easier for partners to add
    # information to existing ChEMBL records
    # TODO potentially load these in separate processes?
    self.logger.debug("Loading molecules")
    mols = self.create_shelf_multi(self.chembl_molecule_uris, get_parent_id)
    self.logger.debug("Loaded %d molecules", len(mols))

    self.logger.debug("Loading indications")
    indications = self.create_shelf_multi(
        self.chembl_indication_uris, lambda x: x["molecule_chembl_id"])
    self.logger.debug("Loaded %d indications", len(indications))

    self.logger.debug("Loading mechanisms")
    mechanisms = self.create_shelf_multi(
        self.chembl_mechanism_uris, lambda x: x["molecule_chembl_id"])
    self.logger.debug("Loaded %d mechanisms", len(mechanisms))

    self.logger.debug("Loading targets")
    targets = self.create_shelf(
        self.chembl_target_uris, lambda x: x["target_chembl_id"])
    self.logger.debug("Loaded %d targets", len(targets))

    adverse_events = self.create_shelf_multi_csv(self.adverse_events_uris,
        "chembl_id", csv.excel)
    self.logger.debug("Loaded %d adverse events", len(adverse_events))

    # technically these can be duplicates, e.g. CHEMBL1236107
    drugbank_ids = self.create_shelf_multi_csv(self.drugbank_uris,
        "From src:'1'", csv.excel_tab)
    self.logger.debug("Loaded %d drugbank ids", len(drugbank_ids))

    self.logger.info("Completed pre-loading")

    drugs = {}
    # TODO finish
    for ident in mols:  # all keys in mols
        parent_mol = None
        child_mols = []

        # 1. Set parent mol and list of children
        for mol in mols[ident]:
            mol["molecule_chembl_id"] = self.str_hook(
                mol["molecule_chembl_id"])
            if mol["molecule_chembl_id"] == ident:
                # this is the parent
                assert parent_mol is None
                parent_mol = mol
            else:
                # this is a child
                assert mol not in child_mols
                child_mols.append(mol)

        # ToDo: check with AF
        assert parent_mol is not None, ident
        # TODO make sure there is no grandparenting
        child_mols = sorted(child_mols,
            key=lambda x: x["molecule_chembl_id"])

        drug = self.handle_drug(ident, parent_mol,
            indications, mechanisms, targets,
            adverse_events, drugbank_ids)

        # append information from children
        for child_mol in child_mols:
            self.handle_drug_child(drug, child_mol["molecule_chembl_id"],
                child_mol, indications, mechanisms, targets,
                adverse_events, drugbank_ids)

        if "indications" in drug:
            drug["number_of_indications"] = len(drug["indications"])

            # build a summary of therapeutic areas covered by indications
            # TODO avoid repeat EFO lookup by doing it inside handle_indication()
            indication_therapeutic_areas = defaultdict(int)
            for indication in drug["indications"]:
                efo_id = indication["efo_id"]
                stored_efo = self.lookup_data.available_efos.get_efo(efo_id)
                if "therapeutic_codes" in stored_efo \
                        and "therapeutic_labels" in stored_efo:
                    for ta_code, ta_label in zip(
                            stored_efo["therapeutic_codes"],
                            stored_efo["therapeutic_labels"]):
                        indication_therapeutic_areas[ta_code, ta_label] += 1

            drug["indication_therapeutic_areas"] = []
            for (ta_code, ta_label), value in sorted(
                    indication_therapeutic_areas.items(),
                    key=lambda x: x[1], reverse=True):
                indication_therapeutic_area = {}
                indication_therapeutic_area["therapeutic_code"] = ta_code
                indication_therapeutic_area["therapeutic_label"] = ta_label
                indication_therapeutic_area["count"] = value
                drug["indication_therapeutic_areas"].append(
                    indication_therapeutic_area)
            drug["indication_therapeutic_areas"] = tuple(
                drug["indication_therapeutic_areas"])
        else:
            drug["number_of_indications"] = 0

        if "mechanisms_of_action" in drug:
            drug["number_of_mechanisms_of_action"] = len(
                drug["mechanisms_of_action"])
        else:
            drug["number_of_mechanisms_of_action"] = 0

        # Aggregate indication refs; empty array if no indications present.
        drug["indication_refs"] = self.generateAggregatedIndicationRefs(drug)

        # only keep those with indications or mechanisms
        if drug["number_of_indications"] == 0 \
                and drug["number_of_mechanisms_of_action"] == 0:
            continue
        drugs[ident] = drug

    return drugs
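
# Toy version of the therapeutic-area tally built above: count each
# (code, label) pair across indications, then order them by descending count.
# Codes and labels here are hypothetical.
from collections import defaultdict

tally = defaultdict(int)
for code, label in [("TA:01", "oncology"), ("TA:01", "oncology"),
                    ("TA:02", "immunology")]:
    tally[code, label] += 1
ordered = sorted(tally.items(), key=lambda kv: kv[1], reverse=True)
# -> [(('TA:01', 'oncology'), 2), (('TA:02', 'immunology'), 1)]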