class QCRunner(object):

    def __init__(self, es):
        self.es = es
        self.esquery = ESQuery(es)
        self._logger = logging.getLogger(__name__ + ".QCRunner")

    def run_associationQC(self):
        self.run_evidence2associationQC()

    def run_evidence2associationQC(self):
        computed_associations_ids = set(self.esquery.get_all_associations_ids())
        missing_associations_ids = set()
        total_evidence = self.esquery.count_elements_in_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME + '*')
        self._logger.info('Starting to analyse %i evidence' % total_evidence)
        for as_id in self.esquery.get_all_target_disease_pair_from_evidence():
            if as_id not in computed_associations_ids:
                self._logger.error(
                    'Association id %s was not computed or stored' % as_id)
                missing_associations_ids.add(as_id)
        if missing_associations_ids:
            self._logger.error('%i associations not found' % len(missing_associations_ids))
            self._logger.error('\n'.join(list(missing_associations_ids)))
        else:
            self._logger.info('no missing associations found')
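# A minimal usage sketch for QCRunner, assuming a reachable Elasticsearch
# node that already holds the evidence and association indexes; the host URL
# below is a placeholder, not a value taken from the real configuration.
def _example_qc_runner():
    from elasticsearch import Elasticsearch
    es = Elasticsearch(['http://localhost:9200'])  # placeholder host
    runner = QCRunner(es)
    #logs every target-disease pair seen in evidence that has no
    #computed association document
    runner.run_associationQC()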
def merge_data(self, genes, loader, r_server, data_config):
    esquery = ESQuery(loader.es)
    try:
        count = esquery.count_elements_in_index(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
    except NotFoundError as ex:
        self._logger.error(
            'no Ensembl index in ES. Skipping. Has the --ensembl step been run? '
            'Are you pointing to the correct index? %s' % ex)
        raise ex

    for row in esquery.get_all_ensembl_genes():
        if row['id'] in genes:
            gene = genes.get_gene(row['id'])
            gene.load_ensembl_data(row)
            genes.add_gene(gene)
        else:
            gene = Gene()
            gene.load_ensembl_data(row)
            genes.add_gene(gene)

    self._clean_non_reference_genes(genes)

    self._logger.info("STATS AFTER ENSEMBL PARSING:\n" + genes.get_stats())
def __init__(self, loader, r_server):
    self.loader = loader
    self.esquery = ESQuery(loader.es)
    self.r_server = r_server
    self.logger = logging.getLogger(__name__)
    '''define data processing handlers'''
    self.data_handlers = defaultdict(lambda: SearchObject)
    self.data_handlers[SearchObjectTypes.TARGET] = SearchObjectTarget
    self.data_handlers[SearchObjectTypes.DISEASE] = SearchObjectDisease
def __init__(self, redis_host, redis_port, es_hosts):
    self.logger = logging.getLogger(__name__)

    self.es_hosts = es_hosts
    self.es = new_es_client(self.es_hosts)
    self.es_loader = Loader(self.es)
    self.es_query = ESQuery(self.es)

    self.redis_host = redis_host
    self.redis_port = redis_port
    self.r_server = new_redis_client(self.redis_host, self.redis_port)
def __init__(self, es=None, namespace=None, r_server=None,
             ttl=60 * 60 * 24 * 7):  # cache TTL: one week
    self._es = es
    self.r_server = r_server
    self._es_query = ESQuery(self._es)
    self._table = RedisLookupTablePickle(namespace=namespace,
                                         r_server=self.r_server,
                                         ttl=ttl)
    self._logger = logging.getLogger(__name__)
    if self.r_server is not None:
        self._load_efo_data(r_server)
def __init__(self, es=None, namespace=None, r_server=None,
             ttl=60 * 60 * 24 * 7,  # cache TTL: one week
             targets=[], autoload=True):
    self._logger = logging.getLogger(__name__)
    self._es = es
    self.r_server = r_server
    self._es_query = ESQuery(self._es)
    self._table = RedisLookupTablePickle(namespace=namespace,
                                         r_server=self.r_server,
                                         ttl=ttl)
    self.uniprot2ensembl = {}
    if self.r_server and autoload:
        self.load_gene_data(self.r_server, targets)
class HPALookUpTable(object):
    """
    A redis-based pickable hpa look up table using gene id as table id
    """

    def __init__(self, es=None, namespace=None, r_server=None,
                 ttl=60 * 60 * 24 * 7):  # cache TTL: one week
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self._logger = logging.getLogger(__name__)
        if self.r_server:
            self._load_hpa_data(self.r_server)

    def _load_hpa_data(self, r_server=None):
        for el in self._es_query.get_all_hpa():
            self.set_hpa(el, r_server=self._get_r_server(r_server))

    def get_hpa(self, idx, r_server=None):
        return self._table.get(idx, r_server=self._get_r_server(r_server))

    def set_hpa(self, hpa, r_server=None):
        self._table.set(hpa['gene'], hpa,
                        r_server=self._get_r_server(r_server))

    def get_available_hpa_ids(self, r_server=None):
        return self._table.keys(self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        return self._table.__contains__(key,
                                        r_server=self._get_r_server(r_server))

    def __getitem__(self, key, r_server=None):
        return self.get_hpa(key, r_server=self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, r_server=self._get_r_server(r_server))

    def keys(self, r_server=None):
        return self._table.keys(self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server
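# A minimal sketch of how HPALookUpTable is typically consumed, assuming a
# populated HPA index in Elasticsearch and a running Redis server; the
# namespace and the Ensembl gene id below are placeholder assumptions.
def _example_hpa_lookup(es, r_server):
    table = HPALookUpTable(es=es, namespace='hpa', r_server=r_server)
    gene_id = 'ENSG00000157764'  # example Ensembl gene id (BRAF)
    if gene_id in table:
        return table[gene_id]  # the dict stored by set_hpa(), keyed on 'gene'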
class ReactomeRetriever():
    """
    Will retrieve a Reactome object from the processed json stored in
    elasticsearch, caching each reaction locally after the first lookup
    """

    def __init__(self, es):
        self.es_query = ESQuery(es)
        self._cache = {}
        self.logger = logging.getLogger(__name__)

    def get_reaction(self, reaction_id):
        if reaction_id not in self._cache:
            reaction = ReactomeNode()
            reaction.load_json(self.es_query.get_reaction(reaction_id))
            self._cache[reaction_id] = reaction
        return self._cache[reaction_id]
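# Usage sketch (assumed wiring): repeated lookups of the same reaction id are
# served from the in-memory cache instead of hitting Elasticsearch again. The
# reaction id is an illustrative Reactome stable identifier.
def _example_reactome(es):
    retriever = ReactomeRetriever(es)
    first = retriever.get_reaction('R-HSA-109582')   # fetched from ES
    second = retriever.get_reaction('R-HSA-109582')  # served from _cache
    assert first is second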
def __init__(self, es):
    self.es = es
    self.esquery = ESQuery(es)
    self._logger = logging.getLogger(__name__ + ".QCRunner")
class ECOLookUpTable(object):
    """
    A redis-based pickable eco look up table
    """

    def __init__(self, es, namespace=None, r_server=None,
                 ttl=60 * 60 * 24 * 7):  # cache TTL: one week
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=r_server,
                                             ttl=ttl)
        self._es = es
        self._es_query = ESQuery(es)
        self.r_server = r_server
        self._logger = logging.getLogger(__name__)
        if r_server is not None:
            self._load_eco_data(r_server)

    @staticmethod
    def get_ontology_code_from_url(url):
        #note, this is not a guaranteed solution
        #to do it properly, it has to be from the actual
        #ontology file or from OLS API
        if '/' in url:
            return url.split('/')[-1]
        else:
            #assume it is already a short code
            return url

    def _load_eco_data(self, r_server=None):
        #TODO can be improved by sending elements in batches
        for eco in self._es_query.get_all_eco():
            self._table.set(self.get_ontology_code_from_url(eco['code']), eco,
                            r_server=self._get_r_server(r_server))

    def get_eco(self, eco_id, r_server=None):
        return self._table.get(eco_id, r_server=self._get_r_server(r_server))

    def set_eco(self, eco, r_server=None):
        self._table.set(self.get_ontology_code_from_url(eco['code']), eco,
                        r_server=self._get_r_server(r_server))

    def get_available_eco_ids(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        return self._table.__contains__(key,
                                        r_server=self._get_r_server(r_server))

    def __getitem__(self, key, r_server=None):
        return self.get_eco(key, r_server=self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, r_server=self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server

    def keys(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))
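# Quick illustration of the URL-to-code helper above: the short code is
# simply the last path segment, so an OBO purl and a bare code map to the
# same table key. The ECO id is an example term.
def _example_eco_code():
    url = 'http://purl.obolibrary.org/obo/ECO_0000205'
    assert ECOLookUpTable.get_ontology_code_from_url(url) == 'ECO_0000205'
    assert ECOLookUpTable.get_ontology_code_from_url('ECO_0000205') == 'ECO_0000205'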
def __init__(self, es, filename, datasources_to_datatypes):
    self.logger = logging.getLogger(__name__)
    self.esquery = ESQuery(es)
    self.filename = filename
    self.datasources_to_datatypes = datasources_to_datatypes
def produce_evidence_local_init(es_hosts, scoring_weights,
                                is_direct_do_not_propagate,
                                datasources_to_datatypes):
    es = new_es_client(es_hosts)
    es_query = ESQuery(es)
    return (es_query, scoring_weights, is_direct_do_not_propagate,
            datasources_to_datatypes)
class ScoringProcess():

    def __init__(self, redis_host, redis_port, es_hosts):
        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run,
                    num_workers_produce, num_workers_score,
                    max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es, self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)

        #this doesn't need to be in the external config, since it is so
        #content-light as to be meaningless
        max_queued_score_out = 10000

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence, targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(
            score_producer, pipeline_stage,
            workers=num_workers_score,
            maxsize=max_queued_score_out,
            on_start=score_producer_local_init_baked,
            on_done=score_producer_local_shutdown)

        #loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')
        self.logger.info("DONE")

    def qc(self, esquery):
        """
        Run a series of QC tests on the association Elasticsearch index.
        Returns a dictionary of string test names and result objects.
        """
        #number of association entries
        association_count = 0
        #Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        #put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics
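# A minimal sketch of the "baking" pattern used in process_all above:
# functools.partial pre-binds per-run arguments so the pipeline library only
# has to call a no-argument initializer in each worker. The initializer, its
# arguments, and the host/weights values here are illustrative, not the real
# ones.
def _example_baking():
    import functools

    def worker_init(es_hosts, weights):
        #runs once per worker; the returned tuple becomes worker-local state
        return new_es_client(es_hosts), weights

    baked = functools.partial(worker_init,
                              ['http://localhost:9200'],  # placeholder hosts
                              {'datasource': 1.0})        # placeholder weights
    return baked  # the pipeline can now invoke baked() with no arguments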
class Metrics:

    def __init__(self, es, filename, datasources_to_datatypes):
        self.logger = logging.getLogger(__name__)
        self.esquery = ESQuery(es)
        self.filename = filename
        self.datasources_to_datatypes = datasources_to_datatypes

    def generate_metrics(self):
        self.logger.info("Producing data release metrics")

        count_drug_w_evidence = self.esquery.count_drug_w_evidence()
        count_entity_w_association = self.esquery.count_entity_w_association()
        count_target_w_symbol = self.esquery.count_target_w_symbol()
        count_target_w_mp = self.esquery.count_target_w_mp()
        count_target_w_hallmark = self.esquery.count_target_w_hallmark()
        count_target_w_biomarker = self.esquery.count_target_w_biomarker()
        count_BRAF_evidence = self.esquery.count_BRAF_evidence()
        count_withdrawn_drug_evidence = self.esquery.count_withdrawn_drug_evidence()
        count_trinucleotide_evidence = self.esquery.count_trinucleotide_evidence()
        count_datatype_evidence = self.esquery.count_datatype_evidence()
        count_datatype_association = self.esquery.count_datatype_association()

        with open(self.filename, 'w') as metrics_output:
            metrics_output.write(
                "drugs(unique) with evidence:\t" +
                str(count_drug_w_evidence['aggregations']['general_drug']['value']) + "\n" +
                "diseases(unique) with association:\t" +
                str(count_entity_w_association['aggregations']['general_disease']['value']) + "\n" +
                "targets(unique) with association:\t" +
                str(count_entity_w_association['aggregations']['general_target']['value']) + "\n" +
                "targets with approved symbol:\t" +
                str(count_target_w_symbol['hits']['total']) + "\n" +
                "targets with mouse phenotype:\t" +
                str(count_target_w_mp['hits']['total']) + "\n" +
                "targets with cancer hallmark:\t" +
                str(count_target_w_hallmark['hits']['total']) + "\n" +
                "targets with cancer biomarker:\t" +
                str(count_target_w_biomarker['hits']['total']) + "\n" +
                "evidence link to BRAF:\t" +
                str(count_BRAF_evidence['hits']['total']) + "\n" +
                "evidence link to withdrawn drug:\t" +
                str(count_withdrawn_drug_evidence['hits']['total']) + "\n" +
                "evidence link to trinucleotide expansion:\t" +
                str(count_trinucleotide_evidence['hits']['total']) + "\n")

            for ds in self.datasources_to_datatypes.iterkeys():
                count_datasource_evidence = self.esquery.count_datasource_evidence(ds)
                metrics_output.write(
                    "evidence from datasource " + ds + ":\t" +
                    str(count_datasource_evidence['hits']['total']) + "\n")

            for item in count_datatype_evidence['aggregations']['datatypes']['buckets']:
                datatype = item['key']
                evidence_count = item['doc_count']
                metrics_output.write("evidence from datatype " + datatype +
                                     ":\t" + str(evidence_count) + "\n")

            for item in count_datatype_association['aggregations']['datatypes']['buckets']:
                datatype = item['key']
                association_count = item['doc_count']
                metrics_output.write("association from datatype " + datatype +
                                     ":\t" + str(association_count) + "\n")

        self.logger.info("Producing data release metrics - Completed")
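# Usage sketch: Metrics is a one-shot helper; calling generate_metrics()
# writes tab-separated "<label>:\t<count>" rows to the given file. The file
# name, the datasource mapping, and the sample counts shown are placeholder
# assumptions.
def _example_metrics(es):
    datasources_to_datatypes = {'chembl': 'known_drug'}  # illustrative mapping
    Metrics(es, 'release_metrics.tsv', datasources_to_datatypes).generate_metrics()
    #produces rows such as:
    #  drugs(unique) with evidence:    6123
    #  evidence from datasource chembl:        301000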
class DataDrivenRelationProcess(object):

    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run, ddr_workers_production, ddr_workers_score,
                    ddr_queue_production_score, ddr_queue_score_result):
        start_time = time.time()

        target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

        self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
        self.logger.info('target data length: %s size in memory: %f Kb' % (
            len(target_data), sys.getsizeof(target_data) / 1024.))
        self.logger.info('disease data length: %s size in memory: %f Kb' % (
            len(disease_data), sys.getsizeof(disease_data) / 1024.))

        '''sort the lists and always use the same order in all the steps'''
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
        disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]
        self.logger.info('getting target labels')
        target_id_to_label = self.es_query.get_target_labels(target_keys)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        #setup elasticsearch
        self.loader = Loader(self.es, dry_run=dry_run)
        if not dry_run:
            #need to directly get the versioned index name for this function
            self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))

        #calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
                     disease_keys, target_keys, 0.19, 1024, self.loader,
                     dry_run, ddr_workers_production, ddr_workers_score,
                     ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled disease-to-disease')

        #calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
                     target_keys, disease_keys, 0.19, 1024, self.loader,
                     dry_run, ddr_workers_production, ddr_workers_score,
                     ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled target-to-target')

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
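# Usage sketch with made-up worker and queue sizes (the real values come from
# the CLI arguments wired up in main()); dry_run=True exercises the relation
# computation without writing the relation index.
def _example_ddr(es):
    process = DataDrivenRelationProcess(es)
    process.process_all(dry_run=True,
                        ddr_workers_production=4,
                        ddr_workers_score=4,
                        ddr_queue_production_score=1000,
                        ddr_queue_score_result=1000)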
class GeneLookUpTable(object):
    """
    A redis-based pickable gene look up table
    """

    def __init__(self, es=None, namespace=None, r_server=None,
                 ttl=60 * 60 * 24 * 7,  # cache TTL: one week
                 targets=[], autoload=True):
        self._logger = logging.getLogger(__name__)
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self.uniprot2ensembl = {}
        if self.r_server and autoload:
            self.load_gene_data(self.r_server, targets)

    def load_gene_data(self, r_server=None, targets=[]):
        data = None
        if targets:
            data = self._es_query.get_targets_by_id(targets)
            total = len(targets)
        if data is None:
            data = self._es_query.get_all_targets()
            total = self._es_query.count_all_targets()
        for target in data:
            #TODO can be improved by sending elements in batches
            self._table.set(target['id'], target,
                            r_server=self._get_r_server(r_server))
            if target['uniprot_id']:
                self.uniprot2ensembl[target['uniprot_id']] = target['id']
            for accession in target['uniprot_accessions']:
                self.uniprot2ensembl[accession] = target['id']

    def get_gene(self, target_id, r_server=None):
        try:
            return self._table.get(target_id,
                                   r_server=self._get_r_server(r_server))
        except KeyError:
            try:
                target = self._es_query.get_objects_by_id(
                    target_id,
                    Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME,
                    Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
                    source_exclude='ortholog.*').next()
            except Exception as e:
                self._logger.exception('Cannot retrieve target from elasticsearch')
                raise KeyError(target_id)
            self.set_gene(target, r_server)
            return target

    def set_gene(self, target, r_server=None):
        self._table.set(target['id'], target,
                        r_server=self._get_r_server(r_server))

    def get_available_gene_ids(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        if self._table.__contains__(key, r_server=self._get_r_server(r_server)):
            return True
        #fall back to elasticsearch when the key is not cached in redis
        return self._es_query.exists(
            index=Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME,
            doc_type=Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
            id=key)

    def __getitem__(self, key, r_server=None):
        return self.get_gene(key, self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, self._get_r_server(r_server))

    def __missing__(self, key):
        self._logger.warning('missing key %s', key)

    def keys(self, r_server=None):
        return self._table.keys(self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server
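# A minimal sketch of the UniProt-to-Ensembl mapping the table builds as a
# side effect of loading; the namespace and ids below are placeholders for
# illustration.
def _example_gene_lookup(es, r_server):
    table = GeneLookUpTable(es=es, namespace='genes', r_server=r_server)
    uniprot_id = 'P15056'  # example UniProt accession (BRAF)
    if uniprot_id in table.uniprot2ensembl:
        ensembl_id = table.uniprot2ensembl[uniprot_id]
        return table.get_gene(ensembl_id)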
def __init__(self, es):
    self.es = es
    self.es_query = ESQuery(self.es)
    self.logger = logging.getLogger(__name__)
class EFOLookUpTable(object):
    """
    A redis-based pickable efo look up table.
    Allows grabbing the EFO data saved in ES and loading it into
    memory/redis so that it can be accessed quickly from multiple
    processes, reducing memory usage by sharing.
    """

    def __init__(self, es=None, namespace=None, r_server=None,
                 ttl=60 * 60 * 24 * 7):  # cache TTL: one week
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self._logger = logging.getLogger(__name__)
        if self.r_server is not None:
            self._load_efo_data(r_server)

    @staticmethod
    def get_ontology_code_from_url(url):
        #note, this is not a guaranteed solution
        #to do it properly, it has to be from the actual
        #ontology file or from OLS API
        if '/' in url:
            return url.split('/')[-1]
        else:
            #assume it is already a short code
            return url

    def _load_efo_data(self, r_server=None):
        #TODO can be improved by sending elements in batches
        for i, efo in enumerate(self._es_query.get_all_diseases()):
            self.set_efo(efo, r_server=self._get_r_server(r_server))
            if i % 1000 == 0:
                self._logger.debug("Loaded %s efo", i)

    def get_efo(self, efo_id, r_server=None):
        return self._table.get(efo_id, r_server=self._get_r_server(r_server))

    def set_efo(self, efo, r_server=None):
        #the short disease code is the last element of the first path
        efo_key = efo['path_codes'][0][-1]
        self._table.set(efo_key, efo, r_server=self._get_r_server(r_server))

    def get_available_efo_ids(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        return self._table.__contains__(key,
                                        r_server=self._get_r_server(r_server))

    def __getitem__(self, key, r_server=None):
        return self.get_efo(key, r_server=self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, r_server=self._get_r_server(r_server))

    def keys(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server
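# The table is keyed by the short disease code, i.e. the last element of the
# first path in 'path_codes' (see set_efo above). A minimal sketch with a
# made-up document:
def _example_efo_key():
    efo = {'path_codes': [['EFO_0000408', 'EFO_0000270']]}  # illustrative paths
    assert efo['path_codes'][0][-1] == 'EFO_0000270'  # the key set_efo would use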
def __init__(self, es):
    self.es_query = ESQuery(es)
    self._cache = {}
    self.logger = logging.getLogger(__name__)
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config,
                                      disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    if not args.release_tag:
        logger.error('A [release-tag] has to be specified.')
        print('A [release-tag] has to be specified.', file=sys.stderr)
        return 1
    else:
        Config.RELEASE_VERSION = args.release_tag
        logger.info('setting release version %s' % Config.RELEASE_VERSION)

    with RedisManager(args.redis_remote, args.redis_host, args.redis_port):

        es = new_es_client(args.elasticseach_nodes)
        redis = new_redis_client(args.redis_host, args.redis_port)

        #create a single query object for future use
        esquery = ESQuery(es)

        #read the data configuration
        data_config = mrtarget.cfg.get_data_config(args.data_config)

        #create something to accumulate qc metrics into over various steps
        qc_metrics = QCMetrics()

        with Loader(es,
                    chunk_size=ElasticSearchConfiguration.bulk_load_chunk,
                    dry_run=args.dry_run) as loader:

            if args.rea:
                process = ReactomeProcess(loader,
                                          data_config.reactome_pathway_data,
                                          data_config.reactome_pathway_relation)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.ens:
                process = EnsemblProcess(loader)
                if not args.qc_only:
                    process.process(data_config.ensembl_filename, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.unic:
                process = UniprotDownloader(loader)
                if not args.qc_only:
                    process.process(data_config.uniprot_uri, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.hpa:
                process = HPAProcess(loader, redis, args.elasticseach_nodes,
                                     data_config.tissue_translation_map,
                                     data_config.tissue_curation_map,
                                     data_config.hpa_normal_tissue,
                                     data_config.hpa_rna_level,
                                     data_config.hpa_rna_value,
                                     data_config.hpa_rna_zscore)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.gen:
                process = GeneManager(loader, redis,
                                      args.gen_plugin_places,
                                      data_config.gene_data_plugin_names)
                if not args.qc_only:
                    process.merge_all(data_config, dry_run=args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.efo:
                process = EfoProcess(loader, data_config.ontology_efo,
                                     data_config.ontology_hpo,
                                     data_config.ontology_mp,
                                     data_config.disease_phenotype)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.eco:
                process = EcoProcess(loader, data_config.ontology_eco,
                                     data_config.ontology_so)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.val:
                es_output_folder = None
                if "elasticsearch_folder" in vars(args) and args.elasticsearch_folder is not None:
                    es_output_folder = args.elasticsearch_folder

                process_evidences_pipeline(
                    filenames=data_config.input_file,
                    first_n=args.val_first_n,
                    es_client=es,
                    redis_client=redis,
                    dry_run=args.dry_run,
                    output_folder=es_output_folder,
                    num_workers=args.val_workers_validator,
                    num_writers=args.val_workers_writer,
                    max_queued_events=args.val_queue_validator_writer,
                    eco_scores_uri=data_config.eco_scores,
                    schema_uri=data_config.schema,
                    es_hosts=args.elasticseach_nodes,
                    excluded_biotypes=data_config.excluded_biotypes,
                    datasources_to_datatypes=data_config.datasources_to_datatypes)

                #TODO qc
            if args.assoc:
                process = ScoringProcess(args.redis_host, args.redis_port,
                                         args.elasticseach_nodes)
                if not args.qc_only:
                    process.process_all(data_config.scoring_weights,
                                        data_config.is_direct_do_not_propagate,
                                        data_config.datasources_to_datatypes,
                                        args.dry_run,
                                        args.as_workers_production,
                                        args.as_workers_score,
                                        args.as_queue_production_score)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.ddr:
                process = DataDrivenRelationProcess(es)
                if not args.qc_only:
                    process.process_all(args.dry_run,
                                        args.ddr_workers_production,
                                        args.ddr_workers_score,
                                        args.ddr_queue_production_score,
                                        args.ddr_queue_score_result)
                #TODO qc
            if args.sea:
                process = SearchObjectProcess(loader, redis)
                if not args.qc_only:
                    process.process_all(data_config.chembl_target,
                                        data_config.chembl_mechanism,
                                        data_config.chembl_component,
                                        data_config.chembl_protein,
                                        data_config.chembl_molecule_set_uri_pattern,
                                        args.dry_run)
                #TODO qc
            if args.metric:
                Metrics(es, args.metric_file,
                        data_config.datasources_to_datatypes).generate_metrics()

        if args.qc_in:
            #handle reading in previous qc from the filename provided,
            #and adding comparative metrics
            qc_metrics.compare_with(args.qc_in)

        if args.qc_out:
            #handle writing out to a tsv file
            qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')

    return 0
class SearchObjectProcess(object):

    def __init__(self, loader, r_server):
        self.loader = loader
        self.esquery = ESQuery(loader.es)
        self.r_server = r_server
        self.logger = logging.getLogger(__name__)
        '''define data processing handlers'''
        self.data_handlers = defaultdict(lambda: SearchObject)
        self.data_handlers[SearchObjectTypes.TARGET] = SearchObjectTarget
        self.data_handlers[SearchObjectTypes.DISEASE] = SearchObjectDisease

    def process_all(self, chembl_target_uri, chembl_mechanism_uri,
                    chembl_component_uri, chembl_protein_uri,
                    chembl_molecule_set_uri_pattern, dry_run):
        '''
        process all the objects that need to be returned by the search method
        :return:
        '''

        #setup chembl handler
        self.chembl_handler = ChEMBLLookup(chembl_target_uri,
                                           chembl_mechanism_uri,
                                           chembl_component_uri,
                                           chembl_protein_uri,
                                           chembl_molecule_set_uri_pattern)
        self.chembl_handler.get_molecules_from_evidence(self.esquery)
        all_molecules = set()
        for target, molecules in self.chembl_handler.target2molecule.items():
            all_molecules = all_molecules | molecules
        all_molecules = sorted(all_molecules)
        #fetch molecule synonyms in batches
        query_batch_size = 100
        for i in range(0, len(all_molecules), query_batch_size):
            self.chembl_handler.populate_synonyms_for_molecule(
                all_molecules[i:i + query_batch_size],
                self.chembl_handler.molecule2synonyms)

        #setup elasticsearch
        if not dry_run:
            self.loader.create_new_index(
                Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME)
            #need to directly get the versioned index name for this function
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME))

        #process targets
        '''get gene simplified objects and push them to the processing queue'''
        for i, target in enumerate(self.esquery.get_all_targets()):
            target[SearchObjectTypes.__ROOT__] = SearchObjectTypes.TARGET
            self.handle_search_object(target, dry_run)

        #process diseases
        '''get disease objects and push them to the processing queue'''
        self.logger.info('get disease objects and push them to the processing queue')
        for i, disease in enumerate(self.esquery.get_all_diseases()):
            disease[SearchObjectTypes.__ROOT__] = SearchObjectTypes.DISEASE
            self.handle_search_object(disease, dry_run)

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
        self.logger.info("DONE")

    def summarise_association(self, data):

        def cap_score(value):
            #clamp scores to the [-1, 1] range
            if value > 1:
                return 1.0
            elif value < -1:
                return -1.0
            return value

        return dict(
            total=[dict(id=data_point['id'],
                        score=cap_score(data_point['harmonic-sum']['overall']))
                   for data_point in data['total']],
            direct=[dict(id=data_point['id'],
                         score=cap_score(data_point['harmonic-sum']['overall']))
                    for data_point in data['direct']])

    def handle_search_object(self, data, dry_run):
        '''process objects to simple search object'''
        so = self.data_handlers[data[SearchObjectTypes.__ROOT__]]()
        so.digest(json_input=data)

        '''inject drug data'''
        if not hasattr(so, 'drugs'):
            so.drugs = {}
        so.drugs['evidence_data'] = []

        '''count associations'''
        if data[SearchObjectTypes.__ROOT__] == SearchObjectTypes.TARGET:
            ass_data = self.esquery.get_associations_for_target(
                data['id'], fields=['id', 'harmonic-sum.overall'], size=20)
            so.set_associations(
                self.summarise_association(ass_data.top_associations),
                ass_data.associations_count)
            if so.id in self.chembl_handler.target2molecule:
                drugs_synonyms = set()
                for molecule in self.chembl_handler.target2molecule[so.id]:
                    if molecule in self.chembl_handler.molecule2synonyms:
                        drugs_synonyms = drugs_synonyms | set(
                            self.chembl_handler.molecule2synonyms[molecule])
                so.drugs['evidence_data'] = list(drugs_synonyms)
        elif data[SearchObjectTypes.__ROOT__] == SearchObjectTypes.DISEASE:
            ass_data = self.esquery.get_associations_for_disease(
                data['path_codes'][0][-1],
                fields=['id', 'harmonic-sum.overall'], size=20)
            so.set_associations(
                self.summarise_association(ass_data.top_associations),
                ass_data.associations_count)
            if so.id in self.chembl_handler.disease2molecule:
                drugs_synonyms = set()
                for molecule in self.chembl_handler.disease2molecule[so.id]:
                    if molecule in self.chembl_handler.molecule2synonyms:
                        drugs_synonyms = drugs_synonyms | set(
                            self.chembl_handler.molecule2synonyms[molecule])
                so.drugs['evidence_data'] = list(drugs_synonyms)
        else:
            so.set_associations()

        '''store search objects'''
        if not dry_run:
            self.loader.put(
                Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME,
                Const.ELASTICSEARCH_DATA_SEARCH_DOC_NAME + '-' + so.type,
                so.id,
                so.to_json())
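# cap_score in summarise_association above is a plain clamp to [-1, 1];
# a few illustrative values using an equivalent closed form:
def _example_cap_score():
    clamp = lambda v: max(-1.0, min(1.0, v))
    assert clamp(1.7) == 1.0
    assert clamp(-2.3) == -1.0
    assert clamp(0.42) == 0.42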