def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
                        datasources_to_datatypes, es_hosts, es_index_gene,
                        es_index_eco, es_index_efo,
                        cache_target, cache_target_u2e, cache_target_contains,
                        cache_eco, cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=cache_target,
        gene_cache_u2e_size=cache_target_u2e,
        gene_cache_contains_size=cache_target_contains,
        eco_index=es_index_eco,
        eco_cache_size=cache_eco,
        efo_index=es_index_efo,
        efo_cache_size=cache_efo,
        efo_cache_contains_size=cache_efo_contains).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
                                       excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
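# How the on_start contract is used here (a sketch; pypeln-style pipelines pass
# whatever on_start returns as extra arguments to each worker call, so the
# worker signature below is a hypothetical illustration, not the original code):
#
#     def process_evidence(line, logger, validator, lookup_data,
#                          datasources_to_datatypes, evidence_manager):
#         ...  # validate one line of evidence using the pre-built state
#
#     pl_stage = pr.map(process_evidence, evs,
#                       workers=workers_validation,
#                       maxsize=queue_validation,
#                       on_start=validation_on_start_baked)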
def _store_efo(self, dry_run):
    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #write into elasticsearch
        chunk_size = 1000 #TODO make configurable
        actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s documents failed to index" % failcount)
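# The parallel/streaming bulk-write block used in _store_efo recurs nearly
# verbatim throughout this pipeline. A possible consolidation into one helper
# (a sketch only; `bulk_index` is a hypothetical name, not part of the
# original code):

import elasticsearch.helpers

def bulk_index(es, actions, workers_write, queue_write, chunk_size=1000):
    """Index `actions`, in parallel when workers_write > 0, raising if any fail."""
    if workers_write > 0:
        results = elasticsearch.helpers.parallel_bulk(es, actions,
            thread_count=workers_write,
            queue_size=queue_write,
            chunk_size=chunk_size)
    else:
        results = elasticsearch.helpers.streaming_bulk(es, actions,
            chunk_size=chunk_size)

    # both helpers yield (ok, item) tuples; count the failures
    failcount = sum(1 for success, _ in results if not success)
    if failcount:
        raise RuntimeError("%s documents failed to index" % failcount)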
def setup_writers(dry_run, es_hosts, output_folder):
    global_init = None
    local_init = None
    main = None
    local_shutdown = None
    global_shutdown = None

    if dry_run:
        main = dry_run_main
    elif es_hosts:
        #have to bake the loader object in so that the prepare for bulk indexing works
        es_loader = Loader(new_es_client(es_hosts))
        #use partial to "bake" arguments into the function we return
        global_init = functools.partial(elasticsearch_global_init, es_loader)
        local_init = functools.partial(elasticsearch_local_init, es_hosts)
        main = elasticsearch_main
        local_shutdown = elasticsearch_local_shutdown
        global_shutdown = functools.partial(elasticsearch_global_shutdown, es_loader)
    elif output_folder:
        #use partial to "bake" arguments into the function we return
        global_init = functools.partial(file_global_init, output_folder)
        local_init = functools.partial(file_local_init, output_folder)
        main = file_main
        local_shutdown = file_local_shutdown
    else:
        raise ValueError("Must specify one of dry_run, es_hosts, output_folder")

    return global_init, local_init, main, local_shutdown, global_shutdown
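# A minimal usage sketch for setup_writers (hypothetical wiring; the real
# callers and the callback signatures live elsewhere in the pipeline):
#
#     global_init, local_init, main, local_shutdown, global_shutdown = \
#         setup_writers(dry_run=False,
#                       es_hosts=['http://localhost:9200'],
#                       output_folder=None)
#     if global_init:
#         global_init()                 # e.g. prepare indices for bulk loading
#     state = local_init() if local_init else None   # per-worker setup
#     ...                               # feed items through main(...)
#     if local_shutdown:
#         local_shutdown()
#     if global_shutdown:
#         global_shutdown()             # e.g. restore index settings, flush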
def process_all(self, dry_run):
    es = new_es_client(self.es_hosts)

    target_data, disease_data = get_disease_to_targets_vectors(
        self.score_threshold, self.evidence_count, es, self.es_index_assoc)

    if len(target_data) == 0 or len(disease_data) == 0:
        raise Exception(
            'Could not find a set of targets AND diseases that had the sufficient number'
            ' of evidences or acceptable harmonic sum score')

    #sort the lists and keep using always the same order in all the steps
    disease_keys = sorted(disease_data.keys())
    target_keys = sorted(target_data.keys())

    self.logger.info('getting disease labels')
    disease_id_to_label = get_disease_labels(disease_keys, es, self.es_index_efo)
    disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]

    self.logger.info('getting target labels')
    target_id_to_label = get_target_labels(target_keys, es, self.es_index_gen)
    target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
            disease_keys, target_keys, 0.19, 1024, es, dry_run,
            self.ddr_workers_production, self.ddr_workers_score,
            self.ddr_workers_write, self.ddr_queue_production_score,
            self.ddr_queue_score_result, self.ddr_queue_write,
            self.es_index, self.es_doc)
        self.logger.info('handled disease-to-disease')

        #calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
            target_keys, disease_keys, 0.19, 1024, es, dry_run,
            self.ddr_workers_production, self.ddr_workers_score,
            self.ddr_workers_write, self.ddr_queue_production_score,
            self.ddr_queue_score_result, self.ddr_queue_write,
            self.es_index, self.es_doc)
        self.logger.info('handled target-to-target')
def score_producer_local_init(es_hosts, redis_host, redis_port, lookup_data,
                              datasources_to_datatypes, dry_run):
    #set the R server to lookup into
    r_server = new_redis_client(redis_host, redis_port)
    scorer = Scorer()
    loader = Loader(new_es_client(es_hosts))

    return scorer, loader, r_server, lookup_data, datasources_to_datatypes, dry_run
def process_all(self, dry_run):
    self.relations = dict()
    self.g.add_node('root', name="", species="")

    for row in self.downloader.get_pathway_data():
        self.g.add_node(row['id'], name=row['name'], species=row['species'])

    children = set()
    for row in self.downloader.get_pathway_relations():
        self.g.add_edge(row['id'], row['child'])
        children.add(row['child'])

    nodes_without_parent = set(self.g.nodes()) - children
    for node in nodes_without_parent:
        if node != 'root':
            self.g.add_edge('root', node)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #write into elasticsearch
        chunk_size = 1000 #TODO make configurable
        docs = generate_documents(self.g)
        actions = elasticsearch_actions(docs, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)
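# A small illustration of the root-linking step above, assuming self.g is a
# networkx.DiGraph (the node labels here are made up): any pathway that never
# appears as a child gets attached under the synthetic 'root' node, so the
# hierarchy becomes a single tree.

import networkx as nx

g = nx.DiGraph()
g.add_node('root')
g.add_edge('A', 'B')              # B has a parent, A does not
children = {'B'}
for node in set(g.nodes()) - children:
    if node != 'root':
        g.add_edge('root', node)  # only A gets attached under root
assert list(g.predecessors('A')) == ['root']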
def __init__(self, redis_host, redis_port, es_hosts):
    self.logger = logging.getLogger(__name__)

    self.es_hosts = es_hosts
    self.es = new_es_client(self.es_hosts)
    self.es_loader = Loader(self.es)
    self.es_query = ESQuery(self.es)

    self.redis_host = redis_host
    self.redis_port = redis_port
    self.r_server = new_redis_client(self.redis_host, self.redis_port)
def merge_all(self, dry_run):
    es = new_es_client(self.es_hosts)

    #run the actual plugins
    for plugin_name in self.plugin_order:
        plugin = self.simplePluginManager.getPluginByName(plugin_name)
        # TODO remove the former redis object from all plugins
        plugin.plugin_object.merge_data(self.genes, es, None,
            self.data_config, self.es_config)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    # Hot fix issue 643: missing pathway in the association. Needs a review of the reactome functions
    for geneid, gene in self.genes.iterate():
        gene._create_suggestions()
        gene._create_facets()

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #write into elasticsearch
        chunk_size = 1000 #TODO make configurable
        actions = elasticsearch_actions(self.genes, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s documents failed to index" % failcount)
def score_producer_local_init(datasources_to_datatypes, dry_run, es_hosts,
                              es_index_gene, es_index_eco, es_index_hpa, es_index_efo):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
        (
            LookUpDataType.DISEASE,
            LookUpDataType.TARGET,
            LookUpDataType.ECO,
            LookUpDataType.HPA
        ),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        hpa_index=es_index_hpa,
        efo_index=es_index_efo).lookup

    return scorer, lookup_data, datasources_to_datatypes, dry_run
def process_all(self, dry_run):
    '''
    Process all the objects that need to be returned by the search method
    :return:
    '''
    es = new_es_client(self.es_hosts)

    #setup chembl handler
    self.chembl_handler = ChEMBLLookup(
        self.chembl_target_uri,
        self.chembl_mechanism_uri,
        self.chembl_component_uri,
        self.chembl_protein_uri,
        self.chembl_molecule_set_uri_pattern)
    self.chembl_handler.get_molecules_from_evidence(es, self.es_index_val_right)

    all_molecules = set()
    for target, molecules in self.chembl_handler.target2molecule.items():
        all_molecules = all_molecules | molecules
    all_molecules = sorted(all_molecules)

    query_batch_size = 100
    for i in range(0, len(all_molecules), query_batch_size):
        self.chembl_handler.populate_synonyms_for_molecule(
            all_molecules[i:i + query_batch_size],
            self.chembl_handler.molecule2synonyms)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #process targets
        self.logger.info('handling targets')
        targets = self.get_targets(es)
        so_it = self.handle_search_object(targets, es, SearchObjectTypes.TARGET)
        store_in_elasticsearch(so_it, dry_run, es, self.es_index, self.es_doc,
            self.workers_write, self.queue_write)

        #process diseases
        self.logger.info('handling diseases')
        diseases = self.get_diseases(es)
        so_it = self.handle_search_object(diseases, es, SearchObjectTypes.DISEASE)
        store_in_elasticsearch(so_it, dry_run, es, self.es_index, self.es_doc,
            self.workers_write, self.queue_write)
def store_data(self, dry_run):
    self.logger.info('store_data called')
    self.logger.debug('calling to create new expression index')

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #write into elasticsearch
        chunk_size = 1000 #TODO make configurable
        actions = elasticsearch_actions(self.hpa_merged_table, dry_run, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s documents failed to index" % failcount)

    self.logger.info('missing tissues %s', str(_missing_tissues))
def process(self, dry_run):
    self.logger.info('Reading Ensembl gene info from %s' % self.ensembl_filename)
    lines = more_itertools.with_iter(URLZSource(self.ensembl_filename).open())

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #write into elasticsearch
        chunk_size = 1000 #TODO make configurable
        actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s documents failed to index" % failcount)
def score_producer_local_init(
        datasources_to_datatypes,
        dry_run,
        es_hosts,
        es_index_gene,
        es_index_hpa,
        es_index_efo,
        gene_cache_size,
        hpa_cache_size,
        efo_cache_size):
    scorer = Scorer()
    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=gene_cache_size,
        hpa_index=es_index_hpa,
        hpa_cache_size=hpa_cache_size,
        efo_index=es_index_efo,
        efo_cache_size=efo_cache_size).lookup

    return scorer, lookup_data, datasources_to_datatypes, dry_run
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
                        datasources_to_datatypes, es_hosts, es_index_gene,
                        es_index_eco, es_index_efo):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(schema_uri)

    lookup_data = LookUpDataRetriever(
        new_es_client(es_hosts),
        (LookUpDataType.DISEASE, LookUpDataType.TARGET, LookUpDataType.ECO),
        gene_index=es_index_gene,
        eco_index=es_index_eco,
        efo_index=es_index_efo).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
                                       excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
def process(self, dry_run):
    self.logger.debug("download uniprot uri %s", self.uri)
    self.logger.debug("to generate this file you have to call this url "
        "https://www.uniprot.org/uniprot/?query=reviewed%3Ayes%2BAND%2Borganism%3A9606&compress=yes&format=xml")

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    chunk_size = 1000  # TODO make configurable
    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        items = generate_uniprot(self.uri)
        actions = elasticsearch_actions(items, self.es_index, self.es_doc)

        #write into elasticsearch
        failcount = 0
        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s documents failed to index" % failcount)
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config, disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    if not args.release_tag:
        logger.error('A [release-tag] has to be specified.')
        print('A [release-tag] has to be specified.', file=sys.stderr)
        return 1
    else:
        Config.RELEASE_VERSION = args.release_tag
        logger.info('setting release version %s' % Config.RELEASE_VERSION)

    with RedisManager(args.redis_remote, args.redis_host, args.redis_port):
        es = new_es_client(args.elasticseach_nodes)
        redis = new_redis_client(args.redis_host, args.redis_port)

        #create a single query object for future use
        esquery = ESQuery(es)

        #read the data configuration
        data_config = mrtarget.cfg.get_data_config(args.data_config)

        #create something to accumulate qc metrics into over various steps
        qc_metrics = QCMetrics()

        with Loader(es,
                    chunk_size=ElasticSearchConfiguration.bulk_load_chunk,
                    dry_run=args.dry_run) as loader:

            if args.rea:
                process = ReactomeProcess(loader,
                    data_config.reactome_pathway_data,
                    data_config.reactome_pathway_relation)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ens:
                process = EnsemblProcess(loader)
                if not args.qc_only:
                    process.process(data_config.ensembl_filename, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.unic:
                process = UniprotDownloader(loader)
                if not args.qc_only:
                    process.process(data_config.uniprot_uri, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.hpa:
                process = HPAProcess(loader, redis, args.elasticseach_nodes,
                    data_config.tissue_translation_map,
                    data_config.tissue_curation_map,
                    data_config.hpa_normal_tissue,
                    data_config.hpa_rna_level,
                    data_config.hpa_rna_value,
                    data_config.hpa_rna_zscore)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.gen:
                process = GeneManager(loader, redis,
                    args.gen_plugin_places,
                    data_config.gene_data_plugin_names)
                if not args.qc_only:
                    process.merge_all(data_config, dry_run=args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.efo:
                process = EfoProcess(loader,
                    data_config.ontology_efo,
                    data_config.ontology_hpo,
                    data_config.ontology_mp,
                    data_config.disease_phenotype)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.eco:
                process = EcoProcess(loader,
                    data_config.ontology_eco,
                    data_config.ontology_so)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.val:
                es_output_folder = None
                if "elasticsearch_folder" in vars(args) and args.elasticsearch_folder is not None:
                    es_output_folder = args.elasticsearch_folder

                process_evidences_pipeline(
                    filenames=data_config.input_file,
                    first_n=args.val_first_n,
                    es_client=es,
                    redis_client=redis,
                    dry_run=args.dry_run,
                    output_folder=es_output_folder,
                    num_workers=args.val_workers_validator,
                    num_writers=args.val_workers_writer,
                    max_queued_events=args.val_queue_validator_writer,
                    eco_scores_uri=data_config.eco_scores,
                    schema_uri=data_config.schema,
                    es_hosts=args.elasticseach_nodes,
                    excluded_biotypes=data_config.excluded_biotypes,
                    datasources_to_datatypes=data_config.datasources_to_datatypes)
                #TODO qc

            if args.assoc:
                process = ScoringProcess(args.redis_host, args.redis_port,
                    args.elasticseach_nodes)
                if not args.qc_only:
                    process.process_all(data_config.scoring_weights,
                        data_config.is_direct_do_not_propagate,
                        data_config.datasources_to_datatypes,
                        args.dry_run,
                        args.as_workers_production,
                        args.as_workers_score,
                        args.as_queue_production_score)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ddr:
                process = DataDrivenRelationProcess(es)
                if not args.qc_only:
                    process.process_all(args.dry_run,
                        args.ddr_workers_production,
                        args.ddr_workers_score,
                        args.ddr_queue_production_score,
                        args.ddr_queue_score_result)
                #TODO qc

            if args.sea:
                process = SearchObjectProcess(loader, redis)
                if not args.qc_only:
                    process.process_all(
                        data_config.chembl_target,
                        data_config.chembl_mechanism,
                        data_config.chembl_component,
                        data_config.chembl_protein,
                        data_config.chembl_molecule_set_uri_pattern,
                        args.dry_run)
                #TODO qc

            if args.metric:
                Metrics(es, args.metric_file,
                    data_config.datasources_to_datatypes).generate_metrics()

        if args.qc_in:
            #handle reading in previous qc from filename provided, and adding comparative metrics
            qc_metrics.compare_with(args.qc_in)

        if args.qc_out:
            #handle writing out to a tsv file
            qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
# ./data_pipeline_extract.py --index=master_evidence-data --output-dir=~/data/ot/extract --output-filename=evidence.json
# ./data_pipeline_extract.py --index=master_gene-data --output-dir=~/data/ot/extract --output-filename=gene.json
# ./data_pipeline_extract.py --index=master_association-data --output-dir=~/data/ot/extract --output-filename=association.json
# ./data_pipeline_extract.py --index=master_efo-data --output-dir=~/data/ot/extract --output-filename=efo.json --id-field-name=id
# ./data_pipeline_extract.py --index=master_eco-data --output-dir=~/data/ot/extract --output-filename=eco.json
from mrtarget.common.connection import new_es_client
from elasticsearch import helpers
from pathlib import Path
import more_itertools
import tqdm
import argparse
import json
import os
import logging

es = new_es_client('http://elasticsearch:9200')


def get_record_iterator(index, id_field, batch_size=10000):
    # Setup scanner for entire index, keeping the scroll context alive for an hour
    query = {"query": {"match_all": {}}}
    res = helpers.scan(es, query, index=index, size=batch_size, scroll='1h')
    for batch in more_itertools.chunked(tqdm.tqdm(res), batch_size):
        for r in batch:
            rec = r['_source']
            if id_field:
                rec[id_field] = r['_id']
            yield rec


def export(index, out_file, id_field):
    logging.info(f'Beginning export to {out_file}')
    # assumed continuation (the original is truncated here): stream records
    # out as one JSON object per line
    out_path = Path(os.path.expanduser(out_file))
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'w') as f:
        for rec in get_record_iterator(index, id_field):
            f.write(json.dumps(rec) + '\n')
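# A hypothetical __main__ wiring, inferred from the example invocations in the
# header comments above (the flag names mirror those comments; the real script
# may differ):

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Export an Elasticsearch index to a JSON-lines file')
    parser.add_argument('--index', required=True)
    parser.add_argument('--output-dir', required=True)
    parser.add_argument('--output-filename', required=True)
    parser.add_argument('--id-field-name', default=None)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    export(args.index,
           os.path.join(args.output_dir, args.output_filename),
           args.id_field_name)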
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config, disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    logger.info('`' + " ".join(sys.argv) + '` - starting')

    #read the data configuration
    data_config = mrtarget.cfg.get_config(args.data_config)

    #read the es configuration
    es_config = mrtarget.cfg.get_config(args.es_config)

    #es clients can't be passed around to multiple processes!
    es = new_es_client(args.elasticseach_nodes)

    #create something to accumulate qc metrics into over various steps
    qc_metrics = QCMetrics()

    if args.rea:
        process = ReactomeProcess(args.elasticseach_nodes,
            es_config.rea.name, es_config.rea.mapping, es_config.rea.setting,
            data_config.reactome_pathway_data,
            data_config.reactome_pathway_relation,
            args.rea_workers_writer, args.rea_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.rea.name))

    if args.gen:
        process = GeneManager(args.elasticseach_nodes,
            es_config.gen.name, es_config.gen.mapping, es_config.gen.setting,
            args.gen_plugin_places, data_config.gene_data_plugin_names,
            data_config, es_config,
            args.gen_workers_writer, args.gen_queue_write)
        if not args.qc_only:
            process.merge_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.gen.name))

    if args.efo:
        process = EfoProcess(args.elasticseach_nodes,
            es_config.efo.name, es_config.efo.mapping, es_config.efo.setting,
            data_config.ontology_efo, data_config.ontology_hpo,
            data_config.ontology_mp, data_config.disease_phenotype,
            args.efo_workers_writer, args.efo_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.efo.name))

    if args.eco:
        process = EcoProcess(args.elasticseach_nodes,
            es_config.eco.name, es_config.eco.mapping, es_config.eco.setting,
            data_config.ontology_eco, data_config.ontology_so,
            args.eco_workers_writer, args.eco_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.eco.name))

    if args.val:
        process_evidences_pipeline(
            data_config.input_file,
            args.val_first_n,
            args.elasticseach_nodes,
            es_config.val_right.name, es_config.val_wrong.name,
            es_config.val_right.mapping, es_config.val_wrong.mapping,
            es_config.val_right.setting, es_config.val_wrong.setting,
            es_config.gen.name, es_config.eco.name, es_config.efo.name,
            args.dry_run,
            args.val_append_data,
            args.val_workers_validator, args.val_queue_validator,
            args.val_workers_writer, args.val_queue_validator_writer,
            args.val_cache_target, args.val_cache_target_u2e,
            args.val_cache_target_contains, args.val_cache_eco,
            args.val_cache_efo, args.val_cache_efo_contains,
            data_config.eco_scores,
            data_config.schema,
            data_config.excluded_biotypes,
            data_config.datasources_to_datatypes)
        #TODO qc

    if args.hpa:
        process = HPAProcess(args.elasticseach_nodes,
            es_config.hpa.name, es_config.hpa.mapping, es_config.hpa.setting,
            data_config.tissue_translation_map, data_config.tissue_curation_map,
            data_config.hpa_normal_tissue, data_config.hpa_rna_level,
            data_config.hpa_rna_value, data_config.hpa_rna_zscore,
            args.hpa_workers_writer, args.hpa_queue_write)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.hpa.name))

    if args.assoc:
        process = ScoringProcess(args.elasticseach_nodes,
            es_config.asc.name, es_config.asc.mapping, es_config.asc.setting,
            es_config.gen.name, es_config.val_right.name,
            es_config.hpa.name, es_config.efo.name,
            args.as_workers_writer, args.as_workers_production,
            args.as_workers_score, args.as_queue_score,
            args.as_queue_production, args.as_queue_write,
            args.as_cache_hpa, args.as_cache_efo, args.as_cache_target,
            data_config.scoring_weights,
            data_config.is_direct_do_not_propagate,
            data_config.datasources_to_datatypes)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.asc.name))

    if args.ddr:
        process = DataDrivenRelationProcess(args.elasticseach_nodes,
            es_config.ddr.name, es_config.ddr.mapping, es_config.ddr.setting,
            es_config.efo.name, es_config.gen.name, es_config.asc.name,
            args.ddr_workers_production, args.ddr_workers_score,
            args.ddr_workers_write, args.ddr_queue_production_score,
            args.ddr_queue_score_result, args.ddr_queue_write,
            data_config.ddr["score-threshold"],
            data_config.ddr["evidence-count"])
        if not args.qc_only:
            process.process_all(args.dry_run)
        #TODO qc

    if args.sea:
        process = SearchObjectProcess(args.elasticseach_nodes,
            es_config.sea.name, es_config.sea.mapping, es_config.sea.setting,
            es_config.gen.name, es_config.efo.name,
            es_config.val_right.name, es_config.asc.name,
            args.sea_workers_writer, args.sea_queue_write,
            data_config.chembl_target, data_config.chembl_mechanism,
            data_config.chembl_component, data_config.chembl_protein,
            data_config.chembl_molecule)
        if not args.qc_only:
            process.process_all(args.dry_run)
        #TODO qc

    if args.drg:
        process = DrugProcess(args.elasticseach_nodes,
            es_config.drg.name, es_config.drg.mapping, es_config.drg.setting,
            es_config.gen.name, es_config.efo.name,
            args.drg_workers_writer, args.drg_queue_write,
            args.drg_cache_efo, args.drg_cache_efo_contains,
            args.drg_cache_target, args.drg_cache_target_u2e,
            args.drg_cache_target_contains,
            data_config.chembl_target, data_config.chembl_mechanism,
            data_config.chembl_component, data_config.chembl_protein,
            data_config.chembl_molecule, data_config.chembl_indication,
            data_config.adverse_events)
        if not args.qc_only:
            process.process_all(args.dry_run)
        if not args.skip_qc:
            qc_metrics.update(process.qc(es, es_config.drg.name))

    if args.qc_in:
        #handle reading in previous qc from filename provided, and adding comparative metrics
        qc_metrics.compare_with(args.qc_in)

    if args.qc_out:
        #handle writing out to a tsv file
        qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
def write_on_start(es_hosts):
    kwargs = {}
    es_client = new_es_client(es_hosts)
    kwargs['es_loader'] = Loader(es=es_client)
    return kwargs
def produce_evidence_local_init(es_hosts, es_index_val_right, scoring_weights,
                                is_direct_do_not_propagate, datasources_to_datatypes):
    es = new_es_client(es_hosts)
    return (es, es_index_val_right, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
def process_all(self, dry_run):
    es = new_es_client(self.es_hosts)
    drugs = self.generate(es)
    self.store(es, dry_run, drugs)
def produce_evidence_local_init(es_hosts, scoring_weights,
                                is_direct_do_not_propagate, datasources_to_datatypes):
    es = new_es_client(es_hosts)
    es_query = ESQuery(es)
    return es_query, scoring_weights, is_direct_do_not_propagate, datasources_to_datatypes
def process_evidences_pipeline(filenames, first_n, es_hosts,
        es_index_valid, es_index_invalid, es_doc_valid, es_doc_invalid,
        es_mappings_valid, es_mappings_invalid,
        es_settings_valid, es_settings_invalid,
        es_index_gene, es_index_eco, es_index_efo,
        dry_run, workers_validation, queue_validation,
        workers_write, queue_write,
        eco_scores_uri, schema_uri, excluded_biotypes,
        datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    # do not pass this es object to other processes, single process only!
    es = new_es_client(es_hosts)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.filterfalse(IO.check_to_open, filenames))
    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    # sort the list for consistent behaviour
    checked_filenames = sorted(set(filenames) - set(failed_filenames))

    logger.info('start evidence processing pipeline')

    #create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    #create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
        eco_scores_uri, schema_uri, excluded_biotypes, datasources_to_datatypes,
        es_hosts, es_index_gene, es_index_eco, es_index_efo)

    #here is the pipeline definition
    pl_stage = pr.map(process_evidence, evs,
        workers=workers_validation,
        maxsize=queue_validation,
        on_start=validation_on_start_baked)

    logger.info('stages created, running scoring and writing')

    with URLZSource(es_mappings_valid).open() as mappings_file:
        mappings_valid = json.load(mappings_file)

    with URLZSource(es_mappings_invalid).open() as mappings_file:
        mappings_invalid = json.load(mappings_file)

    with URLZSource(es_settings_valid).open() as settings_file:
        settings_valid = json.load(settings_file)

    with URLZSource(es_settings_invalid).open() as settings_file:
        settings_invalid = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, es_index_invalid, settings_invalid, mappings_invalid):
        with ElasticsearchBulkIndexManager(es, es_index_valid, settings_valid, mappings_valid):
            #load into elasticsearch
            chunk_size = 1000 #TODO make configurable
            actions = elasticsearch_actions(pl_stage,
                es_index_valid, es_index_invalid,
                es_doc_valid, es_doc_invalid)
            failcount = 0

            if not dry_run:
                results = None
                if workers_write > 0:
                    logger.debug("Using parallel bulk writer for Elasticsearch")
                    # this can silently crash ?
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                        thread_count=workers_write,
                        queue_size=queue_write,
                        chunk_size=chunk_size)
                else:
                    logger.debug("Using streaming bulk writer for Elasticsearch")
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                        chunk_size=chunk_size)

                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s documents failed to index" % failcount)

    logger.info('stages created, ran scoring and writing')

    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))
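# A hypothetical invocation of process_evidences_pipeline (every index name,
# URI, and size below is an illustrative placeholder, not a value from the
# original code):
#
#     process_evidences_pipeline(
#         filenames=['https://example.org/evidence/part1.json.gz'],
#         first_n=0,
#         es_hosts=['http://localhost:9200'],
#         es_index_valid='evidence-data', es_index_invalid='evidence-invalid',
#         es_doc_valid='evidencestring', es_doc_invalid='evidencestring',
#         es_mappings_valid='mappings/evidence.json',
#         es_mappings_invalid='mappings/evidence-invalid.json',
#         es_settings_valid='settings/evidence.json',
#         es_settings_invalid='settings/evidence-invalid.json',
#         es_index_gene='gene-data', es_index_eco='eco-data', es_index_efo='efo-data',
#         dry_run=False,
#         workers_validation=4, queue_validation=1000,
#         workers_write=2, queue_write=8,
#         eco_scores_uri='https://example.org/eco_scores.tsv',
#         schema_uri='https://example.org/opentargets.json',
#         excluded_biotypes=[], datasources_to_datatypes={})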
def process_all(self, dry_run):
    # do not pass this es object to other processes, single process only!
    es = new_es_client(self.es_hosts)

    targets = self.get_targets(es)
    self.logger.info('setting up stages')

    #bake the arguments for the setup into function objects
    produce_evidence_local_init_baked = functools.partial(produce_evidence_local_init,
        self.es_hosts, self.es_index_val_right,
        self.scoring_weights, self.is_direct_do_not_propagate,
        self.datasources_to_datatypes)
    score_producer_local_init_baked = functools.partial(score_producer_local_init,
        self.datasources_to_datatypes, dry_run,
        self.es_hosts, self.es_index_gene, self.es_index_eco,
        self.es_index_hpa, self.es_index_efo)

    #pipeline stage for making the lists of the target/disease pairs and evidence
    pipeline_stage1 = pr.flat_map(produce_evidence, targets,
        workers=self.workers_production,
        maxsize=self.queue_produce,
        on_start=produce_evidence_local_init_baked)

    #pipeline stage for scoring the evidence sets
    #includes writing to elasticsearch
    pipeline_stage2 = pr.map(score_producer, pipeline_stage1,
        workers=self.workers_score,
        maxsize=self.queue_score,
        on_start=score_producer_local_init_baked)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        #load into elasticsearch
        self.logger.info('stages created, running scoring and writing')
        client = es
        chunk_size = 1000 #TODO make configurable
        actions = self.elasticsearch_actions(pipeline_stage2, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                self.logger.debug("Using parallel bulk writer for Elasticsearch")
                results = elasticsearch.helpers.parallel_bulk(client, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                self.logger.debug("Using streaming bulk writer for Elasticsearch")
                results = elasticsearch.helpers.streaming_bulk(client, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s documents failed to index" % failcount)

    self.logger.info("DONE")
def elasticsearch_local_init(es_hosts):
    # the trailing comma deliberately returns a 1-tuple, so callers can unpack
    # worker state uniformly with the other local_init functions
    return Loader(new_es_client(es_hosts)),
def get_es_client():
    return new_es_client('http://elasticsearch:9200')
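# Hypothetical usage (the host URL is hard-coded above, matching the extract
# script's default):
#
#     es = get_es_client()
#     print(es.info())   # sanity-check the connection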