def process_all(self, dry_run,
        ddr_workers_production, ddr_workers_score,
        ddr_queue_production_score, ddr_queue_score_result):
    start_time = time.time()

    target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

    self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
    self.logger.info('target data length: %s size in memory: %f Kb' %
                     (len(target_data), sys.getsizeof(target_data) / 1024.))
    self.logger.info('disease data length: %s size in memory: %f Kb' %
                     (len(disease_data), sys.getsizeof(disease_data) / 1024.))

    '''sort the lists and keep using always the same order in all the steps'''
    disease_keys = sorted(disease_data.keys())
    target_keys = sorted(target_data.keys())

    self.logger.info('getting disease labels')
    disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
    disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]

    self.logger.info('getting target labels')
    target_id_to_label = self.es_query.get_target_labels(target_keys)
    target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

    #setup elasticsearch
    self.loader = Loader(self.es, dry_run=dry_run)
    if not dry_run:
        #need to directly get the versioned index name for this function
        self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
        self.loader.prepare_for_bulk_indexing(
            self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))

    #calculate and store disease-to-disease in multiple processes
    self.logger.info('handling disease-to-disease')
    handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
        disease_keys, target_keys, 0.19, 1024, self.loader, dry_run,
        ddr_workers_production, ddr_workers_score,
        ddr_queue_production_score, ddr_queue_score_result)
    self.logger.info('handled disease-to-disease')

    #calculate and store target-to-target in multiple processes
    self.logger.info('handling target-to-target')
    handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
        target_keys, disease_keys, 0.19, 1024, self.loader, dry_run,
        ddr_workers_production, ddr_workers_score,
        ddr_queue_production_score, ddr_queue_score_result)
    self.logger.info('handled target-to-target')

    #cleanup elasticsearch
    if not dry_run:
        self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
        #restore old pre-load settings
        #note this automatically does all prepared indexes
        self.loader.restore_after_bulk_indexing()
def __init__(self, redis_host, redis_port, es_hosts):
    self.logger = logging.getLogger(__name__)

    self.es_hosts = es_hosts
    self.es = new_es_client(self.es_hosts)
    self.es_loader = Loader(self.es)
    self.es_query = ESQuery(self.es)

    self.redis_host = redis_host
    self.redis_port = redis_port
    self.r_server = new_redis_client(self.redis_host, self.redis_port)
def get_all_evidence_for_datatype(self, datatype, fields=None):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html
    index_name = Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match": {
                    "type": datatype
                }
            },
            '_source': self._get_source_from_fields(fields),
            'size': 1000,
        },
        scroll='12h',
        index=index_name,
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_validated_evidence_strings(self, size=1000, datasources=[], is_valid=True):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html
    index_name = Loader.get_versioned_index(
        Const.ELASTICSEARCH_VALIDATED_DATA_INDEX_NAME + '*', True)

    doc_type = None
    if datasources:
        doc_type = datasources

    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_phrase": {
                    "is_valid": is_valid
                }
            },
            '_source': True,
            'size': size,
        },
        scroll='12h',
        doc_type=doc_type,
        index=index_name,
        timeout="20m",
    )
    for hit in res:
        yield hit['_source']
def get_associations_for_disease(self, disease, fields=None, size=100, get_top_hits=True):
    source = self._get_source_from_fields(fields)

    aggs = addict.Dict()
    if get_top_hits:
        aggs.direct_associations.filter.term.is_direct = True
        aggs.direct_associations.aggs.top_direct_ass.top_hits.sort['harmonic-sum.overall'].order = 'desc'
        aggs.direct_associations.aggs.top_direct_ass.top_hits._source = source
        aggs.direct_associations.aggs.top_direct_ass.top_hits.size = size

    q = addict.Dict()
    q.query.constant_score.filter.terms['disease.id'] = [disease]
    q.sort['harmonic-sum.overall'].order = 'desc'
    q._source = source
    q.aggs = aggs
    q.size = size

    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        doc_type=Const.ELASTICSEARCH_DATA_ASSOCIATION_DOC_NAME,
        body=q.to_dict())
    return AssociationSummary(res)
def setup_writers(dry_run, es_hosts, output_folder):
    global_init = None
    local_init = None
    main = None
    local_shutdown = None
    global_shutdown = None

    if dry_run:
        main = dry_run_main
    elif es_hosts:
        #have to bake the loader object in so that the prepare for bulk indexing works
        es_loader = Loader(new_es_client(es_hosts))
        #use partial to "bake" arguments into the functions we return
        global_init = functools.partial(elasticsearch_global_init, es_loader)
        local_init = functools.partial(elasticsearch_local_init, es_hosts)
        main = elasticsearch_main
        local_shutdown = elasticsearch_local_shutdown
        global_shutdown = functools.partial(elasticsearch_global_shutdown, es_loader)
    elif output_folder:
        #use partial to "bake" arguments into the functions we return
        global_init = functools.partial(file_global_init, output_folder)
        local_init = functools.partial(file_local_init, output_folder)
        main = file_main
        local_shutdown = file_local_shutdown
    else:
        raise ValueError("Must specify one of dry_run, es_hosts, output_folder")

    return global_init, local_init, main, local_shutdown, global_shutdown
def get_disease_to_targets_vectors(self, treshold=0.1, evidence_count=3):
    '''
    Get all the association objects that are:
    - direct -> to avoid ontology inflation
    - evidence count >= `evidence_count` -> remove noise
    - overall score >= `treshold` -> remove very low quality noise
    :param treshold: minimum overall score threshold to consider for fetching association data
    :param evidence_count: minimum number of evidence items to consider for fetching association data
    :return: two dictionaries mapping target to disease and the reverse
    '''
    self.logger.debug('scan es to get all diseases and targets')

    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "term": {
                    "is_direct": True,
                }
            },
            '_source': {
                'includes': [
                    "target.id", 'disease.id', 'harmonic-sum', 'evidence_count'
                ]
            },
            'size': 1000,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        timeout="10m",
    )

    target_results = dict()
    disease_results = dict()

    self.logger.debug('start getting all targets and diseases from es')
    c = 0
    for hit in res:
        c += 1
        hit = hit['_source']
        if hit['evidence_count']['total'] >= evidence_count and \
                hit['harmonic-sum']['overall'] >= treshold:
            '''store target associations'''
            if hit['target']['id'] not in target_results:
                target_results[hit['target']['id']] = SparseFloatDict()
            #TODO: return all counts and scores up to datasource level
            target_results[hit['target']['id']][hit['disease']['id']] = \
                hit['harmonic-sum']['overall']

            '''store disease associations'''
            if hit['disease']['id'] not in disease_results:
                disease_results[hit['disease']['id']] = SparseFloatDict()
            #TODO: return all counts and scores up to datasource level
            disease_results[hit['disease']['id']][hit['target']['id']] = \
                hit['harmonic-sum']['overall']

        if c % 10000 == 0:
            self.logger.debug('%d elements retrieved', c)

    return target_results, disease_results
def score_producer_local_init(es_hosts, redis_host, redis_port,
        lookup_data, datasources_to_datatypes, dry_run):
    #set the R server to lookup into
    r_server = new_redis_client(redis_host, redis_port)
    scorer = Scorer()
    loader = Loader(new_es_client(es_hosts))

    return scorer, loader, r_server, lookup_data, datasources_to_datatypes, dry_run
def get_evidence_for_target_simple(self, target, expected=None):
    query_body = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "target.id": target
                    }
                }
            }
        },
        '_source': {
            "includes": [
                "target.id",
                "private.efo_codes",
                "disease.id",
                "scores.association_score",
                "sourceID",
                "id",
            ]
        },
    }

    if expected is not None and expected < 10000:
        query_body['size'] = 10000
        res = self.handler.search(
            index=Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            body=query_body)
        for hit in res['hits']['hits']:
            yield hit['_source']
    else:
        res = helpers.scan(
            client=self.handler,
            query=query_body,
            scroll='1h',
            index=Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            timeout="1h",
            request_timeout=2 * 60 * 60,
            size=1000)
        for hit in res:
            yield hit['_source']
def count_elements_in_index(self, index_name, doc_type=None, query=None):
    if query is None:
        query = {"match_all": {}}

    res = self.handler.search(
        index=Loader.get_versioned_index(index_name, True),
        doc_type=doc_type,
        body={
            "query": query,
            '_source': False,
            'size': 0,
        })
    return res['hits']['total']
def get_reaction(self, reaction_id):
    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
        doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
        body={
            "query": {
                "ids": {
                    "values": [reaction_id]
                }
            },
            '_source': True,
            'size': 1,
        })
    for hit in res['hits']['hits']:
        return hit['_source']
def get_all_associations(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def count_evidence_for_target(self, target):
    res = self.handler.search(
        index=Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
        body={
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "target.id": target
                        }
                    }
                }
            },
            '_source': [],
            'size': 0
        })
    return res['hits']['total']
def get_all_ensembl_genes(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_uniprot_entries(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 100,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_UNIPROT_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield jsonpickle.decode(base64.b64decode(hit['_source']['entry']))
def get_all_reactions(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_diseases(self, fields=None):
    source = self._get_source_from_fields(fields)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': source,
            'size': 1000,
        },
        scroll='12h',
        doc_type=Const.ELASTICSEARCH_EFO_LABEL_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_target_ids_with_evidence_data(self):
    #TODO: use an aggregation to get those with just data
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': False,
            'size': 100,
        },
        scroll='12h',
        doc_type=Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME, True),
        timeout="30m",
    )
    for target in res:
        yield target['_id']
def get_disease_labels(self, ids):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "ids": {
                    "values": ids,
                }
            },
            '_source': 'label',
            'size': 1,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
        timeout="10m",
    )
    return dict((hit['_id'], hit['_source']['label']) for hit in res)
def get_all_target_disease_pair_from_evidence(self, only_direct=False):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': self._get_source_from_fields([
                'target.id',
                'disease.id',
                'private.efo_codes',
                'scores.association_score'
            ]),
            'size': 1000,
        },
        scroll='6h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
        timeout="1h",
        request_timeout=2 * 60 * 60,
    )

    yielded_pairs = set()
    for hit in res:
        if hit['_source']['scores']['association_score'] > 0:
            if only_direct:
                pair = '-'.join([
                    hit['_source']['target']['id'],
                    hit['_source']['disease']['id']
                ])
                if pair not in yielded_pairs:
                    yield pair
                    yielded_pairs.add(pair)
            else:
                for efo_id in hit['_source']['private']['efo_codes']:
                    pair = '-'.join([hit['_source']['target']['id'], efo_id])
                    if pair not in yielded_pairs:
                        yield pair
                        yielded_pairs.add(pair)
def get_all_evidence(self, fields=None):
    index_name = Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': self._get_source_from_fields(fields),
            'size': 1000,
        },
        scroll='12h',
        index=index_name,
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def elasticsearch_local_init(es_hosts):
    #note the trailing comma: the local init returns a single-element tuple of resources
    return Loader(new_es_client(es_hosts)),
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config, disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    if not args.release_tag:
        logger.error('A [release-tag] has to be specified.')
        print('A [release-tag] has to be specified.', file=sys.stderr)
        return 1
    else:
        Config.RELEASE_VERSION = args.release_tag
        logger.info('setting release version %s' % Config.RELEASE_VERSION)

    with RedisManager(args.redis_remote, args.redis_host, args.redis_port):

        es = new_es_client(args.elasticseach_nodes)
        redis = new_redis_client(args.redis_host, args.redis_port)

        #create a single query object for future use
        esquery = ESQuery(es)

        #read the data configuration
        data_config = mrtarget.cfg.get_data_config(args.data_config)

        #create something to accumulate qc metrics into over various steps
        qc_metrics = QCMetrics()

        with Loader(es,
                chunk_size=ElasticSearchConfiguration.bulk_load_chunk,
                dry_run=args.dry_run) as loader:

            if args.rea:
                process = ReactomeProcess(loader,
                    data_config.reactome_pathway_data,
                    data_config.reactome_pathway_relation)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ens:
                process = EnsemblProcess(loader)
                if not args.qc_only:
                    process.process(data_config.ensembl_filename, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.unic:
                process = UniprotDownloader(loader)
                if not args.qc_only:
                    process.process(data_config.uniprot_uri, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.hpa:
                process = HPAProcess(loader, redis, args.elasticseach_nodes,
                    data_config.tissue_translation_map,
                    data_config.tissue_curation_map,
                    data_config.hpa_normal_tissue,
                    data_config.hpa_rna_level,
                    data_config.hpa_rna_value,
                    data_config.hpa_rna_zscore)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.gen:
                process = GeneManager(loader, redis,
                    args.gen_plugin_places,
                    data_config.gene_data_plugin_names)
                if not args.qc_only:
                    process.merge_all(data_config, dry_run=args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.efo:
                process = EfoProcess(loader,
                    data_config.ontology_efo,
                    data_config.ontology_hpo,
                    data_config.ontology_mp,
                    data_config.disease_phenotype)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.eco:
                process = EcoProcess(loader,
                    data_config.ontology_eco,
                    data_config.ontology_so)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.val:
                es_output_folder = None
                if "elasticsearch_folder" in vars(args) and args.elasticsearch_folder is not None:
                    es_output_folder = args.elasticsearch_folder

                process_evidences_pipeline(
                    filenames=data_config.input_file,
                    first_n=args.val_first_n,
                    es_client=es,
                    redis_client=redis,
                    dry_run=args.dry_run,
                    output_folder=es_output_folder,
                    num_workers=args.val_workers_validator,
                    num_writers=args.val_workers_writer,
                    max_queued_events=args.val_queue_validator_writer,
                    eco_scores_uri=data_config.eco_scores,
                    schema_uri=data_config.schema,
                    es_hosts=args.elasticseach_nodes,
                    excluded_biotypes=data_config.excluded_biotypes,
                    datasources_to_datatypes=data_config.datasources_to_datatypes)

                #TODO qc

            if args.assoc:
                process = ScoringProcess(args.redis_host, args.redis_port,
                    args.elasticseach_nodes)
                if not args.qc_only:
                    process.process_all(data_config.scoring_weights,
                        data_config.is_direct_do_not_propagate,
                        data_config.datasources_to_datatypes,
                        args.dry_run,
                        args.as_workers_production,
                        args.as_workers_score,
                        args.as_queue_production_score)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ddr:
                process = DataDrivenRelationProcess(es)
                if not args.qc_only:
                    process.process_all(args.dry_run,
                        args.ddr_workers_production,
                        args.ddr_workers_score,
                        args.ddr_queue_production_score,
                        args.ddr_queue_score_result)
                #TODO qc

            if args.sea:
                process = SearchObjectProcess(loader, redis)
                if not args.qc_only:
                    process.process_all(
                        data_config.chembl_target,
                        data_config.chembl_mechanism,
                        data_config.chembl_component,
                        data_config.chembl_protein,
                        data_config.chembl_molecule_set_uri_pattern,
                        args.dry_run)
                #TODO qc

            if args.metric:
                process = Metrics(es, args.metric_file,
                    data_config.datasources_to_datatypes).generate_metrics()

            if args.qc_in:
                #handle reading in previous qc from filename provided, and adding comparative metrics
                qc_metrics.compare_with(args.qc_in)

            if args.qc_out:
                #handle writing out to a tsv file
                qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
def exists(self, index, doc_type, id, realtime=False):
    return self.handler.exists(
        index=Loader.get_versioned_index(index, True),
        doc_type=doc_type,
        id=id,
        realtime=realtime)
def delete_data(self, index, query, doc_type='', chunk_size=1000, altered_keys=()):
    '''
    Delete all the documents in an index matching a given query
    :param index: index to use
    :param query: query matching the elements to remove
    :param doc_type: document types, default is to look for all the doc types
    :param chunk_size: size of the bulk action sent to delete
    :param altered_keys: list of fields to fetch data and return as being altered by the delete query
    :return: dict of keys altered by the query
    '''

    '''count available data'''
    res = self.handler.search(
        index=Loader.get_versioned_index(index, True),
        body={
            "query": query,
            '_source': False,
            'size': 0,
        },
        doc_type=doc_type,
    )
    total = res['hits']['total']

    '''if data is matching query, delete it with scan and bulk'''
    altered = dict()
    for key in altered_keys:
        altered[key] = set()

    if total:
        batch = []
        for hit in helpers.scan(
                client=self.handler,
                query={
                    "query": query,
                    '_source': self._get_source_from_fields(altered_keys),
                    'size': chunk_size,
                },
                scroll='1h',
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                timeout='1h',
        ):
            action = {
                '_op_type': 'delete',
                '_index': hit['_index'],
                '_type': hit['_type'],
                '_id': hit['_id'],
            }
            batch.append(action)

            flat_source = self.flatten(hit['_source'])
            for key in altered_keys:
                if key in flat_source:
                    altered[key].add(flat_source[key])

            if len(batch) >= chunk_size:
                self._flush_bulk(batch)
                batch = []

        #flush whatever is left in the final, partial batch
        self._flush_bulk(batch)

        '''flush changes'''
        self.handler.indices.flush(
            Loader.get_versioned_index(index, True),
            wait_if_ongoing=True)

    return altered
def get_objects_by_id(self, ids, index, doc_type, source=True,
        source_exclude=[], realtime=False):
    '''
    :param ids: list of identifiers for documents
    :param index: index for all the documents
    :param doc_type: doc type for all the documents
    :return: generator of documents
    '''
    if isinstance(ids, (list, tuple)):
        res = self.handler.mget(
            index=Loader.get_versioned_index(index, True),
            doc_type=doc_type,
            body=dict(ids=ids),
            _source=source,
            _source_exclude=source_exclude,
            realtime=True,
        )
        if not res:
            #retry once after a short pause
            time.sleep(0.1)
            res = self.handler.mget(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                body=dict(ids=ids),
                _source=source,
                _source_exclude=source_exclude,
                realtime=True,
            )
        for doc in res['docs']:
            if doc['found']:
                yield doc['_source']
            else:
                raise KeyError('object with id %s not found' % (doc['_id']))
    else:
        try:
            res = self.handler.get(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                id=ids,
                _source=source,
                _source_exclude=source_exclude,
                realtime=True,
            )
            try:
                yield res['_source']
            except Exception as e:
                self.logger.exception('cannot retrieve single object by id %s ' % ids)
                raise KeyError('object with id %s not found' % ids)
        except TransportError as te:
            if te.status_code == 404:
                raise KeyError('object with id %s not found' % ids)
class ScoringProcess():

    def __init__(self, redis_host, redis_port, es_hosts):
        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
            datasources_to_datatypes, dry_run,
            num_workers_produce, num_workers_score,
            max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es, self.r_server,
            targets=[],
            data_types=(
                LookUpDataType.DISEASE,
                LookUpDataType.TARGET,
                LookUpDataType.ECO,
                LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init,
            self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init,
            self.es_hosts, self.redis_host, self.redis_port,
            lookup_data, datasources_to_datatypes, dry_run)

        #this doesn't need to be in the external config, since it is so content light
        #as to be meaningless
        max_queued_score_out = 10000

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence, targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(
            score_producer, pipeline_stage,
            workers=num_workers_score,
            maxsize=max_queued_score_out,
            on_start=score_producer_local_init_baked,
            on_done=score_producer_local_shutdown)

        #loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    def qc(self, esquery):
        """
        Run a series of QC tests on the association Elasticsearch index.
        Returns a dictionary of string test names and result objects.
        """
        #number of association entries
        association_count = 0
        #Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        #put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics