def get_validated_evidence_strings(self, size=1000, datasources=[], is_valid=True):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html
    index_name = Loader.get_versioned_index(
        Const.ELASTICSEARCH_VALIDATED_DATA_INDEX_NAME + '*', True)
    doc_type = None
    if datasources:
        doc_type = datasources
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_phrase": {
                    "is_valid": is_valid
                }
            },
            '_source': True,
            'size': size,
        },
        scroll='12h',
        doc_type=doc_type,
        index=index_name,
        timeout="20m",
    )
    for hit in res:
        yield hit['_source']
def get_disease_to_targets_vectors(self, threshold=0.1, evidence_count=3):
    '''
    Get all the association objects that are:
    - direct -> to avoid ontology inflation
    - evidence count >= 3 -> remove noise
    - overall score >= threshold -> remove very low quality noise
    :param threshold: minimum overall score to consider when fetching association data
    :param evidence_count: minimum number of evidence items to consider when fetching association data
    :return: two dictionaries mapping target to disease and the reverse
    '''
    self.logger.debug('scan es to get all diseases and targets')
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "term": {
                    "is_direct": True,
                }
            },
            '_source': {
                'includes': [
                    "target.id", 'disease.id', 'harmonic-sum', 'evidence_count'
                ]
            },
            'size': 1000,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        timeout="10m",
    )

    target_results = dict()
    disease_results = dict()

    self.logger.debug('start getting all targets and diseases from es')
    c = 0
    for hit in res:
        c += 1
        hit = hit['_source']
        if hit['evidence_count']['total'] >= evidence_count and \
                hit['harmonic-sum']['overall'] >= threshold:
            '''store target associations'''
            if hit['target']['id'] not in target_results:
                target_results[hit['target']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            target_results[hit['target']['id']][
                hit['disease']['id']] = hit['harmonic-sum']['overall']

            '''store disease associations'''
            if hit['disease']['id'] not in disease_results:
                disease_results[hit['disease']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            disease_results[hit['disease']['id']][
                hit['target']['id']] = hit['harmonic-sum']['overall']

        if c % 10000 == 0:
            self.logger.debug('%d elements retrieved', c)

    return target_results, disease_results
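# Hypothetical usage sketch for get_disease_to_targets_vectors, assuming an
# ESQuery instance named 'esquery' built on a reachable cluster (the instance
# name is an assumption, not from the source). The two returned dicts are keyed
# by target id and disease id respectively, and each value is a sparse vector
# (SparseFloatDict) of overall association scores.
target_vectors, disease_vectors = esquery.get_disease_to_targets_vectors(
    threshold=0.1, evidence_count=3)
print('%d targets and %d diseases passed the filters'
      % (len(target_vectors), len(disease_vectors)))
# The score of a given pair can be read from either dict:
# target_vectors[target_id][disease_id] == disease_vectors[disease_id][target_id]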
def get_associations_for_disease(self, disease, fields=None, size=100, get_top_hits=True):
    source = self._get_source_from_fields(fields)

    aggs = addict.Dict()
    if get_top_hits:
        aggs.direct_associations.filter.term.is_direct = True
        aggs.direct_associations.aggs.top_direct_ass.top_hits.sort[
            'harmonic-sum.overall'].order = 'desc'
        aggs.direct_associations.aggs.top_direct_ass.top_hits._source = source
        aggs.direct_associations.aggs.top_direct_ass.top_hits.size = size

    q = addict.Dict()
    q.query.constant_score.filter.terms['disease.id'] = [disease]
    q.sort['harmonic-sum.overall'].order = 'desc'
    q._source = source
    q.aggs = aggs
    q.size = size

    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        doc_type=Const.ELASTICSEARCH_DATA_ASSOCIATION_DOC_NAME,
        body=q.to_dict())
    return AssociationSummary(res)
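# For reference, a sketch of the request body the addict calls above end up
# producing when get_top_hits is True. 'disease', 'source' and 'size' stand in
# for the method arguments; the EFO id below is purely illustrative. This only
# illustrates the addict construction, it is not an additional API.
disease, source, size = 'EFO_0000270', True, 100   # illustrative values
body_sketch = {
    'query': {'constant_score': {'filter': {'terms': {'disease.id': [disease]}}}},
    'sort': {'harmonic-sum.overall': {'order': 'desc'}},
    '_source': source,
    'aggs': {
        'direct_associations': {
            'filter': {'term': {'is_direct': True}},
            'aggs': {
                'top_direct_ass': {
                    'top_hits': {
                        'sort': {'harmonic-sum.overall': {'order': 'desc'}},
                        '_source': source,
                        'size': size,
                    }
                }
            }
        }
    },
    'size': size,
}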
def get_all_evidence_for_datatype(self, datatype, fields=None):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html
    index_name = Loader.get_versioned_index(
        Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match": {
                    "type": datatype
                }
            },
            '_source': self._get_source_from_fields(fields),
            'size': 1000,
        },
        scroll='12h',
        index=index_name,
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_evidence_for_target_simple(self, target, expected=None):
    query_body = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "target.id": target
                    }
                }
            }
        },
        '_source': {
            "includes": [
                "target.id",
                "private.efo_codes",
                "disease.id",
                "scores.association_score",
                "sourceID",
                "id",
            ]
        },
    }

    if expected is not None and expected < 10000:
        query_body['size'] = 10000
        res = self.handler.search(
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            body=query_body)
        for hit in res['hits']['hits']:
            yield hit['_source']
    else:
        res = helpers.scan(
            client=self.handler,
            query=query_body,
            scroll='1h',
            index=Loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            timeout="1h",
            request_timeout=2 * 60 * 60,
            size=1000)
        for hit in res:
            yield hit['_source']
def count_elements_in_index(self, index_name, doc_type=None, query=None):
    if query is None:
        query = {"match_all": {}}
    res = self.handler.search(
        index=Loader.get_versioned_index(index_name, True),
        doc_type=doc_type,
        body={
            "query": query,
            '_source': False,
            'size': 0,
        })
    return res['hits']['total']
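# Hypothetical usage sketch for count_elements_in_index, assuming an ESQuery
# instance named 'esquery' (an assumption): count the validated evidence
# documents currently flagged as invalid, reusing the index constant and the
# is_valid field that get_validated_evidence_strings above already relies on.
invalid_evidence = esquery.count_elements_in_index(
    Const.ELASTICSEARCH_VALIDATED_DATA_INDEX_NAME + '*',
    query={"term": {"is_valid": False}})
print('invalid evidence strings:', invalid_evidence)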
def get_reaction(self, reaction_id):
    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
        doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
        body={
            "query": {
                "ids": {
                    "values": [reaction_id]
                }
            },
            '_source': True,
            'size': 1,
        })
    for hit in res['hits']['hits']:
        return hit['_source']
def get_all_associations(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def count_evidence_for_target(self, target):
    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
        body={
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "target.id": target
                        }
                    }
                }
            },
            '_source': [],
            'size': 0
        })
    return res['hits']['total']
def get_all_ensembl_genes(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_uniprot_entries(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 100,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_UNIPROT_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield jsonpickle.decode(base64.b64decode(hit['_source']['entry']))
def get_all_reactions(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_target_ids_with_evidence_data(self):
    # TODO: use an aggregation to get those with just data
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': False,
            'size': 100,
        },
        scroll='12h',
        doc_type=Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME, True),
        timeout="30m",
    )
    for target in res:
        yield target['_id']
def get_all_diseases(self, fields=None):
    source = self._get_source_from_fields(fields)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': source,
            'size': 1000,
        },
        scroll='12h',
        doc_type=Const.ELASTICSEARCH_EFO_LABEL_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_disease_labels(self, ids):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "ids": {
                    "values": ids,
                }
            },
            '_source': 'label',
            'size': 1,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
        timeout="10m",
    )
    return dict((hit['_id'], hit['_source']['label']) for hit in res)
def get_all_target_disease_pair_from_evidence(self, only_direct=False):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': self._get_source_from_fields([
                'target.id', 'disease.id', 'private.efo_codes',
                'scores.association_score'
            ]),
            'size': 1000,
        },
        scroll='6h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
        timeout="1h",
        request_timeout=2 * 60 * 60,
    )

    yielded_pairs = set()
    for hit in res:
        if hit['_source']['scores']['association_score'] > 0:
            if only_direct:
                pair = '-'.join([
                    hit['_source']['target']['id'],
                    hit['_source']['disease']['id']
                ])
                if pair not in yielded_pairs:
                    yield pair
                    yielded_pairs.add(pair)
            else:
                for efo_id in hit['_source']['private']['efo_codes']:
                    pair = '-'.join(
                        [hit['_source']['target']['id'], efo_id])
                    if pair not in yielded_pairs:
                        yield pair
                        yielded_pairs.add(pair)
def get_all_evidence(self, fields=None):
    index_name = Loader.get_versioned_index(
        Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': self._get_source_from_fields(fields),
            'size': 1000,
        },
        scroll='12h',
        index=index_name,
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def delete_data(self, index, query, doc_type='', chunk_size=1000, altered_keys=()):
    '''
    Delete all the documents in an index matching a given query
    :param index: index to use
    :param query: query matching the elements to remove
    :param doc_type: document type; the default is to look in all the doc types
    :param chunk_size: size of the bulk actions sent to delete
    :param altered_keys: list of fields to fetch and return as being altered by the delete query
    :return: dict of keys altered by the query
    '''

    '''count available data'''
    res = self.handler.search(
        index=Loader.get_versioned_index(index, True),
        body={
            "query": query,
            '_source': False,
            'size': 0,
        },
        doc_type=doc_type,
    )
    total = res['hits']['total']

    '''if data is matching query, delete it with scan and bulk'''
    altered = dict()
    for key in altered_keys:
        altered[key] = set()
    if total:
        batch = []
        for hit in helpers.scan(
                client=self.handler,
                query={
                    "query": query,
                    '_source': self._get_source_from_fields(altered_keys),
                    'size': chunk_size,
                },
                scroll='1h',
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                timeout='1h',
        ):
            action = {
                '_op_type': 'delete',
                '_index': hit['_index'],
                '_type': hit['_type'],
                '_id': hit['_id'],
            }
            batch.append(action)
            flat_source = self.flatten(hit['_source'])
            for key in altered_keys:
                if key in flat_source:
                    altered[key].add(flat_source[key])
            if len(batch) >= chunk_size:
                self._flush_bulk(batch)
                batch = []

        '''flush any remaining actions, then flush changes to the index'''
        self._flush_bulk(batch)
        self.handler.indices.flush(
            Loader.get_versioned_index(index, True),
            wait_if_ongoing=True)

    return altered
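# Hypothetical usage sketch for delete_data, assuming an ESQuery instance named
# 'esquery' (an assumption). The datasource value below is illustrative; the
# sourceID and target.id fields are the same ones used by the evidence queries
# above. The returned dict tells the caller which targets were touched, so the
# affected associations can be recomputed afterwards.
altered = esquery.delete_data(
    index=Const.ELASTICSEARCH_DATA_INDEX_NAME,
    query={"term": {"sourceID": "uniprot"}},     # illustrative datasource filter
    altered_keys=('target.id',))
print('targets affected by the delete:', len(altered['target.id']))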
def get_objects_by_id(self,
                      ids,
                      index,
                      doc_type,
                      source=True,
                      source_exclude=[],
                      realtime=False):
    '''
    :param ids: list of identifiers for documents
    :param index: index for all the documents
    :param doc_type: doc type for all the documents
    :return: generator of documents
    '''
    if isinstance(ids, (list, tuple)):
        # note: the mget/get calls below always request realtime=True from
        # elasticsearch, regardless of the 'realtime' argument
        res = self.handler.mget(
            index=Loader.get_versioned_index(index, True),
            doc_type=doc_type,
            body=dict(ids=ids),
            _source=source,
            _source_exclude=source_exclude,
            realtime=True,
        )
        if not res:
            # retry once after a short pause if the first mget returned nothing
            time.sleep(0.1)
            res = self.handler.mget(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                body=dict(ids=ids),
                _source=source,
                _source_exclude=source_exclude,
                realtime=True,
            )
        for doc in res['docs']:
            if doc['found']:
                yield doc['_source']
            else:
                raise KeyError('object with id %s not found' % (doc['_id']))
    else:
        try:
            res = self.handler.get(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                id=ids,
                _source=source,
                _source_exclude=source_exclude,
                realtime=True,
            )
            try:
                yield res['_source']
            except Exception as e:
                self.logger.exception(
                    'cannot retrieve single object by id %s ' % ids)
                raise KeyError('object with id %s not found' % ids)
        except TransportError as te:
            if te.status_code == 404:
                raise KeyError('object with id %s not found' % ids)
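# Hypothetical usage sketch for get_objects_by_id, assuming an ESQuery instance
# named 'esquery'. The Ensembl ids and the 'approved_symbol' field are
# illustrative assumptions about document content, not taken from the source.
# The generator raises KeyError on the first id that is not found.
try:
    for gene_doc in esquery.get_objects_by_id(
            ['ENSG00000157764', 'ENSG00000141510'],        # illustrative gene ids
            Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME,
            Const.ELASTICSEARCH_GENE_NAME_DOC_NAME):
        print(gene_doc.get('approved_symbol'))             # assumed field, for illustration
except KeyError as missing:
    print('one of the requested ids was not found:', missing)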
class DataDrivenRelationProcess(object):

    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run,
                    ddr_workers_production,
                    ddr_workers_score,
                    ddr_queue_production_score,
                    ddr_queue_score_result):
        start_time = time.time()

        target_data, disease_data = self.es_query.get_disease_to_targets_vectors()
        self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
        self.logger.info('target data length: %s size in memory: %f Kb' % (len(target_data), sys.getsizeof(target_data) / 1024.))
        self.logger.info('disease data length: %s size in memory: %f Kb' % (len(disease_data), sys.getsizeof(disease_data) / 1024.))

        '''sort the lists and keep using always the same order in all the steps'''
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
        disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]
        self.logger.info('getting target labels')
        target_id_to_label = self.es_query.get_target_labels(target_keys)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        # setup elasticsearch
        self.loader = Loader(self.es, dry_run=dry_run)
        if not dry_run:
            # need to directly get the versioned index name for this function
            self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))

        # calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
                     disease_keys, target_keys, 0.19, 1024, self.loader, dry_run,
                     ddr_workers_production, ddr_workers_score,
                     ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled disease-to-disease')

        # calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
                     target_keys, disease_keys, 0.19, 1024, self.loader, dry_run,
                     ddr_workers_production, ddr_workers_score,
                     ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled target-to-target')

        # cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            # restore old pre-load settings
            # note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
def exists(self, index, doc_type, id, realtime=False):
    return self.handler.exists(
        index=Loader.get_versioned_index(index, True),
        doc_type=doc_type,
        id=id,
        realtime=realtime)
class ScoringProcess():

    def __init__(self, redis_host, redis_port, es_hosts):
        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run,
                    num_workers_produce, num_workers_score,
                    max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es,
            self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        # setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        # bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)

        # this doesn't need to be in the external config, since it is so content light
        # as to be meaningless
        max_queued_score_out = 10000

        # pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence,
            targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        # pipeline stage for scoring the evidence sets
        # includes writing to elasticsearch
        pipeline_stage = pr.each(
            score_producer,
            pipeline_stage,
            workers=num_workers_score,
            maxsize=max_queued_score_out,
            on_start=score_producer_local_init_baked,
            on_done=score_producer_local_shutdown)

        # loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        # cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            # restore old pre-load settings
            # note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    def qc(self, esquery):
        """
        Run a series of QC tests on the association elasticsearch index.
        Returns a dictionary of string test names and result objects
        """
        # number of association entries
        association_count = 0
        # Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        # put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics