def complete_cell_n_tissue(self, assay_2_compound: dict, ac_dh_assay_dict: dict):

    def _complete(entity_2_assay, entity_dict, pb_name):
        pb = get_new_progressbar(pb_name, len(entity_2_assay))
        for i, entity_id in enumerate(entity_2_assay):
            if entity_id not in entity_dict:
                entity_dict[entity_id] = {}
            entity_dict[entity_id]['related_activities'] = {
                'count': 0,
                'all_chembl_ids': set()
            }
            entity_dict[entity_id]['related_compounds'] = {
                'count': 0,
                'all_chembl_ids': set()
            }
            for assay in entity_2_assay.get(entity_id, []):
                compounds = assay_2_compound.get(assay, {})
                related_compounds = entity_dict[entity_id]['related_compounds']
                for compound_i in compounds:
                    if compound_i not in related_compounds['all_chembl_ids']:
                        related_compounds['count'] += 1
                        related_compounds['all_chembl_ids'].add(compound_i)
                if ac_dh_assay_dict.get(assay, None):
                    entity_dict[entity_id]['related_activities']['count'] += \
                        ac_dh_assay_dict[assay].get(
                            'related_activities', {}).get('count', 0)
            pb.update(i)
        pb.finish()

    # Cell lines and tissues share exactly the same completion logic.
    _complete(self.cell_2_assay, self.cell_dict, 'cell-completion')
    _complete(self.tissue_2_assay, self.tissue_dict, 'tissue-completion')
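# Illustrative sketch of the per-entity shape produced above (the ChEMBL IDs
# and counts are made-up example values): compound counts mirror the size of
# 'all_chembl_ids', while activity counts are summed from ac_dh_assay_dict.
#
#   self.cell_dict['CHEMBL3307241'] == {
#       'related_activities': {'count': 42, 'all_chembl_ids': set()},
#       'related_compounds': {
#           'count': 2,
#           'all_chembl_ids': {'CHEMBL25', 'CHEMBL521'}
#       }
#   }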
def iterate_resource(self):
    self.count_future = self.thread_pool.submit(self._get_resource_count)
    self.total_count = self.count_future.result()
    self.iterated_count = 0
    chunk_size = ResourceIterator.LIMIT * 3
    self.progress_bar = progress_bar_handler.get_new_progressbar(
        self.resource.res_name, self.total_count)
    if self.on_start:
        try:
            self.on_start(self.resource.res_name, self.total_count)
        except Exception:
            print('Exception on resource "{0}" start.\n'.format(
                self.resource.res_name), file=sys.stderr)
            print('Exception caught: \n{0}\n'.format(
                traceback.format_exc()), file=sys.stderr)
            print('ERROR: on_resource_start for {0} failed, exiting now!'.format(
                self.resource.res_name), file=sys.stderr)
            sys.stderr.flush()
            return
    stop_at = self.total_count if self.iterate_all else (
        ResourceIterator.LIMIT * 10)
    for offset_i in range(0, stop_at, chunk_size):
        if self.stop:
            return
        task = self._submit_iterate_resource_chunk_to_queue(
            offset_i, offset_i + chunk_size)
        if task:
            self.scheduled_tasks.append(task)
    self.check_progress_bar(wait_to_finish=True)
    if not self.stop and self.on_done:
        self.on_done(self.resource.res_name)
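# Example callback wiring (a sketch, not part of the original module; the
# `iterator` instance and the print bodies are assumptions, but the signatures
# match the on_start/on_done calls made in iterate_resource above):
#
#   def on_start(res_name, total_count):
#       print('Iterating {0}: {1} docs expected'.format(res_name, total_count))
#
#   def on_done(res_name):
#       print('Finished iterating {0}'.format(res_name))
#
#   iterator.on_start = on_start
#   iterator.on_done = on_done
#   iterator.iterate_resource()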
def pre_cache_svg_files():
    global CACHING_PB, CACHING_PB_COUNT
    CACHING_PB = progress_bar_handler.get_new_progressbar(
        'molecule_svg_caching',
        max_val=es_util.get_idx_count(MOLECULE.idx_name))
    CACHING_PB_COUNT = 0

    def __handle_molecule_doc(doc, *args, **kwargs):
        if not STOP_SCAN:
            # Request both renderings (default and alternate backend) per molecule.
            WS_REQUEST_POOL.submit(get_svg_by_chembl_id,
                                   doc['molecule_chembl_id'])
            WS_REQUEST_POOL.submit(get_svg_by_chembl_id,
                                   doc['molecule_chembl_id'], True)

    es_util.scan_index(
        MOLECULE.idx_name,
        on_doc=__handle_molecule_doc,
        query={
            '_source': 'molecule_chembl_id',
            'query': {
                'query_string': {
                    'query': '_exists_:molecule_structures'
                }
            }
        })
    WS_REQUEST_POOL.join()
    CACHING_PB.finish()
    print('RDKIT SVG data has been cached for {0} CHEMBL IDS'.format(
        len(RDKIT_CACHE)), file=sys.stderr)
    print('INDIGO SVG data has been cached for {0} CHEMBL IDS'.format(
        len(INDIGO_CACHE)), file=sys.stderr)
    indigo_fails = 0
    rdkit_fails = 0
    both_fails = 0
    for key, value in SVG_FAILURES.items():
        if len(value) > 1:
            SVG_FAILURES[key] = 'BOTH'
            both_fails += 1
        else:
            if value[0] == 'INDIGO':
                indigo_fails += 1
            else:
                rdkit_fails += 1
            SVG_FAILURES[key] = value[0]
    failures_file_path = os.path.join(BASE_CACHE_PATH, 'svg_failures.json')
    try:
        with open(failures_file_path, 'w', encoding='utf-8') as failures_file:
            json.dump(SVG_FAILURES, failures_file)
    except Exception:
        traceback.print_exc()
        print('UNABLE TO WRITE FILE AT {0}'.format(failures_file_path),
              file=sys.stderr)
    print('INDIGO FAIL COUNT: {0}'.format(indigo_fails), file=sys.stderr)
    print('RDKIT FAIL COUNT: {0}'.format(rdkit_fails), file=sys.stderr)
    print('BOTH FAIL COUNT: {0}'.format(both_fails), file=sys.stderr)
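# Illustrative sketch of the svg_failures.json payload written above
# (hypothetical ChEMBL IDs): each molecule maps either to the single backend
# that failed to render it, or to 'BOTH' when RDKit and Indigo both failed.
#
#   {"CHEMBL000001": "INDIGO", "CHEMBL000002": "RDKIT", "CHEMBL000003": "BOTH"}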
def scan_index(self, es_index, on_doc=None, query=None):
    if self.es_conn is None:
        print("FATAL ERROR: there is no Elasticsearch connection defined.",
              file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)
    if query is None:
        query = {}
    query['track_total_hits'] = True
    search_res = self.es_conn.search(index=es_index, body=query)
    total_docs = search_res['hits']['total']['value']
    # Update the progress bar roughly every 0.1% of the docs, at most every
    # 1000 docs and at least every doc.
    update_every = max(1, min(math.ceil(total_docs * 0.001), 1000))
    scanner = helpers.scan(self.es_conn, index=es_index, scroll='10m',
                           query=query, size=1000)
    count = 0
    p_bar = progress_bar_handler.get_new_progressbar(
        '{0}_es-index-scan'.format(es_index), total_docs)
    for doc_n in scanner:
        if callable(on_doc):
            should_stop = on_doc(doc_n['_source'], doc_n['_id'], total_docs,
                                 count, count == 0, count == total_docs - 1)
            if should_stop or self.stop_scan:
                return
        count += 1
        if count % update_every == 0:
            p_bar.update(count)
    p_bar.finish()
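# Example usage (a sketch; `es_manager`, the index name and the callback body
# are assumptions — the callback signature matches the on_doc call above):
#
#   def print_target(doc, doc_id, total_docs, index, first, last):
#       print(doc_id, doc.get('pref_name'))
#       return False  # returning a truthy value stops the scan early
#
#   es_manager.scan_index('chembl_target', on_doc=print_target,
#                         query={'query': {'match_all': {}}})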
def complete_compound(self):
    pb = get_new_progressbar('compound-completion', len(self.compound_2_assay))
    for i, molecule_chembl_id in enumerate(self.compound_2_assay):
        if molecule_chembl_id not in self.compound_dict:
            self.compound_dict[molecule_chembl_id] = {}
        compound_data = self.compound_dict[molecule_chembl_id]
        compound_data['related_cell_lines'] = {
            'count': 0,
            'all_chembl_ids': set()
        }
        compound_data['related_tissues'] = {
            'count': 0,
            'all_chembl_ids': set()
        }
        for assay in self.compound_2_assay.get(molecule_chembl_id, []):
            cell_n_tissue = self.assay_dh.assay_2_cell_n_tissue.get(assay, {})
            cell_id = cell_n_tissue.get('cell_chembl_id', None)
            tissue_id = cell_n_tissue.get('tissue_chembl_id', None)
            if cell_id and \
                    cell_id not in compound_data['related_cell_lines']['all_chembl_ids']:
                compound_data['related_cell_lines']['count'] += 1
                compound_data['related_cell_lines']['all_chembl_ids'].add(cell_id)
            if tissue_id and \
                    tissue_id not in compound_data['related_tissues']['all_chembl_ids']:
                compound_data['related_tissues']['count'] += 1
                compound_data['related_tissues']['all_chembl_ids'].add(tissue_id)
        pb.update(i)
    pb.finish()
def get_all_dn_dicts(self):
    total_dn_dict = SummableDict()
    pb = get_new_progressbar('built-dn-hierarchy-dict', len(self.children))
    for current, node in enumerate(self.children.values(), start=1):
        dn_dict_i, shared_family_data, node_data = \
            node.get_denormalization_dict()
        for chembl_id, dn_data in dn_dict_i.items():
            total_dn_dict[chembl_id] = dn_data
        pb.update(current)
    pb.finish()
    return total_dn_dict
def run(self):
    signal_handler.add_termination_handler(self.stop_submitter)
    self.submission_pb = progress_bar_handler.get_new_progressbar(
        'ES-bulk-submitter', 1)
    self.submission_pool.start()
    cur_low_counts = 0
    while not self.stop_submission:
        max_count = self.get_max_queue_count()
        # Submit when a queue holds enough docs for several bulk requests, or
        # when a small backlog has already been waiting for more than 10 cycles.
        if max_count >= self.max_docs_per_request * 5 or (
                max_count > 0 and cur_low_counts > 10):
            cur_low_counts = 0
            self.check_and_submit_queues()
        else:
            if max_count > 0:
                cur_low_counts += 1
            time.sleep(1)
    sys.stderr.flush()
def load_all_chembl_unichem_data():
    unichem_ds = load_unichem_ds_desc()
    unichem_data_by_chembl_id = {}
    pb = progress_bar_handler.get_new_progressbar('reading-unichem',
                                                  len(unichem_ds) - 1)
    for i, src_id_i in enumerate(sorted(unichem_ds.keys())):
        if STOP_LOAD:
            return
        # Source 1 is ChEMBL itself; there is nothing to cross-reference.
        if src_id_i == 1 or src_id_i == '1':
            continue
        req_i = requests.get(url=UNICHEM_FTP_URL.format(src_id_i),
                             stream=True, verify=False)
        decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)
        last_row_in_last_chunk = None
        for chunk in req_i.iter_content(chunk_size=1024, decode_unicode=False):
            if STOP_LOAD:
                return
            rows_in_chunk = decoder.decompress(chunk).decode("utf-8")
            # A row can be split across two chunks: glue the leftover from the
            # previous chunk to the front, and keep an incomplete trailing row
            # for the next one.
            if last_row_in_last_chunk:
                rows_in_chunk = last_row_in_last_chunk + rows_in_chunk
            save_last = not rows_in_chunk.endswith('\n')
            records = rows_in_chunk.split('\n')
            if save_last:
                last_row_in_last_chunk = records[-1]
                records = records[:-1]
            else:
                last_row_in_last_chunk = None
            collect_unichem_records(src_id_i, records,
                                    unichem_data_by_chembl_id, unichem_ds)
        # Flush whatever the decompressor still buffers and parse the tail.
        last_rows = decoder.flush().decode("utf-8")
        if last_row_in_last_chunk:
            last_rows = last_row_in_last_chunk + last_rows
        records = last_rows.split('\n')
        collect_unichem_records(src_id_i, records, unichem_data_by_chembl_id,
                                unichem_ds)
        pb.update(i)
    pb.finish()
    return unichem_data_by_chembl_id
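# Boundary-handling sketch with illustrative chunk contents: a row that is
# split across two streamed chunks is stitched back together before parsing.
#
#   chunk 1 decompresses to 'row_a\nrow_'   -> parse ['row_a'], keep 'row_'
#   chunk 2 decompresses to 'b\nrow_c\n'    -> parse ['row_b', 'row_c']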
def save_denormalization_dict(
        cls, resource_desc: resources_description.ResourceDescription,
        dn_dict: dict, get_update_script_and_size, new_mappings=None,
        do_index=False):
    if new_mappings:
        es_util.update_doc_type_mappings(resource_desc.idx_name, new_mappings)
    progressbar_name = '{0}-dn-{1}'.format(cls.RESOURCE.res_name,
                                           resource_desc.res_name)
    doc_ids = list(dn_dict.keys())
    p_bar = progress_bar_handler.get_new_progressbar(progressbar_name,
                                                     len(dn_dict))
    entity_dn_count = 0
    for doc_id_i in doc_ids:
        if DenormalizationHandler.STOP:
            return
        update_doc, update_size = get_update_script_and_size(
            doc_id_i, dn_dict[doc_id_i])
        # Index the document instead of updating it when requested.
        if do_index:
            es_util.index_doc_bulk(resource_desc.idx_name, doc_id_i, update_doc)
        else:
            es_util.update_doc_bulk(resource_desc.idx_name, doc_id_i,
                                    doc=update_doc)
        entity_dn_count += 1
        p_bar.update(entity_dn_count)
    es_util.bulk_submitter.finish_current_queues()
    p_bar.finish()
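# Example callback (a sketch; the body is a hypothetical stand-in, not the
# repository's DenormalizationHandler.default_update_script_and_size — only
# the (doc_id, dn_data) -> (update_doc, size) contract follows the call above):
#
#   def my_update_script_and_size(doc_id, dn_data):
#       update_doc = {'denormalized_data': dn_data}
#       return update_doc, len(str(update_doc))
#
#   handler.save_denormalization_dict(resource_desc, dn_dict,
#                                     my_update_script_and_size)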
def do_complete_data(self, doc: dict, total_docs: int, index: int,
                     first: bool, last: bool):
    if first:
        self.complete_data_pb = progress_bar_handler.get_new_progressbar(
            '{0}-data-completion'.format(self.RESOURCE.idx_name), total_docs)
        mappings = self.get_custom_mappings_for_complete_data()
        if len(mappings.keys()) > 0:
            self.update_mappings(mappings)
    update_doc = self.get_doc_for_complete_data(doc)
    if update_doc is not None:
        es_util.update_doc_bulk(self.RESOURCE.idx_name,
                                self.RESOURCE.get_doc_id(doc), doc=update_doc)
        es_util.bulk_submitter.set_complete_futures(True)
    if last:
        es_util.bulk_submitter.finish_current_queues()
        es_util.bulk_submitter.set_complete_futures(False)
        self.complete_data_pb.finish()
    else:
        self.complete_data_pb.update(index)
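# Example wiring (a sketch; `handler` and the use of scan_index as the driver
# are assumptions — scan_index also passes the doc id, which this method does
# not take, hence the adapter):
#
#   es_util.scan_index(
#       handler.RESOURCE.idx_name,
#       on_doc=lambda doc, doc_id, total, idx, first, last:
#           handler.do_complete_data(doc, total, idx, first, last))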
def save_denormalization_for_new_index(self):
    es_util.delete_idx(self.generated_resource.idx_name)
    es_util.create_idx(
        self.generated_resource.idx_name, 3, 1,
        analysis=DefaultMappings.COMMON_ANALYSIS,
        mappings=DrugIndicationDenormalizationHandler.get_new_index_mappings())
    dn_dict = {}
    print('{0} GROUPED RECORDS WERE FOUND'.format(
        len(self.drug_inds_by_grouping_id)), file=sys.stderr)
    p_bar = progress_bar_handler.get_new_progressbar(
        'drug_inds_by_parent-dn-generation',
        len(self.drug_inds_by_grouping_id))
    i = 0
    for group_drug_inds in self.drug_inds_by_grouping_id.values():
        base_drug_ind = group_drug_inds[0]
        efo_data = {}
        indication_refs = []
        max_phase_for_ind = 0
        for drug_ind_i in group_drug_inds:
            max_phase_for_ind = max(max_phase_for_ind,
                                    drug_ind_i.get('max_phase_for_ind', 0))
            efo_id_i = drug_ind_i.get('efo_id', None)
            if efo_id_i is not None:
                efo_data[efo_id_i] = drug_ind_i.get('efo_term', None)
            indication_refs += drug_ind_i.get('indication_refs', [])
        parent_chembl_id, mesh_id = self.get_drug_ind_grouping_id_parts(
            base_drug_ind)
        drug_ind_data = SummableDict(**DRUG_INDICATION.get_doc_by_id_from_es(
            base_drug_ind['drugind_id']))
        # The per-row efo_id/efo_term pair is replaced by the merged 'efo' list.
        drug_ind_data -= ['efo_term', 'efo_id']
        drug_ind_data['efo'] = [{'id': efo_id, 'term': term}
                                for efo_id, term in efo_data.items()]
        drug_ind_data['max_phase_for_ind'] = max_phase_for_ind
        drug_ind_data['indication_refs'] = indication_refs
        new_indication_doc = {
            'parent_molecule': MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
            'drug_indication': drug_ind_data
        }
        doc_id = self.generated_resource.get_doc_id(new_indication_doc)
        dn_dict[doc_id] = new_indication_doc
        i += 1
        p_bar.update(i)
    p_bar.finish()
    self.save_denormalization_dict(
        self.generated_resource, dn_dict,
        DenormalizationHandler.default_update_script_and_size, do_index=True)
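# Illustrative shape of the merged indication (hypothetical values): the
# per-row efo_id/efo_term pairs of a group are folded into one 'efo' list and
# the highest max_phase_for_ind wins.
#
#   drug_ind_data['efo'] == [{'id': 'EFO:0000305', 'term': 'breast carcinoma'}]
#   drug_ind_data['max_phase_for_ind'] == 4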
base_reindex_data = {
    'source': {
        'index': 'unichem_bkp_simple',
        'size': 1000,
        'slice': {
            'id': 2,
            'max': 1000
        }
    },
    'dest': {
        'index': 'unichem_test'
    }
}
num_slices = 1000
initial_time = time.time()
sleep_time = 10
pb_scheduled = get_new_progressbar('scheduled_slices', max_val=num_slices)
pb_reindexed = get_new_progressbar('reindex_slices', max_val=num_slices)
scheduled_slices = 0
completed_slices = 0
slice_reindex_timeout = sleep_time * 1000


def reindex_slice(slice_index):
    global completed_slices, scheduled_slices, task_ids, sleep_time, \
        pb_scheduled, pb_reindexed, slice_reindex_timeout, base_url, \
        base_request_path, base_reindex_data, sync_lock, es_auth, stop_reindex
    if stop_reindex:
        return
    task_id = None
    sync_lock.acquire()
    try:
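# Sliced-reindex sketch (assumed request layout; `base_url` and `es_auth` come
# from the surrounding script): each worker would POST the body above to the
# Elasticsearch _reindex endpoint with its own slice id, collecting the task
# id that wait_for_completion=false returns.
#
#   resp = requests.post(
#       base_url + '/_reindex?wait_for_completion=false',
#       json={**base_reindex_data,
#             'source': {**base_reindex_data['source'],
#                        'slice': {'id': slice_index, 'max': num_slices}}},
#       auth=es_auth)
#   task_id = resp.json()['task']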
def save_denormalization(self):
    if self.compound_families_dir:
        es_util.delete_idx(self.generated_resource.idx_name)
        es_util.create_idx(
            self.generated_resource.idx_name, 3, 1,
            analysis=DefaultMappings.COMMON_ANALYSIS,
            mappings=MechanismDenormalizationHandler.get_new_index_mappings())
    dn_dict = {}
    print('{0} GROUPED RECORDS WERE FOUND'.format(
        len(self.mechanisms_by_grouping_id)), file=sys.stderr)
    p_bar = progress_bar_handler.get_new_progressbar(
        'mechanism_by_parent_target-dn-generation',
        len(self.mechanisms_by_grouping_id))
    i = 0
    for group_mechanisms in self.mechanisms_by_grouping_id.values():
        base_mechanism = group_mechanisms[0]
        action_type = base_mechanism.get('action_type', None)
        bs_id = base_mechanism.get('site_id', None)
        mechanism_refs = []
        mechanism_comments_set = set()
        selectivity_comments_set = set()
        binding_site_comments_set = set()
        max_phase = 0
        for mechanism_i in group_mechanisms:
            if action_type != mechanism_i.get('action_type', None):
                print('ACTION TYPE SHOULD BE {0} FOR MECHANISM {1}!'.format(
                    action_type, mechanism_i['mec_id']), file=sys.stderr)
                print(pprint.pformat(group_mechanisms), file=sys.stderr)
            if bs_id != mechanism_i.get('site_id', None):
                print('BINDING SITE SHOULD BE {0} FOR MECHANISM {1}!'.format(
                    bs_id, mechanism_i['mec_id']), file=sys.stderr)
                print(pprint.pformat(group_mechanisms), file=sys.stderr)
                if bs_id is None:
                    bs_id = mechanism_i.get('site_id', None)
            mechanism_i_comment = mechanism_i.get('mechanism_comment', None)
            if mechanism_i_comment is not None:
                mechanism_comments_set.add(mechanism_i_comment)
            mechanism_i_selectivity_comment = mechanism_i.get(
                'selectivity_comment', None)
            if mechanism_i_selectivity_comment is not None:
                selectivity_comments_set.add(mechanism_i_selectivity_comment)
            mechanism_i_binding_site_comment = mechanism_i.get(
                'binding_site_comment', None)
            if mechanism_i_binding_site_comment is not None:
                binding_site_comments_set.add(mechanism_i_binding_site_comment)
            mechanism_refs += mechanism_i.get('mechanism_refs', [])
            max_phase = max(max_phase, mechanism_i.get('max_phase', 0))
        parent_chembl_id, target_chembl_id, mechanism_of_action = \
            self.get_mechanism_grouping_id_parts(base_mechanism)
        new_mechanism_doc = {
            'parent_molecule': MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
            'target': TARGET.get_doc_by_id_from_es(target_chembl_id),
            'binding_site': BINDING_SITE.get_doc_by_id_from_es(bs_id),
            'mechanism_of_action': base_mechanism
        }
        new_mechanism_doc['mechanism_of_action']['mechanism_comment'] = \
            list(mechanism_comments_set)
        new_mechanism_doc['mechanism_of_action']['selectivity_comment'] = \
            list(selectivity_comments_set)
        new_mechanism_doc['mechanism_of_action']['binding_site_comment'] = \
            list(binding_site_comments_set)
        new_mechanism_doc['mechanism_of_action']['max_phase'] = max_phase
        doc_id = self.generated_resource.get_doc_id(new_mechanism_doc)
        if len(mechanism_comments_set) > 1:
            print('MULTIPLE MECHANISM COMMENTS FOUND FOR {0}'.format(doc_id),
                  file=sys.stderr)
        if len(selectivity_comments_set) > 1:
            print('MULTIPLE SELECTIVITY COMMENTS FOUND FOR {0}'.format(doc_id),
                  file=sys.stderr)
        if len(binding_site_comments_set) > 1:
            print('MULTIPLE BINDING SITE COMMENTS FOUND FOR {0}'.format(doc_id),
                  file=sys.stderr)
        dn_dict[doc_id] = new_mechanism_doc
        i += 1
        p_bar.update(i)
    p_bar.finish()
    self.save_denormalization_dict(
        self.generated_resource, dn_dict,
        DenormalizationHandler.default_update_script_and_size, do_index=True)