def create_res_idx(res_name, num_docs):
    """Create (or skip creating) the Elasticsearch index for a resource.

    Records the expected document count for the resource in the global
    MIG_TOTAL bookkeeping dict, resolves the per-resource index description
    (shards / replicas / analysis / mappings) and creates the index.  If the
    index already has documents and DELETE_AND_CREATE_INDEXES is falsy, the
    function logs a skip message and returns without touching the index.

    :param res_name: logical resource name; used to look up the index
        description in es_resources_desc and to derive the index name.
    :param num_docs: number of documents expected in the index; stored in
        MIG_TOTAL and used to derive a default shard count.
    """
    global MIG_TOTAL
    MIG_TOTAL[res_name] = num_docs

    idx_desc = es_resources_desc.resources_2_es_mapping.get(res_name, None)
    idx_name = get_index_name(res_name)

    idx_exists = es_util.get_idx_count(idx_name) > 0
    if idx_exists and not DELETE_AND_CREATE_INDEXES:
        MIG_LOG.info('INDEX {0} EXISTS. SKIPPING DELETION.'.format(idx_name))
        return

    # FIX: the original passed num_shards_by_num_rows(num_docs) as the
    # getattr default, which evaluated the heuristic unconditionally even
    # when the description defines an explicit shard count.  Compute it
    # lazily instead (idx_desc may also be None, in which case every
    # getattr below falls back to its default).
    n_shards = getattr(idx_desc, 'shards', None)
    if n_shards is None:
        n_shards = num_shards_by_num_rows(num_docs)
    n_replicas = getattr(idx_desc, 'replicas', 0)
    res_analysis = getattr(idx_desc, 'analysis', None)
    res_mappings = getattr(idx_desc, 'mappings', None)

    es_util.create_idx(idx_name, shards=n_shards, replicas=n_replicas,
                       analysis=res_analysis, mappings=res_mappings,
                       logger=MIG_LOG)
    MIG_LOG.info(
        "ELASTIC INDEX CREATED: RESOURCE:{0}->INDEX:{1} SHARDS:{2} REPLICAS:{3}"
        .format(res_name, idx_name, n_shards, n_replicas))
def run_analysers():
    """Exercise the custom analyzers against a throw-away test index.

    Creates a temporary index using the module-level custom mappings and
    analysis settings, runs a series of sample strings through several
    analyzers, indexes a few test documents, and finally executes a
    coffee-script query against them.  The test index is always deleted
    on the way out, even if any step fails.
    """
    global test_idx_name, test_doc_name, test_str_std_name, custom_mappings, custom_analysis
    try:
        es_util.create_idx(test_idx_name, 1, 1,
                           mappings=custom_mappings,
                           analysis=custom_analysis,
                           logger=logging.getLogger())
        # Give Elasticsearch a moment to make the new index available.
        time.sleep(1)

        # (sample text, analyzer name) pairs to run through the analyzer.
        analysis_samples = (
            ('CHEMBL_25', 'lowercase_keyword'),
            ('CHEMBL_25', 'lowercase_alphanumeric_keyword'),
            ('CHEMBL_25', 'lowercase_ngrams'),
            ('ASPIRIN', 'lowercase_ngrams'),
            ('SILDENAFIL CITRATE', 'lowercase_ngrams'),
            ('Acetylsalicylic Acid', 'standard'),
            ('Acetyl salicylic Acid', 'standard'),
            ('6-Octylsalicylic Acid', 'lowercase_ngrams'),
        )
        for sample_text, analyzer_name in analysis_samples:
            analyze(sample_text, analyzer_name)

        for doc_text in ('Acetylsalicylic Acid',
                         '6-Octylsalicylic Acid',
                         'ACETYLSALICYLIC-LYSINE'):
            create_test_doc(doc_text)

        # Wait for the indexed documents to become searchable.
        time.sleep(3)
        run_coffee_query(
            "./es_query_test.coffee", test_idx_name,
            replacements_dict={"<SEARCH_STRING>": "Acetylsalicylic Acid"},
            show_only=[test_str_std_name])
    except Exception:
        # Best-effort diagnostics only; the index cleanup below must run.
        traceback.print_exc()
    finally:
        es_util.delete_idx(test_idx_name)
def test_analyzer(self):
    """Smoke-test the common analysis config on a standard text field.

    Creates a throw-away index whose single field uses the TEXT_STD
    mapping together with COMMON_ANALYSIS, runs one analyze call, then
    deletes the index again.
    """
    test_idx_name = 'chembl_test_long_id_field_idx'
    custom_analysis = es_util.DefaultMappings.COMMON_ANALYSIS
    field_name = 'test_long_id_field'
    custom_mappings = {
        'properties': {
            field_name: es_util.DefaultMappings.TEXT_STD,
        }
    }
    es_util.create_idx(test_idx_name, 1, 0,
                       mappings=custom_mappings,
                       analysis=custom_analysis,
                       logger=logging.getLogger())
    # NOTE(review): test_text (a million-character value) is built but never
    # used, and analyze() is called with only the index name plus a trailing
    # comma — no analyzer/text argument.  Presumably the intent was to run
    # ''.join(test_text) through an analyzer on this index; confirm against
    # analyze()'s signature before relying on this test.
    test_text = ['0'] * (10**6)
    analyze(test_idx_name, )
    es_util.delete_idx(test_idx_name)
def save_denormalization_for_new_index(self):
    """Build and persist the denormalized drug-indication documents.

    Re-creates the generated resource's index from scratch, then merges
    every group of raw drug-indication records (grouped by parent molecule
    and MESH id) into one document carrying the aggregated EFO id/term
    pairs, the union of indication references and the highest
    max_phase_for_ind seen, and finally bulk-saves the result.
    """
    es_util.delete_idx(self.generated_resource.idx_name)
    es_util.create_idx(
        self.generated_resource.idx_name, 3, 1,
        analysis=DefaultMappings.COMMON_ANALYSIS,
        mappings=DrugIndicationDenormalizationHandler.get_new_index_mappings())

    dn_dict = {}
    print('{0} GROUPED RECORDS WERE FOUND'.format(
        len(self.drug_inds_by_grouping_id)), file=sys.stderr)
    p_bar = progress_bar_handler.get_new_progressbar(
        'drug_inds_by_parent-dn-generation',
        len(self.drug_inds_by_grouping_id))

    for group_num, grouped_inds in enumerate(
            self.drug_inds_by_grouping_id.values(), start=1):
        first_ind = grouped_inds[0]

        # Aggregate across the group: unique EFO id -> term, union of
        # all indication references, and the maximum phase observed.
        efo_by_id = {}
        all_refs = []
        top_phase = 0
        for ind_record in grouped_inds:
            top_phase = max(top_phase,
                            ind_record.get('max_phase_for_ind', 0))
            one_efo_id = ind_record.get('efo_id', None)
            if one_efo_id is not None:
                efo_by_id[one_efo_id] = ind_record.get('efo_term', None)
            all_refs += ind_record.get('indication_refs', [])

        parent_chembl_id, mesh_id = self.get_drug_ind_grouping_id_parts(
            first_ind)

        merged_ind = SummableDict(
            **DRUG_INDICATION.get_doc_by_id_from_es(first_ind['drugind_id']))
        # The scalar efo_* fields are superseded by the aggregated list.
        merged_ind -= ['efo_term', 'efo_id']
        merged_ind['efo'] = [{'id': one_id, 'term': one_term}
                             for one_id, one_term in efo_by_id.items()]
        merged_ind['max_phase_for_ind'] = top_phase
        merged_ind['indication_refs'] = all_refs

        denorm_doc = {
            'parent_molecule':
                MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
            'drug_indication': merged_ind
        }
        dn_dict[self.generated_resource.get_doc_id(denorm_doc)] = denorm_doc
        p_bar.update(group_num)

    p_bar.finish()
    self.save_denormalization_dict(
        self.generated_resource, dn_dict,
        DenormalizationHandler.default_update_script_and_size, do_index=True)
def save_denormalization(self):
    """Build and persist the denormalized mechanism-of-action documents.

    Re-creates the generated resource's index, merges every group of raw
    mechanism records (grouped by parent molecule / target / mechanism of
    action) into one document joined with its parent molecule, target and
    binding site, and bulk-saves the result.
    """
    # NOTE(review): guard scope reconstructed from mangled formatting —
    # confirm whether the whole rebuild is intentionally skipped when
    # compound_families_dir is unset, or only the delete/create pair.
    if self.compound_families_dir:
        es_util.delete_idx(self.generated_resource.idx_name)
        es_util.create_idx(self.generated_resource.idx_name, 3, 1,
                           analysis=DefaultMappings.COMMON_ANALYSIS,
                           mappings=MechanismDenormalizationHandler.
                           get_new_index_mappings())
        dn_dict = {}
        print('{0} GROUPED RECORDS WERE FOUND'.format(
            len(self.mechanisms_by_grouping_id)), file=sys.stderr)
        p_bar = progress_bar_handler.get_new_progressbar(
            'mechanism_by_parent_target-dn-generation',
            len(self.mechanisms_by_grouping_id))
        i = 0
        for group_mechanisms in self.mechanisms_by_grouping_id.values():
            # The first record of the group seeds the merged document; its
            # action type and binding site are expected to hold group-wide.
            base_mechanism = group_mechanisms[0]
            action_type = base_mechanism.get('action_type', None)
            bs_id = base_mechanism.get('site_id', None)
            # NOTE(review): mechanism_refs is accumulated below but never
            # written into new_mechanism_doc (unlike indication_refs in the
            # drug-indication handler) — possibly a dropped field.
            mechanism_refs = []
            mechanism_comments_set = set()
            selectivity_comments_set = set()
            binding_site_comments_set = set()
            max_phase = 0
            for mechanism_i in group_mechanisms:
                # Warn (but continue) on records that disagree with the
                # group's action type or binding site.
                if action_type != mechanism_i.get('action_type', None):
                    print('ACTION TYPE SHOULD BE {0} FOR MECHANISM {1}!'.
                          format(action_type, mechanism_i['mec_id']),
                          file=sys.stderr)
                    print(pprint.pformat(group_mechanisms), file=sys.stderr)
                if bs_id != mechanism_i.get('site_id', None):
                    print('BINDING SITE SHOULD BE {0} FOR MECHANISM {1}!'.
                          format(bs_id, mechanism_i['mec_id']),
                          file=sys.stderr)
                    print(pprint.pformat(group_mechanisms), file=sys.stderr)
                    # Adopt the first non-null binding site seen.
                    if bs_id is None:
                        bs_id = mechanism_i.get('site_id', None)
                mechanism_i_comment = mechanism_i.get(
                    'mechanism_comment', None)
                if mechanism_i_comment is not None:
                    mechanism_comments_set.add(mechanism_i_comment)
                mechanism_i_selectivity_comment = mechanism_i.get(
                    'selectivity_comment', None)
                if mechanism_i_selectivity_comment is not None:
                    selectivity_comments_set.add(
                        mechanism_i_selectivity_comment)
                mechanism_i_binding_site_comment = mechanism_i.get(
                    'binding_site_comment', None)
                if mechanism_i_binding_site_comment is not None:
                    binding_site_comments_set.add(
                        mechanism_i_binding_site_comment)
                mechanism_refs += mechanism_i.get('mechanism_refs', [])
                max_phase = max(max_phase, mechanism_i.get('max_phase', 0))
            parent_chembl_id, target_chembl_id, mechanism_of_action = \
                self.get_mechanism_grouping_id_parts(base_mechanism)
            new_mechanism_doc = {
                'parent_molecule':
                MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
                'target': TARGET.get_doc_by_id_from_es(target_chembl_id),
                'binding_site': BINDING_SITE.get_doc_by_id_from_es(bs_id),
                'mechanism_of_action': base_mechanism
            }
            # These assignments mutate base_mechanism in place (it is the
            # same object stored under 'mechanism_of_action').
            new_mechanism_doc['mechanism_of_action'][
                'mechanism_comment'] = list(mechanism_comments_set)
            new_mechanism_doc['mechanism_of_action'][
                'selectivity_comment'] = list(selectivity_comments_set)
            new_mechanism_doc['mechanism_of_action'][
                'binding_site_comment'] = list(binding_site_comments_set)
            new_mechanism_doc['mechanism_of_action'][
                'max_phase'] = max_phase
            doc_id = self.generated_resource.get_doc_id(new_mechanism_doc)
            # Flag groups that carried more than one distinct comment;
            # all of them are kept in the lists above regardless.
            if len(mechanism_comments_set) > 1:
                print('MULTIPLE MECHANISM COMMENTS FOUND FOR {0}'.format(
                    doc_id), file=sys.stderr)
            if len(selectivity_comments_set) > 1:
                print('MULTIPLE SELECTIVITY COMMENTS FOUND FOR {0}'.format(
                    doc_id), file=sys.stderr)
            if len(binding_site_comments_set) > 1:
                print(
                    'MULTIPLE BINDING SITE COMMENTS FOUND FOR {0}'.format(
                        doc_id), file=sys.stderr)
            dn_dict[doc_id] = new_mechanism_doc
            i += 1
            p_bar.update(i)
        p_bar.finish()
        self.save_denormalization_dict(
            self.generated_resource, dn_dict,
            DenormalizationHandler.default_update_script_and_size,
            do_index=True)