예제 #1
0
def create_res_idx(res_name, num_docs):
    """Record the document total for a resource and create its index.

    Stores ``num_docs`` in ``MIG_TOTAL`` under ``res_name``. When the index
    already reports documents and ``DELETE_AND_CREATE_INDEXES`` is falsy,
    nothing is created and the function returns early. Otherwise the index
    is created through ``es_util.create_idx`` using the settings found on
    the resource's mapping description, with fallbacks for missing values.

    :param res_name: resource name, used to look up the mapping description
        and the index name.
    :param num_docs: number of documents expected for the resource; also
        used to compute the fallback shard count.
    """
    global MIG_TOTAL
    MIG_TOTAL[res_name] = num_docs

    description = es_resources_desc.resources_2_es_mapping.get(res_name)
    index_name = get_index_name(res_name)

    # The count is queried unconditionally, before the flag is consulted.
    already_populated = es_util.get_idx_count(index_name) > 0
    if already_populated and not DELETE_AND_CREATE_INDEXES:
        MIG_LOG.info('INDEX {0} EXISTS. SKIPPING DELETION.'.format(index_name))
        return

    # ``description`` may be None; getattr() then yields every fallback.
    # The fallback shard count is computed whether or not it ends up used.
    fallback_shards = num_shards_by_num_rows(num_docs)
    shard_count = getattr(description, 'shards', fallback_shards)
    replica_count = getattr(description, 'replicas', 0)

    es_util.create_idx(
        index_name,
        shards=shard_count,
        replicas=replica_count,
        analysis=getattr(description, 'analysis', None),
        mappings=getattr(description, 'mappings', None),
        logger=MIG_LOG,
    )

    created_msg = (
        "ELASTIC INDEX CREATED: RESOURCE:{0}->INDEX:{1} SHARDS:{2} REPLICAS:{3}"
    )
    MIG_LOG.info(
        created_msg.format(res_name, index_name, shard_count, replica_count))
예제 #2
0
def run_analysers():
    """Run a fixed set of analyser checks against a temporary test index.

    Creates ``test_idx_name`` with ``custom_mappings`` / ``custom_analysis``,
    calls ``analyze`` for a fixed list of argument pairs, creates three test
    documents, and runs the coffee query in ``./es_query_test.coffee``.

    Any exception raised along the way is printed with its traceback and
    swallowed; the test index is deleted in all cases.
    """
    # Argument pairs passed to analyze(), in the order they must run.
    # NOTE(review): presumably (text, analyzer name) — confirm against analyze().
    analyze_calls = (
        ('CHEMBL_25', 'lowercase_keyword'),
        ('CHEMBL_25', 'lowercase_alphanumeric_keyword'),
        ('CHEMBL_25', 'lowercase_ngrams'),
        ('ASPIRIN', 'lowercase_ngrams'),
        ('SILDENAFIL CITRATE', 'lowercase_ngrams'),
        ('Acetylsalicylic Acid', 'standard'),
        ('Acetyl salicylic Acid', 'standard'),
        ('6-Octylsalicylic Acid', 'lowercase_ngrams'),
    )
    test_docs = (
        'Acetylsalicylic Acid',
        '6-Octylsalicylic Acid',
        'ACETYLSALICYLIC-LYSINE',
    )

    # The module-level names below are only read, so no ``global``
    # declaration is required.
    try:
        es_util.create_idx(test_idx_name,
                           1,
                           1,
                           mappings=custom_mappings,
                           analysis=custom_analysis,
                           logger=logging.getLogger())
        # NOTE(review): presumably waits for the new index to become usable.
        time.sleep(1)

        for first_arg, second_arg in analyze_calls:
            analyze(first_arg, second_arg)

        for doc_text in test_docs:
            create_test_doc(doc_text)
        # NOTE(review): presumably waits for the documents to become
        # searchable before querying — confirm.
        time.sleep(3)

        run_coffee_query(
            "./es_query_test.coffee",
            test_idx_name,
            replacements_dict={"<SEARCH_STRING>": "Acetylsalicylic Acid"},
            show_only=[test_str_std_name])

    except Exception:
        # Best-effort diagnostic run: report the failure, do not propagate it.
        traceback.print_exc()
    finally:
        es_util.delete_idx(test_idx_name)
    def test_analyzer(self):
        """Create a single-field test index, call ``analyze`` on it, delete it.

        The index ``chembl_test_long_id_field_idx`` is created with one
        property, ``test_long_id_field``, mapped as
        ``es_util.DefaultMappings.TEXT_STD``. The index is deleted even when
        ``analyze`` raises, so a failing run does not leave it behind.
        """
        test_idx_name = 'chembl_test_long_id_field_idx'
        custom_analysis = es_util.DefaultMappings.COMMON_ANALYSIS
        field_name = 'test_long_id_field'
        custom_mappings = {
            'properties': {
                field_name: es_util.DefaultMappings.TEXT_STD,
            }
        }

        es_util.create_idx(test_idx_name,
                           1,
                           0,
                           mappings=custom_mappings,
                           analysis=custom_analysis,
                           logger=logging.getLogger())

        try:
            # NOTE(review): test_text is built but never passed anywhere;
            # presumably it is the long payload this test was meant to
            # analyse — confirm and wire it in.
            test_text = ['0'] * (10**6)

            # NOTE(review): only the index name is passed here; verify this
            # matches the signature of analyze().
            analyze(test_idx_name, )
        finally:
            # Always remove the test index, including on failure.
            es_util.delete_idx(test_idx_name)
예제 #4
0
    def save_denormalization_for_new_index(self):
        """Rebuild the generated index from the grouped drug indications.

        Calls ``es_util.delete_idx`` and then ``es_util.create_idx`` for
        ``self.generated_resource.idx_name``. One document is then built per
        group in ``self.drug_inds_by_grouping_id``: the group's EFO id/term
        pairs, indication references and highest ``max_phase_for_ind`` are
        merged onto the stored drug-indication document of the group's first
        record, alongside the parent molecule document. The resulting dict is
        passed to ``self.save_denormalization_dict`` with ``do_index=True``.
        """
        target_idx = self.generated_resource.idx_name
        es_util.delete_idx(target_idx)
        es_util.create_idx(
            target_idx,
            3,
            1,
            analysis=DefaultMappings.COMMON_ANALYSIS,
            mappings=DrugIndicationDenormalizationHandler.
            get_new_index_mappings(),
        )

        docs_by_id = {}
        group_total = len(self.drug_inds_by_grouping_id)

        print('{0} GROUPED RECORDS WERE FOUND'.format(group_total),
              file=sys.stderr)
        p_bar = progress_bar_handler.get_new_progressbar(
            'drug_inds_by_parent-dn-generation', group_total)

        grouped_values = self.drug_inds_by_grouping_id.values()
        for processed, grouped_inds in enumerate(grouped_values, start=1):
            # The first record of the group supplies the ids used for lookups.
            first_ind = grouped_inds[0]
            efo_terms_by_id = {}
            merged_refs = []
            top_phase = 0

            for ind in grouped_inds:
                top_phase = max(top_phase, ind.get('max_phase_for_ind', 0))

                efo_id = ind.get('efo_id')
                if efo_id is not None:
                    efo_terms_by_id[efo_id] = ind.get('efo_term')

                merged_refs.extend(ind.get('indication_refs', []))

            # Only the first grouping-id part is used here.
            parent_id, _second_part = self.get_drug_ind_grouping_id_parts(
                first_ind)

            stored_ind = DRUG_INDICATION.get_doc_by_id_from_es(
                first_ind['drugind_id'])
            indication_doc = SummableDict(**stored_ind)
            # The flat EFO fields are replaced by the 'efo' list built below.
            indication_doc -= ['efo_term', 'efo_id']
            indication_doc['efo'] = [
                {'id': efo_key, 'term': efo_term}
                for efo_key, efo_term in efo_terms_by_id.items()
            ]
            indication_doc['max_phase_for_ind'] = top_phase
            indication_doc['indication_refs'] = merged_refs

            parent_doc = MOLECULE.get_doc_by_id_from_es(parent_id)
            denormalized_doc = {
                'parent_molecule': parent_doc,
                'drug_indication': indication_doc,
            }
            doc_id = self.generated_resource.get_doc_id(denormalized_doc)
            docs_by_id[doc_id] = denormalized_doc

            p_bar.update(processed)
        p_bar.finish()

        self.save_denormalization_dict(
            self.generated_resource,
            docs_by_id,
            DenormalizationHandler.default_update_script_and_size,
            do_index=True)
예제 #5
0
    def save_denormalization(self):
        """Rebuild the generated index from the grouped mechanisms.

        Does nothing when ``self.compound_families_dir`` is falsy. Otherwise
        calls ``es_util.delete_idx`` and ``es_util.create_idx`` for
        ``self.generated_resource.idx_name`` and builds one document per group
        in ``self.mechanisms_by_grouping_id``. Each document holds the parent
        molecule, target and binding site documents plus the group's first
        mechanism, which is updated with the merged comments and the highest
        ``max_phase``. Inconsistencies inside a group (differing action type or
        binding site, more than one distinct comment) are reported on stderr.
        The documents are handed to ``self.save_denormalization_dict`` with
        ``do_index=True``.
        """
        if not self.compound_families_dir:
            return

        def warn(message):
            # All diagnostics of this method go to stderr.
            print(message, file=sys.stderr)

        target_idx = self.generated_resource.idx_name
        es_util.delete_idx(target_idx)
        es_util.create_idx(
            target_idx,
            3,
            1,
            analysis=DefaultMappings.COMMON_ANALYSIS,
            mappings=MechanismDenormalizationHandler.get_new_index_mappings(),
        )

        # Comment fields merged per group, each paired with the warning that
        # is printed when a group has more than one distinct value for it.
        comment_warnings = (
            ('mechanism_comment',
             'MULTIPLE MECHANISM COMMENTS FOUND FOR {0}'),
            ('selectivity_comment',
             'MULTIPLE SELECTIVITY COMMENTS FOUND FOR {0}'),
            ('binding_site_comment',
             'MULTIPLE BINDING SITE COMMENTS FOUND FOR {0}'),
        )

        docs_by_id = {}
        group_total = len(self.mechanisms_by_grouping_id)

        warn('{0} GROUPED RECORDS WERE FOUND'.format(group_total))
        p_bar = progress_bar_handler.get_new_progressbar(
            'mechanism_by_parent_target-dn-generation', group_total)

        grouped_values = self.mechanisms_by_grouping_id.values()
        for processed, grouped in enumerate(grouped_values, start=1):
            # The first mechanism of the group is the reference record.
            first_mech = grouped[0]
            expected_action = first_mech.get('action_type')
            site_id = first_mech.get('site_id')
            merged_refs = []
            comments = {key: set() for key, _ in comment_warnings}
            top_phase = 0

            for mech in grouped:
                if expected_action != mech.get('action_type'):
                    warn('ACTION TYPE SHOULD BE {0} FOR MECHANISM {1}!'.format(
                        expected_action, mech['mec_id']))
                    warn(pprint.pformat(grouped))

                mech_site_id = mech.get('site_id')
                if site_id != mech_site_id:
                    warn('BINDING SITE SHOULD BE {0} FOR MECHANISM {1}!'.
                         format(site_id, mech['mec_id']))
                    warn(pprint.pformat(grouped))
                # Adopt this member's site id while none has been seen yet.
                if site_id is None:
                    site_id = mech_site_id

                for key, _ in comment_warnings:
                    comment = mech.get(key)
                    if comment is not None:
                        comments[key].add(comment)

                merged_refs.extend(mech.get('mechanism_refs', []))
                top_phase = max(top_phase, mech.get('max_phase', 0))

            # Only the first two grouping-id parts are used here.
            parent_id, target_id, _third_part = \
                self.get_mechanism_grouping_id_parts(first_mech)

            denormalized_doc = {
                'parent_molecule': MOLECULE.get_doc_by_id_from_es(parent_id),
                'target': TARGET.get_doc_by_id_from_es(target_id),
                'binding_site': BINDING_SITE.get_doc_by_id_from_es(site_id),
                'mechanism_of_action': first_mech,
            }
            # 'mechanism_of_action' is first_mech itself (not a copy), so
            # these writes also change the record held in the grouping dict.
            for key, _ in comment_warnings:
                first_mech[key] = list(comments[key])
            first_mech['max_phase'] = top_phase

            doc_id = self.generated_resource.get_doc_id(denormalized_doc)

            for key, template in comment_warnings:
                if len(comments[key]) > 1:
                    warn(template.format(doc_id))

            docs_by_id[doc_id] = denormalized_doc
            p_bar.update(processed)
        p_bar.finish()

        self.save_denormalization_dict(
            self.generated_resource,
            docs_by_id,
            DenormalizationHandler.default_update_script_and_size,
            do_index=True)