def index_cris_cohort():
    """Index the CRIS HepC cohort.

    Loads every document up front, builds doc-id -> document and
    doc-id -> patient-id lookups, then streams each YODIE annotation
    file through ``do_index_cris`` with 20 worker threads.
    """
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    f_yodie_anns = 'U:/kconnect/hepc_output/'

    print('loading all docs at a time...')
    docs = load_all_docs()
    print('docs read')
    # key every loaded document by its CRIS document id
    doc_dict = {d['CN_Doc_ID']: d for d in docs}

    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')

    # tab-separated mapping file: <patient_id>\t<doc_id>
    mapping_lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    doc_to_patient = {cols[1]: cols[0]
                      for cols in (line.split('\t') for line in mapping_lines)}

    container = []  # shared accumulator the worker threads append to
    ann_files = [entry for entry in listdir(f_yodie_anns)
                 if isfile(join(f_yodie_anns, entry))]
    for ann_file in ann_files:
        utils.multi_thread_large_file_tasking(
            join(f_yodie_anns, ann_file), 20, do_index_cris,
            args=[es, doc_to_patient, doc_dict, container],
            file_encoding='iso-8859-1')
        print('file %s [%s] done' % (ann_file, len(container)))
    print('num done %s' % len(container))
    print('done')
def do_semehr_index(settings, patients, doc_to_patient):
    """Run the SemEHR indexing pipeline driven by *settings*.

    Depending on the ``job`` flags in *settings*, indexes YODIE
    annotation files at concept/document level and/or indexes
    annotations at patient level against a full-text ES instance.

    :param settings: configuration object exposing ``get_attr(path)``
    :param patients: patient ids to index at patient level
    :param doc_to_patient: mapping of document id -> patient id
    :return: None
    """
    # configure the entity-centric ES wrapper from settings
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])

    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [entry for entry in listdir(f_yodie_anns)
                 if isfile(join(f_yodie_anns, entry))]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # index concepts
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann_file in ann_files:
            utils.multi_thread_large_file_tasking(
                join(f_yodie_anns, ann_file), 10, do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')

    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # index patients: pull full text from a separate ES instance
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(
            patients, 10, do_index_100k_patients,
            args=[es, es_full_text, ft_index_name, ft_doc_type,
                  ft_entity_field, ft_fulltext_field])
        logging.info('[SemEHR-step-end]patient level indexing done')
def index_mimic_notes():
    """Index MIMIC notes and their YODIE annotations, then the patients.

    Streams every annotation file found in the module-level
    ``_f_yodie_anns`` folder through ``do_index_mimic`` (20 threads),
    collecting patient ids as a side effect, then indexes the
    de-duplicated patient list.
    """
    es = EntityCentricES.get_instance('./index_settings/es_mimic_setting.json')
    ann_files = [entry for entry in listdir(_f_yodie_anns)
                 if isfile(join(_f_yodie_anns, entry))]

    patients = []  # filled in by the do_index_mimic workers
    for ann_file in ann_files:
        print('indexing %s ...' % ann_file)
        utils.multi_thread_large_file_tasking(
            join(_f_yodie_anns, ann_file), 20, do_index_mimic,
            args=[es, patients])
    print('full text and annotations done.')

    # de-duplicate before patient-level indexing
    index_patients(list(set(patients)), es)
def do_semehr_index(settings, patients, doc_to_patient):
    """do SemEHR index

    NOTE(review): this definition duplicates the name of an earlier
    `do_semehr_index` in this module. Because the later definition wins
    at import time, the earlier version's improvements (doc_level_index
    flag, configurable concept index passed to do_index_100k_anns,
    `verify_certs=False` on the full-text ES client, and step logging)
    were silently shadowed into dead code. This body restores that
    maintained implementation; the stale duplicate definition should be
    deleted from the module — TODO confirm which version is canonical.

    :param settings: configuration object exposing ``get_attr(path)``
    :param patients: patient ids to index at patient level
    :param doc_to_patient: mapping of document id -> patient id
    :return: None
    """
    # configure the entity-centric ES wrapper from settings
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])

    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [f for f in listdir(f_yodie_anns)
                 if isfile(join(f_yodie_anns, f))]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # index concepts
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(
                join(f_yodie_anns, ann), 10, do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')

    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # index patients: full text comes from a separate ES instance
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(
            patients, 10, do_index_100k_patients,
            args=[es, es_full_text, ft_index_name, ft_doc_type,
                  ft_entity_field, ft_fulltext_field])
        logging.info('[SemEHR-step-end]patient level indexing done')
def index_100k(index_setting_file, patient_index_only=None):
    """Index the 100k cohort from a settings file.

    Reads the patient/document mapping, optionally streams each YODIE
    annotation file through ``do_index_100k_anns`` (skipped when
    *patient_index_only* is not None), then indexes every patient via
    ``do_index_100k_patients`` against the full-text ES instance.

    :param index_setting_file: path to the ES index settings JSON
    :param patient_index_only: when not None, skip concept indexing
    :return: None
    """
    es = EntityCentricES.get_instance(index_setting_file)
    cfg = es.customise_settings
    f_patient_doc = cfg['patient_doc_mapping_file']
    f_yodie_anns = cfg['yodie_output_folder']
    es_epr_full_text = cfg['es_ft']
    ft_index_name = cfg['ft_index_name']
    ft_doc_type = cfg['ft_doc_type']
    ft_entity_field = cfg['ft_entity_field']
    ft_fulltext_field = cfg['ft_fulltext_field']

    # tab-separated mapping file: <patient_id>\t<doc_id>
    rows = [line.split('\t') for line in utils.read_text_file(f_patient_doc)]
    doc_to_patient = {cols[1]: cols[0] for cols in rows}
    patients = list({cols[0] for cols in rows})

    # epr full text index api
    es_full_text = Elasticsearch([es_epr_full_text],
                                 serializer=JSONSerializerPython2())

    if patient_index_only is None:
        ann_files = [entry for entry in listdir(f_yodie_anns)
                     if isfile(join(f_yodie_anns, entry))]
        for ann_file in ann_files:
            utils.multi_thread_large_file_tasking(
                join(f_yodie_anns, ann_file), 10, do_index_100k_anns,
                args=[es, doc_to_patient])
        print('anns done, indexing patients...')
    else:
        print('skipping concept indexing')

    utils.multi_thread_tasking(
        patients, 10, do_index_100k_patients,
        args=[es, es_full_text, ft_index_name, ft_doc_type,
              ft_entity_field, ft_fulltext_field])
    print('all done')