def index_cris_cohort():
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    f_yodie_anns = 'U:/kconnect/hepc_output/'
    print 'loading all docs at a time...'
    docs = load_all_docs()
    print 'docs read'
    doc_dict = {}
    for d in docs:
        doc_dict[d['CN_Doc_ID']] = d

    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    doc_to_patient = {}
    for l in lines:
        arr = l.split('\t')
        doc_to_patient[arr[1]] = arr[0]
    container = []
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]
    for ann in ann_files:
        utils.multi_thread_large_file_tasking(
            join(f_yodie_anns, ann),
            20,
            do_index_cris,
            args=[es, doc_to_patient, doc_dict, container],
            file_encoding='iso-8859-1')
        print 'file %s [%s] done' % (ann, len(container))
    print 'num done %s' % len(container)
    print 'done'
def do_semehr_index(settings, patients, doc_to_patient):
    """
    Run the SemEHR indexing pipeline: optionally index concepts from each
    YODIE annotation file, then optionally index annotations per patient.

    :param settings: configuration object exposing get_attr path lookups
    :param patients: patient ids to process at patient level
    :param doc_to_patient: mapping from document id to patient id
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])

    ann_folder = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [f for f in listdir(ann_folder)
                 if isfile(join(ann_folder, f))]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # concept/document level indexing: one threaded pass per file
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann_file in ann_files:
            utils.multi_thread_large_file_tasking(
                join(ann_folder, ann_file),
                10,
                do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')

    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # separate ES client serving the full-text documents
        es_full_text = Elasticsearch(
            [settings.get_attr(['semehr', 'es_doc_url'])],
            serializer=JSONSerializerPython2(),
            verify_certs=False)
        utils.multi_thread_tasking(
            patients,
            10,
            do_index_100k_patients,
            args=[es,
                  es_full_text,
                  settings.get_attr(['semehr', 'full_text_index']),
                  settings.get_attr(['semehr', 'full_text_doc_type']),
                  settings.get_attr(['semehr', 'full_text_patient_field']),
                  settings.get_attr(['semehr', 'full_text_text_field'])])
        logging.info('[SemEHR-step-end]patient level indexing done')
def index_mimic_notes():
    es = EntityCentricES.get_instance('./index_settings/es_mimic_setting.json')
    ann_files = [
        f for f in listdir(_f_yodie_anns) if isfile(join(_f_yodie_anns, f))
    ]
    patients = []
    for ann in ann_files:
        print 'indexing %s ...' % ann
        utils.multi_thread_large_file_tasking(join(_f_yodie_anns, ann),
                                              20,
                                              do_index_mimic,
                                              args=[es, patients])
    print 'full text and annotations done.'
    patients = list(set(patients))
    index_patients(patients, es)
# Example #4 (stray snippet-site artifact "示例#4" / "0"; commented out so the file stays valid Python)
def do_semehr_index(settings, patients, doc_to_patient):
    """
    do SemEHR index
    :param settings:
    :param patients:
    :param doc_to_patient:
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])

    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        print 'working on files : %s' % ann_files
        # index concepts
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann),
                                                  10,
                                                  do_index_100k_anns,
                                                  args=[es, doc_to_patient])
    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        # index patients
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2())
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=[
                                       es, es_full_text, ft_index_name,
                                       ft_doc_type, ft_entity_field,
                                       ft_fulltext_field
                                   ])
def index_100k(index_setting_file, patient_index_only=None):
    es = EntityCentricES.get_instance(index_setting_file)
    f_patient_doc = es.customise_settings['patient_doc_mapping_file']
    f_yodie_anns = es.customise_settings['yodie_output_folder']
    es_epr_full_text = es.customise_settings['es_ft']
    ft_index_name = es.customise_settings['ft_index_name']
    ft_doc_type = es.customise_settings['ft_doc_type']
    ft_entity_field = es.customise_settings['ft_entity_field']
    ft_fulltext_field = es.customise_settings['ft_fulltext_field']

    lines = utils.read_text_file(f_patient_doc)
    doc_to_patient = {}
    patients = set()
    for l in lines:
        arr = l.split('\t')
        doc_to_patient[arr[1]] = arr[0]
        patients.add(arr[0])
    patients = list(patients)
    # epr full text index api
    es_full_text = Elasticsearch([es_epr_full_text],
                                 serializer=JSONSerializerPython2())
    # es_full_text.get()

    if patient_index_only is None:
        ann_files = [
            f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
        ]
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann),
                                                  10,
                                                  do_index_100k_anns,
                                                  args=[es, doc_to_patient])
        print 'anns done, indexing patients...'
    else:
        print 'skipping concept indexing'
    utils.multi_thread_tasking(patients,
                               10,
                               do_index_100k_patients,
                               args=[
                                   es, es_full_text, ft_index_name,
                                   ft_doc_type, ft_entity_field,
                                   ft_fulltext_field
                               ])
    print 'all done'