Exemplo n.º 1
0
def update_mimic_doc_dates(doc_dates):
    """Update document dates in the MIMIC entity-centric index.

    Loads the ES wrapper from the MIMIC settings file and fans the rows in
    *doc_dates* out to ``do_doc_update_date`` across 20 worker threads.

    :param doc_dates: iterable of document/date rows to process
    """
    mimic_es = EntityCentricES.get_instance(
        './index_settings/es_mimic_setting.json')
    results = []  # shared accumulator passed through to the worker
    utils.multi_thread_tasking(
        doc_dates, 20, do_doc_update_date, args=[mimic_es, results])
Exemplo n.º 2
0
def do_semehr_index(settings, patients, doc_to_patient):
    """
    Run the entity-centric SemEHR Elasticsearch indexing steps.

    Two optional jobs are driven by the ``job`` section of the settings:
    - 'semehr-concept': index every bio-yodie annotation file found in the
      yodie output folder;
    - 'semehr-patients': index annotations aggregated at patient level.

    :param settings: configuration object exposing get_attr(key_path_list)
    :param patients: iterable of patient ids to index at patient level
    :param doc_to_patient: dict mapping document id -> patient id
    :return: None
    """
    # target ES host and index/doc-type names come from the 'semehr'
    # section of the configuration
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])

    # every regular file in the bio-yodie output folder is an annotation batch
    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        print 'working on files : %s' % ann_files
        # index concepts: 10 worker threads per annotation file
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann),
                                                  10,
                                                  do_index_100k_anns,
                                                  args=[es, doc_to_patient])
    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        # index patients: a second ES client points at the full-text
        # document store — presumably used by do_index_100k_patients to read
        # each patient's documents (TODO confirm against that worker)
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2())
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=[
                                       es, es_full_text, ft_index_name,
                                       ft_doc_type, ft_entity_field,
                                       ft_fulltext_field
                                   ])
Exemplo n.º 3
0
def index_mimic_notes():
    es = EntityCentricES.get_instance('./index_settings/es_mimic_setting.json')
    ann_files = [
        f for f in listdir(_f_yodie_anns) if isfile(join(_f_yodie_anns, f))
    ]
    patients = []
    for ann in ann_files:
        print 'indexing %s ...' % ann
        utils.multi_thread_large_file_tasking(join(_f_yodie_anns, ann),
                                              20,
                                              do_index_mimic,
                                              args=[es, patients])
    print 'full text and annotations done.'
    patients = list(set(patients))
    index_patients(patients, es)
def do_semehr_index(settings, patients, doc_to_patient):
    """
    Run the entity-centric SemEHR Elasticsearch indexing steps.

    Two optional jobs are driven by the ``job`` section of the settings:
    - 'semehr-concept': index every bio-yodie annotation file found in the
      yodie output folder (concept/document level);
    - 'semehr-patients': index annotations aggregated at patient level.

    :param settings: configuration object exposing get_attr(key_path_list)
    :param patients: iterable of patient ids to index at patient level
    :param doc_to_patient: dict mapping document id -> patient id
    :return: None
    """
    # target ES host, index/doc-type names and the doc-level-index flag all
    # come from the 'semehr' section of the configuration
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])

    # every regular file in the bio-yodie output folder is an annotation batch
    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # index concepts: 10 worker threads per annotation file
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(
                join(f_yodie_anns, ann),
                10,
                do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')

    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # index patients: a second ES client points at the full-text
        # document store (TLS verification disabled) — presumably used by
        # do_index_100k_patients to read each patient's documents
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=[
                                       es, es_full_text, ft_index_name,
                                       ft_doc_type, ft_entity_field,
                                       ft_fulltext_field
                                   ])
        logging.info('[SemEHR-step-end]patient level indexing done')
def process_semehr(config_file):
    """
    A pipeline driving all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing;
    3+. transparency, doc analysis, patient-level indexing and cohort
        result collection steps, each gated by a 'yes' flag in the config.

    :param config_file: path of the configuration file read by ProcessSetting
    :return: None; success/failure is persisted through the JobStatus file
    """
    # read the configuration
    ps = ProcessSetting(config_file)

    # setting log configuration — fall back to defaults when the config
    # omits a value
    log_level = 'INFO' if ps.get_attr(
        ['logging', 'level']) is None else ps.get_attr(['logging', 'level'])
    log_format = '%(name)s %(asctime)s %(levelname)s %(message)s' if ps.get_attr(['logging', 'format']) is None \
        else ps.get_attr(['logging', 'format'])
    log_file = None if ps.get_attr(
        ['logging', 'file']) is None else ps.get_attr(['logging', 'file'])
    logging.basicConfig(level=log_level, format=log_format)
    if log_file is not None:
        # mirror log records to a file in addition to the basicConfig stream
        formatter = logging.Formatter(log_format)
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_level)
        file_handler.setFormatter(formatter)
        logging.getLogger().addHandler(file_handler)
        logging.info('logging to %s' % log_file)

    # initialise the jobstatus class instance (tracks per-job progress on disk)
    job_file = join(
        ps.get_attr(['job', 'job_status_file_path']),
        'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    logging.info('[SemEHR-step] using job status file %s' % job_file)
    job_status = JobStatus(job_file)
    job_status.job_start()

    # preload: load documents to es
    if ps.get_attr(['job', 'epr_index']) == 'yes':
        logging.info('[SemEHR-step]load documents to elasticsearch...')
        load_document_to_es(settings=ps)
        logging.info('[SemEHR-step-end] epr_index step done')

    # documents can come either from a SQL query ('load_docs') or from an
    # ES cohort file ('cohort_docs'); the two sources are mutually exclusive
    data_rows = []
    doc2pid = {}
    pids = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        logging.info(
            '[SemEHR-step] retrieving docs by using the template [%s]' %
            sql_template)
        data_rows = get_docs_for_processing(
            job_status, sql_template,
            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        logging.info('total docs num is %s' % len(data_rows))
    elif ps.get_attr(['job', 'cohort_docs']) == 'yes':
        logging.info('[SemEHR-step] retrieving docs by cohort [%s]' %
                     ps.get_attr(['cohort_docs', 'es_cohort_file']))
        data_rows, doc2pid, pids = es_get_cohort_docs(ps)
        logging.info('total docs num is %s' % len(data_rows))

    try:
        # 0. copy docs between ES indices
        if ps.get_attr(['job', 'copy_docs']) == 'yes':
            logging.info('[SemEHR-step] copy docs')
            docs = [str(r['docid']) for r in data_rows]
            utils.multi_thread_tasking(
                docs,
                ps.get_attr(['doc_copy', 'thread_num']),
                do_copy_doc,
                args=[
                    EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                    ps.get_attr(['doc_copy', 'src_index']),
                    ps.get_attr(['doc_copy', 'src_doc_type']),
                    ps.get_attr(['doc_copy', 'dest_index']),
                    ps.get_attr(['doc_copy', 'dest_doc_type'])
                ])
            logging.info('[SemEHR-step-end]copying docs done')

        if ps.get_attr(['job', 'yodie']) == 'yes':
            docid_path = '%s/%s_docids.txt' % (ps.get_attr([
                'yodie', 'input_doc_file_path'
            ]), ps.get_attr(['job', 'job_id']))
            logging.info('[SemEHR-step] doing yodie')
            # 1. do bio-yodie pipeline
            # 1.1 prepare the configuration file; returns how many docs
            #     there are to annotate
            num_docs = produce_yodie_config(ps, data_rows, docid_path)
            if num_docs == 0:
                logging.info(
                    '[SemEHR-step-end] nothing to process, NLP step done')
            else:
                logging.info('total number of docs %s' % num_docs)
                # 1.2 set the env variables needed by GATE/GCP
                set_sys_env(ps)
                # 1.3 clear ann output folder so old annotations don't mix in
                logging.info('clearing %s ...' %
                             ps.get_attr(['yodie', 'output_file_path']))
                clear_folder(ps.get_attr(['yodie', 'output_file_path']))
                # 1.4 run bio-yodie via the GATE Cloud Paralleliser;
                #     the command line differs between Windows and *nix
                os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
                if ps.get_attr(['yodie', 'os']) == 'win':
                    cmd = ' '.join([
                        'java',
                        "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
                        "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
                        "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
                        "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
                        "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*".format(
                            **{
                                "SCRIPTDIR": ps.get_attr(['env', 'gcp_home']),
                                "GATE_HOME": ps.get_attr(['env', 'gate_home'])
                            }),
                        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                        % ps.get_attr(['env', 'yodie_path']),
                        "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                        "gate.cloud.batch.BatchRunner",
                        "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                        "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])
                    ])
                else:
                    cmd = ' '.join([
                        'gcp-direct.sh',
                        "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                        "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                        "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
                        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                        % ps.get_attr(['env', 'yodie_path']),
                    ])
                logging.debug(
                    'executing the following command to start NLP...')
                logging.info(cmd)
                p = Popen(cmd, shell=True, stderr=STDOUT)
                p.wait()

                if 0 != p.returncode:
                    # NLP failure is fatal: record it and stop the process
                    job_status.set_status(False)
                    job_status.save()
                    logging.error(
                        'ERROR doing the NLP, stopped with a code [%s]' %
                        p.returncode)
                    exit(p.returncode)
                else:
                    logging.info('[SemEHR-step-end] NLP step done')
                if 'semehr_path' in os.environ:
                    # undo the chdir above so later steps resolve
                    # relative paths correctly
                    logging.info('changing back to semehr_path: %s' %
                                 os.environ['semehr_path'])
                    os.chdir(os.environ['semehr_path'])

        # 2. do SemEHR concept/entity indexing
        if ps.get_attr(['job', 'semehr-concept']) == 'yes' or ps.get_attr(
            ['job', 'semehr-patients']) == 'yes':
            # build the patient list and the doc->patient map from the rows
            patients = []
            doc_to_patient = {}
            for r in data_rows:
                patients.append(str(r['patientid']))
                doc_to_patient[str(r['docid'])] = str(r['patientid'])
            patients = list(set(patients))
            do_semehr_index(ps, patients, doc_to_patient)

        # 3. do SemEHR actionable transparency
        if ps.get_attr(['job', 'action_trans']) == 'yes':
            logging.info('[SemEHR-step]doing transparency...')
            actionable_transparise(settings=ps)

        # 4. do SemEHR document annotation analysis (post processing)
        if ps.get_attr(['job', 'doc_analysis']) == 'yes':
            logging.info('[SemEHR-step]doing SemEHR annotation analysis...')
            do_semehr_doc_anns_analysis(settings=ps)
            logging.info('[SemEHR-step-end] doc_analysis step done')

        # 4.5 do SemEHR patient level index
        if ps.get_attr(['job', 'patient_index']) == 'yes':
            logging.info('[SemEHR-step]doing patient level indexing...')
            patient_level_indexing(settings=ps, pids=pids)
            logging.info('[SemEHR-step-end] patient level indexing done')

        # 5. do populate results for a research study
        if ps.get_attr(['job', 'populate_cohort_result']) == 'yes':
            logging.info(
                '[SemEHR-step]doing SemEHR cohort result extraction...')
            populate_cohort_results(settings=ps)
            logging.info('[SemEHR-step-end] populate_cohort_result step done')

        # 6. do collect cohort doc based results for a research study
        if ps.get_attr(['job', 'cohort_doc_collection']) == 'yes':
            logging.info(
                '[SemEHR-step]doing SemEHR cohort doc based collection...')
            collect_cohort_doc_results(settings=ps, doc2pid=doc2pid)
            logging.info(
                '[SemEHR-step-end] collect_cohort_doc_results step done')

        job_status.set_status(True)
        job_status.save()
        logging.info('[SemEHR-process-end] all done')
    except Exception as e:
        # any failure in the pipeline marks the job as failed (best-effort:
        # the error is logged, not re-raised, so the process exits cleanly)
        logging.error('[SemEHR-process-ERROR] Failed to do SemEHR process %s' %
                      str(e))
        job_status.set_status(False)
        job_status.save()
Exemplo n.º 6
0
def process_semehr(config_file):
    """
    A pipeline driving all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing;
    3. actionable transparency.

    :param config_file: path of the configuration file read by ProcessSetting
    :return: None; success/failure is persisted through the JobStatus file
    """
    # read the configuration
    ps = ProcessSetting(config_file)

    # initialise the jobstatus class instance (tracks per-job progress on disk)
    job_file = join(
        ps.get_attr(['job', 'job_status_file_path']),
        'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    print 'using job status file %s' % job_file
    job_status = JobStatus(job_file)
    job_status.job_start()

    # optionally pull the documents to process via a SQL query template
    data_rows = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        print 'retrieving docs by using the template [%s]' % sql_template
        data_rows = get_docs_for_processing(
            job_status, sql_template,
            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        print 'total docs num is %s' % len(data_rows)

    # NOTE(review): 'if True:' stands in for the commented-out 'try:' —
    # failures below are NOT caught, so the job status may never be saved
    # as failed except on a non-zero NLP return code
    # try:
    if True:
        # 0. copy docs between ES indices
        if ps.get_attr(['job', 'copy_docs']) == 'yes':
            docs = [str(r['docid']) for r in data_rows]
            utils.multi_thread_tasking(
                docs,
                ps.get_attr(['doc_copy', 'thread_num']),
                do_copy_doc,
                args=[
                    EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                    ps.get_attr(['doc_copy', 'src_index']),
                    ps.get_attr(['doc_copy', 'src_doc_type']),
                    ps.get_attr(['doc_copy', 'dest_index']),
                    ps.get_attr(['doc_copy', 'dest_doc_type'])
                ])

        if ps.get_attr(['job', 'yodie']) == 'yes':
            docid_path = '%s/%s_docids.txt' % (ps.get_attr([
                'yodie', 'input_doc_file_path'
            ]), ps.get_attr(['job', 'job_id']))
            print 'working on yodie with %s documents, saved to %s...' % (str(
                len(data_rows)), docid_path)
            # save doc ids to text file for input to bioyodie
            print 'saving doc ids to [%s]' % docid_path
            utils.save_string('\n'.join([str(r['docid']) for r in data_rows]),
                              docid_path)
            # 1. do bio-yodie pipeline
            # 1.1 prepare the configuration file
            produce_yodie_config(ps)
            # 1.2 set the env variables needed by GATE/GCP
            set_sys_env(ps)
            # 1.3 clear ann output folder so old annotations don't mix in
            print 'clearing %s ...' % ps.get_attr(
                ['yodie', 'output_file_path'])
            clear_folder(ps.get_attr(['yodie', 'output_file_path']))
            # 1.4 run bio-yodie via the GATE Cloud Paralleliser; the
            #     command line differs between Windows and *nix
            os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
            if ps.get_attr(['yodie', 'os']) == 'win':
                cmd = ' '.join([
                    'java',
                    "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
                    "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
                    "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
                    "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
                    "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*".format(
                        **{
                            "SCRIPTDIR": ps.get_attr(['env', 'gcp_home']),
                            "GATE_HOME": ps.get_attr(['env', 'gate_home'])
                        }),
                    '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                    % ps.get_attr(['env', 'yodie_path']),
                    "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                    "gate.cloud.batch.BatchRunner",
                    "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                    "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])
                ])
            else:
                cmd = ' '.join([
                    'gcp-direct.sh',
                    "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                    "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                    "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
                    '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                    % ps.get_attr(['env', 'yodie_path']),
                ])
            print cmd
            p = Popen(cmd, shell=True, stderr=STDOUT)
            p.wait()

            # NLP failure is fatal: record it and stop the process
            if 0 != p.returncode:
                job_status.set_status(False)
                job_status.save()
                exit(p.returncode)

        # 2. do SemEHR concept/entity indexing
        if ps.get_attr(['job', 'semehr-concept']) == 'yes' or ps.get_attr(
            ['job', 'semehr-patients']) == 'yes':
            # build the patient list and the doc->patient map from the rows
            patients = []
            doc_to_patient = {}
            for r in data_rows:
                patients.append(str(r['patientid']))
                doc_to_patient[str(r['docid'])] = str(r['patientid'])
            patients = list(set(patients))
            do_semehr_index(ps, patients, doc_to_patient)

        # 3. do SemEHR actionable transparency
        if ps.get_attr(['job', 'action_trans']) == 'yes':
            print 'doing transparency...'
            actionable_transparise(settings=ps)

        job_status.set_status(True)
        job_status.save()