def test_serialisation(nlp, docs_path, models_path):
    """
    Run ``nlp_process_doc`` over a folder via the multi-thread file helper
    and dump whatever was collected.

    :param nlp: not referenced inside this function body
    :param docs_path: folder handed to ``utils.multi_thread_process_files``
    :param models_path: destination passed to ``jl.dump``
    :return: None
    """
    # One list shared with every worker call through ``args``.
    # NOTE(review): presumably nlp_process_doc appends its result to this
    # list - confirm against its definition.
    collected = []
    utils.multi_thread_process_files(
        docs_path, 'txt', 10, nlp_process_doc, args=[collected])
    jl.dump(collected, models_path)
Exemplo n.º 2
0
def process_doc_anns(anns_folder, full_text_folder, rule_config_file, output_folder,
                     study_folder=None,
                     study_config='study.json', full_text_fn_ptn='%s.txt', fn_pattern='se_ann_%s.json',
                     thread_num=10, es_inst=None, es_text_field='', patient_id_field='', combined_anns=None,
                     es_output_index=None, es_output_doc='doc'):
    """
    Post-process document annotations found in a folder.

    Two dispatch modes exist, selected by ``combined_anns``:

    * ``combined_anns is None`` - the files in ``anns_folder`` are handed to
      ``utils.multi_thread_process_files`` (extension ``'json'``) with
      ``analyse_doc_anns_file`` as the worker.
    * otherwise - every regular file in ``anns_folder`` is passed, one after
      another, to ``utils.multi_process_large_file_tasking`` with
      ``analyse_doc_anns_line`` as the worker.

    :param anns_folder: folder containing the annotation files
    :param full_text_folder: folder given to FileTextReader (used only when
        ``es_inst`` is None)
    :param rule_config_file: forwarded to ``load_study_ruler``
    :param output_folder: forwarded to the worker function
    :param study_folder: forwarded to ``load_study_ruler``
    :param study_config: forwarded to ``load_study_ruler``
    :param full_text_fn_ptn: file name pattern given to FileTextReader (used
        only when ``es_inst`` is None)
    :param fn_pattern: forwarded to the worker function
    :param thread_num: number of threads; only used in the per-file mode
    :param es_inst: semquery.SemEHRES instance; when given, full text is read
        through ESTextReader instead of FileTextReader
    :param es_text_field: the full text field name in the es index
    :param patient_id_field: forwarded to ESTextReader
    :param combined_anns: only compared against None here to pick the
        dispatch mode; its value is not otherwise read
    :param es_output_index: forwarded to the worker function
    :param es_output_doc: forwarded to the worker function
    :return: None
    """
    # pick the full-text source: elasticsearch when an instance is supplied,
    # plain files otherwise
    if es_inst is not None:
        text_reader = ESTextReader(es_inst, es_text_field, patient_id_field=patient_id_field)
    else:
        text_reader = FileTextReader(full_text_folder, full_text_fn_ptn)

    study = load_study_ruler(study_folder, rule_config_file, study_config)
    sa, ruler = study['sa'], study['ruler']

    # positional payload every worker receives; a fresh list is built from it
    # for each dispatch call so no list object is shared between calls
    worker_payload = (ruler, text_reader, output_folder, fn_pattern,
                      es_inst, es_output_index, es_output_doc,
                      sa)

    if combined_anns is not None:
        # snapshot the folder listing before any processing starts
        large_files = [name for name in listdir(anns_folder)
                       if isfile(join(anns_folder, name))]
        for name in large_files:
            utils.multi_process_large_file_tasking(
                large_file=join(anns_folder, name),
                process_func=analyse_doc_anns_line,
                args=list(worker_payload))
    else:
        utils.multi_thread_process_files(
            dir_path=anns_folder,
            file_extension='json',
            num_threads=thread_num,
            process_func=analyse_doc_anns_file,
            args=list(worker_payload))

    logging.info('post processing of ann docs done')
Exemplo n.º 3
0
def process_files(read_dir, write_dir):
    """
    Hand the files in ``read_dir`` to ``save_full_text`` through the
    multi-thread file helper (extension ``'xml'``, 10 threads).

    :param read_dir: folder passed as the first argument of
        ``utils.multi_thread_process_files``
    :param write_dir: forwarded to ``save_full_text`` as its extra argument
    :return: None
    """
    worker_args = [write_dir]
    utils.multi_thread_process_files(
        read_dir,
        file_extension='xml',
        num_threads=10,
        process_func=save_full_text,
        args=worker_args)