import json
import re

from elasticsearch import Elasticsearch

import utils  # project-local helpers (multi-threaded tasking, JSON I/O)


def analyse_discharge_summaries(es, q, doc_type='eprdoc',
                                full_text_field='fulltext',
                                reg_exp=r'^([^\n\:]+)\:$',
                                output_file='../resources/wrappers/section_freqs.json'):
    """
    iterate all discharge summaries and create the section dictionary for
    the corpus (EHR system)
    :param es:
    :param q:
    :param doc_type:
    :param full_text_field
    :param reg_exp
    :param output_file
    :return:
    """
    scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500)
    container = []
    utils.multi_thread_tasking_it(scroll_obj, 10, do_query_analysis, args=[container, full_text_field, reg_exp])
    print('search finished. merging sections...')
    sec_freq = {}
    for ss in container:
        for s in ss:
            sec_freq[s] = sec_freq.get(s, 0) + 1
    utils.save_json_array(sec_freq, output_file)
    print(json.dumps(sec_freq))
    print('done')

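
# A minimal sketch of the `do_query_analysis` worker passed to
# utils.multi_thread_tasking_it above. Its real definition is not part of
# this snippet; this assumes the worker is invoked as worker(doc, *args),
# i.e. it receives each scrolled document plus [container, full_text_field,
# reg_exp].
def do_query_analysis(doc, container, full_text_field, reg_exp):
    # pull the full text and collect every line that looks like a section
    # heading, e.g. "Diagnosis:" (captured without the trailing colon)
    text = doc['_source'].get(full_text_field, '')
    container.append(re.findall(reg_exp, text, re.MULTILINE))
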
def parse_es_docs(
    es,
    q,
    writing_es_host,
    writing_index_name,
    writing_doc_type,
    doc_type='eprdoc',
    full_text_field='fulltext',
    output_file='../resources/wrappers/sen_data_extracted.json',
    failed_docs_file='../resources/wrappers/sen_failed_docs.json',
):
    writing_es = Elasticsearch([writing_es_host], verify_certs=False)
    # scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500)
    ret_count, docs = es.search(doc_type, q, offset=0, size=30)
    container = []
    failed_docs = []
    print('anonymising... %s, %s' % (len(docs), ','.join(
        [d['_id'] for d in docs])))
    utils.multi_thread_tasking_it(docs,
                                  1,
                                  do_doc_anonymisation,
                                  args=[
                                      writing_es, writing_index_name,
                                      writing_doc_type, full_text_field,
                                      container, failed_docs
                                  ])
    print('search finished. saving extracted data...')
    utils.save_json_array(container, output_file)
    utils.save_json_array(failed_docs, failed_docs_file)
    print('done')

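
# A hedged sketch of the `do_doc_anonymisation` worker referenced above; its
# real definition is not part of this snippet. It assumes the same
# worker(doc, *args) convention, and `anonymise_text` is a hypothetical
# placeholder for whatever de-identification routine the pipeline uses.
def do_doc_anonymisation(doc, writing_es, writing_index_name, writing_doc_type,
                         full_text_field, container, failed_docs):
    try:
        src = doc['_source']
        # de-identify the free text before re-indexing it
        src[full_text_field] = anonymise_text(src.get(full_text_field, ''))
        writing_es.index(index=writing_index_name, doc_type=writing_doc_type,
                         id=doc['_id'], body=src)
        container.append(doc['_id'])
    except Exception:
        # keep the id so failures can be written to failed_docs_file
        failed_docs.append(doc['_id'])
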
def query_patients(es, q_obj):
    # query by a structured query object (q_obj) rather than a query string,
    # and collect the matched patient ids via the do_collect_pids worker
    scroll_obj = es.scroll("", "medprofile", size=300,
                           q_obj=q_obj,
                           include_fields=[])
    container = []
    utils.multi_thread_tasking_it(scroll_obj, 20, do_collect_pids, args=[container])
    return container
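
# A minimal sketch of the `do_collect_pids` worker used above (not defined
# in this snippet), again assuming the worker(doc, container) convention:
# it collects the document ids, which for the medprofile index are taken
# to be patient ids.
def do_collect_pids(doc, container):
    container.append(doc['_id'])
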
Example #4
import logging

# Method excerpted from the Elasticsearch wrapper class (hence `self`).
def search_by_scroll(self, q, doc_type, field='_all', include_fields=None,
                     collection_func=lambda d, c: c.append(d['_id']),
                     index=None):
    logging.debug('scrolling [%s]' % q)
    scroll_obj = self.scroll(q, doc_type, field=field, size=300,
                             include_fields=include_fields, index=index)
    container = []
    utils.multi_thread_tasking_it(scroll_obj, 10, collection_func,
                                  args=[container])
    return container
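
# Example usage (a sketch): assuming `es` is an instance of the wrapper
# class this method belongs to. The default collection_func gathers ids;
# pass a different lambda to keep whole documents instead.
# doc_ids = es.search_by_scroll('diabetes', 'eprdoc')
# full_docs = es.search_by_scroll('diabetes', 'eprdoc',
#                                 collection_func=lambda d, c: c.append(d))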