def analyse_discharge_summaries(es, q, doc_type='eprdoc', full_text_field='fulltext', reg_exp=r'^([^\n\:]+)\:$', output_file='../resources/wrappers/section_freqs.json'): """ iterate all discharge summaries and create the section dictionary for the corpus (EHR system) :param es: :param q: :param doc_type: :param full_text_field :param reg_exp :param output_file :return: """ scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500) container = [] utils.multi_thread_tasking_it(scroll_obj, 10, do_query_analysis, args=[container, full_text_field, reg_exp]) print 'search finished. merging sections...' sec_freq = {} for ss in container: for s in ss: sec_freq[s] = 1 if s not in sec_freq else 1 + sec_freq[s] utils.save_json_array(sec_freq, output_file) print json.dumps(sec_freq) print 'done'
def parse_es_docs( es, q, writing_es_host, writing_index_name, writing_doc_type, doc_type='eprdoc', full_text_field='fulltext', output_file='../resources/wrappers/sen_data_extracted.json', failed_docs_file='../resources/wrappers/sen_failed_docs.json', ): writing_es = Elasticsearch([writing_es_host], verify_certs=False) # scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500) ret_count, docs = es.search(doc_type, q, offset=0, size=30) container = [] failed_docs = [] print 'anonymising... %s, %s' % (len(docs), ','.join( [d['_id'] for d in docs])) utils.multi_thread_tasking_it(docs, 1, do_doc_anonymisation, args=[ writing_es, writing_index_name, writing_doc_type, full_text_field, container, failed_docs ]) print 'search finished. merging sections...' utils.save_json_array(container, output_file) utils.save_json_array(failed_docs_file, failed_docs_file) print 'done'
def query_patients(es, q_obj):
    """Scroll the ``medprofile`` index with ``q_obj`` and collect the matching
    patient ids via a pool of worker threads.

    :param es: search client exposing a ``scroll`` method
    :param q_obj: structured query object passed through to the scroll
    :return: list of collected patient ids
    """
    pids = []
    result_it = es.scroll("", "medprofile", size=300, q_obj=q_obj,
                          include_fields=[])
    # 20 workers drain the scroll iterator; do_collect_pids appends into pids
    utils.multi_thread_tasking_it(result_it, 20, do_collect_pids, args=[pids])
    return pids
def search_by_scroll(self, q, doc_type, field='_all', include_fields=None, collection_func=lambda d, c: c.append(d['_id']), index=None):
    """Run ``q`` as a scrolled search and gather results with worker threads.

    :param q: query string
    :param doc_type: document type to search
    :param field: field the query is matched against (default ``'_all'``)
    :param include_fields: optional list of source fields to return
    :param collection_func: callable ``(doc, container)`` applied to each hit;
        by default it appends the hit's ``_id``
    :param index: optional index name override
    :return: list populated by ``collection_func``
    """
    logging.debug('scrolling [%s]' % q)
    collected = []
    hits_it = self.scroll(q, doc_type, field=field, size=300,
                          include_fields=include_fields, index=index)
    # 10 workers consume the scroll iterator and feed collection_func
    utils.multi_thread_tasking_it(hits_it, 10, collection_func,
                                  args=[collected])
    return collected