def concept_analyse(concept_id, condition_label_sql, wrong_label_sql, db_cnf):
    """Collect condition/wrong mention label counts for one concept.

    Runs the two SQL templates (each formatted with the concept id) and
    aggregates per-label counts onto an MConcept instance.

    :param concept_id: concept to analyse
    :param condition_label_sql: SQL template with a {concept} placeholder
    :param wrong_label_sql: SQL template with a {concept} placeholder
    :param db_cnf: database connection setting file
    :return: (result dict, textual output of the concept analysis)
    """
    concept_result = {'concept': concept_id, 'labels': {}}
    mc = MConcept(concept_id)
    # condition mention labels
    results_condition_labels = []
    dutil.query_data(condition_label_sql.format(**{'concept': concept_id}),
                     results_condition_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_condition_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].condition_mention = r['num']
    # wrongly-identified mention labels
    results_wrong_labels = []
    dutil.query_data(wrong_label_sql.format(**{'concept': concept_id}),
                     results_wrong_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_wrong_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].wrong_mention = r['num']
    output = mc.output()
    # parenthesised print works on both Python 2 and Python 3
    print(output)
    return concept_result, output
def run_parallel_prediction(settings):
    """Predict phenotypes for all pending docs using a thread pool.

    :param settings: dict providing concept mapping file, model dir,
        phenotype list, db config, SQL templates and num_threads
    """
    cm_obj = Concept2Mapping(settings['concept_mapping_file'])
    # NOTE: mention-pattern matching is currently disabled. Previously a
    # MentionPattern instance was constructed here and then immediately
    # overwritten with None, wasting the pattern-folder load time.
    # Re-enable by constructing it instead of assigning None:
    # mp_inst = mp.MentionPattern(settings['pattern_folder'],
    #                             cm_obj.cui2label, in_action=True)
    mp_inst = None
    db_pool = du.get_mysql_pooling(settings['dbconf'], num=30)
    doc_ids = []
    model_factory = ModelFactory(settings['phenotypes'], settings['model_dir'])
    du.query_data(settings['sql_docs4process'], pool=db_pool, container=doc_ids)
    utils.multi_thread_tasking(doc_ids, num_threads=settings['num_threads'],
                               process_func=do_one_doc,
                               args=[
                                   model_factory, cm_obj, mp_inst, db_pool,
                                   settings['sql_text_ptn'],
                                   settings['sql_ann_ptn'],
                                   settings['save_result_sql_ptn'],
                                   settings['update_doc_sql_ptn']
                               ])
    logging.info('#docs: %s all done' % len(doc_ids))
def query_data(self, query_template, q_obj):
    """Run a formatted query and return the fetched rows.

    :param query_template: SQL template with named placeholders
    :param q_obj: dict of values used to fill the template
    :return: list of result rows
    """
    sql = query_template.format(**q_obj)
    fetched = []
    dutil.query_data(sql, fetched,
                     dbconn=dutil.get_db_connection_by_setting(self.db_conn_file))
    return fetched
def db_populate_patient_result(container, pid, doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                               concept_list, cui2concept, ontext_filter_fun=None):
    """
    populate a row (per patient) in the result table

    Queries all annotated docs of the patient, maps each annotation's CUI to a
    study concept, and counts correct ('f') vs ruled-out ('rf') mentions.
    When rows carry a 'grp' column, counts are kept per group in g2_c2f;
    otherwise a single flat concept->freq map is used.

    :param container: shared output list; one dict per patient is appended
    :param pid: patient id
    :param doc_ann_sql_temp: SQL template (formatted with pid) returning anns
    :param doc_ann_pks: primary key column names identifying each doc row
    :param dbcnn_file: database connection setting file
    :param concept_list: study concept names to count
    :param cui2concept: CUI -> study concept name map
    :param ontext_filter_fun: optional context filter; an annotation only
        counts as correct if this returns True for it
    :return: None (results appended to container)
    """
    rows = []
    db.query_data(doc_ann_sql_temp.format(pid), rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    c2f = {}
    for c in concept_list:
        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
    logging.debug('pid: %s has %s docs' % (pid, len(rows)))
    i = 0
    g2_c2f = {}
    grp = False
    for r in rows:
        try:
            i += 1
            if 'grp' in r:
                # grouped mode: switch c2f to this row's group bucket,
                # creating a fresh zeroed map for unseen groups
                grp = True
                if r['grp'] in g2_c2f:
                    c2f = g2_c2f[r['grp']]
                else:
                    c2f = {}
                    for c in concept_list:
                        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
                    g2_c2f[r['grp']] = c2f
            anns = json.loads(fix_escaped_issue(r['anns']))
            ann_doc = SemEHRAnnDoc()
            ann_doc.load(anns)
            for a in ann_doc.annotations:
                # for c in a.study_concepts:
                if a.cui in cui2concept:
                    c = cui2concept[a.cui]
                    logging.debug('%s found in %s, ruled_by=%s, concepts:%s' %
                                  (c, '-'.join([r[k] for k in doc_ann_pks]),
                                   a.ruled_by, a.study_concepts))
                    if c in c2f:
                        # an annotation is correct when no rule fired and the
                        # optional context filter (if any) accepts it
                        correct = len(a.ruled_by) == 0
                        if correct and ontext_filter_fun is not None:
                            correct = ontext_filter_fun(a)
                        if not correct:
                            c2f[c]['rf'] += 1
                        else:
                            c2f[c]['f'] += 1
                            # NOTE(review): doc keys recorded only for correct
                            # mentions - confirm this placement is intended
                            c2f[c]['docs'].append([r[k] for k in doc_ann_pks])
        except Exception as e:
            # best-effort: a malformed ann payload skips the row, not the patient
            logging.error('parsing anns %s because of %s' %
                          (fix_escaped_issue(r['anns']), str(e)))
    logging.info('pid %s done' % pid)
    if not grp:
        # ungrouped mode: publish the flat map in place of the group map
        g2_c2f = c2f
    container.append({'p': pid, 'c2f': g2_c2f, 'grp': grp})
    logging.debug('pid %s with %s, %s' % (pid, len(c2f), len(container)))
def extract_cohort_docs(self, use_combo_fn_name=True):
    """Extract cohort documents in batches and save them to the configured
    destination (database rows or one text file per doc).

    :param use_combo_fn_name: when saving to files, name them
        '<doc_id>_<patient_id>' instead of just '<doc_id>'
    """
    db_conf_file = self._cohort_conf
    db_conf = None
    if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
        # need dsn settings
        db_conf = self.populate_linux_odbc_setting()
        db_conf_file = None
        logging.info('using dsn %s' % db_conf['dsn'])
    query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
    file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
    out_put_folder = self._conf['out_put_folder']
    if len(self._patient_ids) == 0:
        logging.info('cohort is empty, has it been loaded?')
        return
    q_temp = self._conf['doc_query_temp']
    logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
    for idx in range(0, len(self._patient_ids), query_size):
        q = q_temp.format(**{'patient_ids': ",".join(
            ["'%s'" % p for p in self._patient_ids[idx:idx + query_size]])})
        # FIX: idx is a row offset (0, 50, 100, ...); divide by the batch
        # size so the log shows the actual batch number
        logging.info('querying batch %s' % (idx // query_size + 1))
        logging.debug(q)
        docs = []
        db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
        if self._dest == 'sql':
            # save docs to database
            self.save_docs_to_db(docs)
        else:
            # save docs to files, skipping empty contents
            for d in docs:
                if d['doc_content'] is None:
                    continue
                fn = ('%s_%s' % (d['doc_id'], d['patient_id'])) if use_combo_fn_name \
                    else ('%s' % d['doc_id'])
                utils.save_string(d['doc_content'],
                                  join(out_put_folder, file_pattern % fn))
            logging.info('%s docs saved to destination [%s]' % (len(docs), self._dest))
    logging.info('query finished, docs saved to %s' % out_put_folder)
def analyse_db_doc_anns(sql, ann_sql, pks, update_template, full_text_sql, dbcnn_file,
                        rule_config_file, study_folder, thread_num=10,
                        study_config='study.json', update_status_template=None):
    """
    do database based annotation post processing
    :param sql: get a list of annotation primary keys
    :param ann_sql: a query template to query ann and its doc full text
    :param pks: an array of primary key columns
    :param update_template: an update query template to update post-processed ann
    :param full_text_sql: query template for reading a doc's full text
    :param dbcnn_file: database connection file
    :param rule_config_file: rule configuration for the study ruler
    :param study_folder: study folder containing the config
    :param thread_num: number of worker processes (one db connection each)
    :param study_config: study config file name
    :param update_status_template: optional template to mark docs with no concepts
    :return:
    """
    ret = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = ret['sa']
    ruler = ret['ruler']
    rows = []
    db.query_data(sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    reader = DBTextReader(full_text_sql, dbcnn_file)
    # one dedicated connection per worker (passed as thread_wise_objs)
    # FIX: range instead of Python-2-only xrange (same iteration behaviour)
    cnns = []
    for i in range(thread_num):
        cnns.append(db.get_db_connection_by_setting(dbcnn_file))
    utils.multi_process_tasking(rows, db_doc_process, num_procs=thread_num,
                                args=[ann_sql, pks, update_template, dbcnn_file,
                                      reader, sa, ruler, update_status_template],
                                thread_wise_objs=cnns)
    for i in range(thread_num):
        db.release_db_connection(cnns[i])
def load_phenotype_def_into_db():
    """Load phenotype definitions from JSON and insert phenotype-to-concept
    links into tp_phenotype_concepts.

    Phenotype ids are resolved by name against tp_phenotypes; phenotypes
    missing from that table are reported and skipped.
    """
    db_cnf = '../../studies/COMOB_SD/dbcnn_input.json'
    p_def_file = './data/phenotype_defs.json'

    pd = utils.load_json_data(p_def_file)
    # NOTE(review): values are interpolated straight into the SQL string -
    # acceptable only while ids/concepts come from trusted local files
    w_sql = """
    insert into tp_phenotype_concepts (phenotype_id, concept_id) values
    ('{pid}', '{cid}');
    """
    r_sql = """
    select * from tp_phenotypes
    """
    p_data = []
    dutil.query_data(r_sql, p_data, dutil.get_db_connection_by_setting(db_cnf))
    p2pid = {}
    for r in p_data:
        p2pid[r['phenotype_name']] = r['id']
    for p in pd:
        if p not in p2pid:
            # parenthesised prints work on both Python 2 and Python 3
            print('%s not found in definition table' % p)
            continue
        for c in pd[p]['concepts']:
            sql = w_sql.format(**{'pid': p2pid[p], 'cid': c})
            print('executing [%s]' % sql)
            dutil.query_data(sql, None,
                             dbconn=dutil.get_db_connection_by_setting(db_cnf))
    print('done')
def populate_patient_concept_table(cohort_name, concepts, out_file,
                                   patients_sql, concept_doc_freq_sql):
    """Build a brcid-by-concept document-frequency TSV for a cohort.

    Concepts with no hits are dropped; columns are labelled via
    oi.get_concept_label and sorted alphabetically.

    :param cohort_name: cohort id used in both SQL templates
    :param concepts: candidate concept ids
    :param out_file: output TSV path
    :param patients_sql: SQL template returning cohort patients (brcid)
    :param concept_doc_freq_sql: SQL template returning per-patient doc counts
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p
    non_empty_concepts = []
    for c in concepts:
        patient_concept_freq = []
        # parenthesised print works on both Python 2 and Python 3
        print('querying %s...' % c)
        dutil.query_data(concept_doc_freq_sql.format(c, cohort_name),
                         patient_concept_freq)
        if len(patient_concept_freq) > 0:
            non_empty_concepts.append(c)
            for pc in patient_concept_freq:
                id2p[pc['brcid']][c] = str(pc['num'])
    label2cid = {}
    concept_labels = []
    for c in non_empty_concepts:
        label = oi.get_concept_label(c)
        label2cid[label] = c
        concept_labels.append(label)
    concept_labels = sorted(concept_labels)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] +
                       [p[label2cid[k]] if label2cid[k] in p else '0'
                        for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print('done')
def convert_text_ann_from_db(sql_temp, pks, db_conn, full_text_folder, ann_folder,
                             full_text_file_pattern='%s.txt',
                             ann_file_pattern='%s.txt.knowtator.xml'):
    """Read one doc's text + annotations from the database and write an
    eHOST/knowtator XML file plus the full-text file.

    :param sql_temp: SQL template with named placeholders matching pks
    :param pks: dict of primary-key values identifying the doc
    :param db_conn: db connection setting file
    :param full_text_folder: destination for the full-text file
    :param ann_folder: destination for the annotation XML
    :param full_text_file_pattern: filename pattern for the text file
    :param ann_file_pattern: filename pattern for the XML file
    """
    sql = sql_temp.format(**pks)
    logging.info('doing [%s]...' % sql)
    file_key = '_'.join([pks[k] for k in pks])
    results = []
    dbutils.query_data(sql, results, dbutils.get_db_connection_by_setting(db_conn))
    if len(results) == 0:
        logging.info('doc/anns [%s] not found' % file_key)
        return
    row = results[0]
    text = row['text'].replace('\r', '\n')
    anns = json.loads(row['anns'])
    xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key), full_text=text)
    utils.save_string(xml, join(ann_folder, ann_file_pattern % file_key))
    utils.save_string(text, join(full_text_folder, full_text_file_pattern % file_key))
    logging.info('doc [%s] done' % file_key)
def label_analyse(sql_template_file, db_cnf, output_file=None):
    """Run concept_analyse over every validated concept and optionally save
    the concatenated textual output.

    :param sql_template_file: JSON file with the SQL templates used below
    :param db_cnf: database connection setting file
    :param output_file: when given, the combined output text is saved here
    """
    sql_temps = utils.load_json_data(sql_template_file)
    concepts = []
    dutil.query_data(sql_temps['get_validated_concepts'], concepts,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    s = ''
    for c in concepts:
        # the structured result (first element) is not used here
        data, output = concept_analyse(c['concept_id'],
                                       sql_temps['condition_label_sql'],
                                       sql_temps['wrong_label_sql'],
                                       db_cnf)
        s += output
    if output_file is not None:
        # parenthesised print works on both Python 2 and Python 3
        print('saving output to %s...' % output_file)
        utils.save_string(s, output_file)
def get_docs_for_processing(job_status, job_sql_template, cnn_conf_file):
    """
    retrieve docs to process from a database table/view
    :param job_status: job status object providing job_start()
    :param job_sql_template: SQL template formatted with the job data
    :param cnn_conf_file: database connection setting file
    :return: list of doc rows to process
    """
    job_data = job_status.job_start()
    # parenthesised print works on both Python 2 and Python 3
    print('working on %s' % job_data)
    container = []
    sqldbutils.query_data(job_sql_template.format(**job_data), container,
                          dbconn=sqldbutils.get_db_connection_by_setting(cnn_conf_file))
    return container
def populate_patient_study_table(cohort_name, study_analyzer, out_file,
                                 patients_sql, term_doc_freq_sql):
    """
    populate result table for a given study analyzer instance
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param patients_sql: SQL template returning cohort patients (brcid)
    :param term_doc_freq_sql: SQL template returning per-patient term frequencies
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p
    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        patient_term_freq = []
        if len(sc.concept_closure) > 0:
            # FIX: operator precedence - previously
            #   [skip] + [] if cond else extra
            # parsed as ([skip] + []) if cond else extra, which silently
            # DROPPED the skip-term constraint whenever extra constraints
            # were configured. The conditional must apply only to the
            # optional extra-constraints list.
            extra = [] if (study_analyzer.study_options is None or
                           study_analyzer.study_options['extra_constrains'] is None) \
                else study_analyzer.study_options['extra_constrains']
            data_sql = term_doc_freq_sql.format(
                **{'concepts': concept_list,
                   'cohort_id': cohort_name,
                   'extra_constrains':
                       ' \n '.join([generate_skip_term_constrain(study_analyzer)] + extra)})
            print(data_sql)
            dutil.query_data(data_sql, patient_term_freq)
        if len(patient_term_freq) > 0:
            non_empty_concepts.append(sc_key)
            for pc in patient_term_freq:
                id2p[pc['brcid']][sc_key] = str(pc['num'])
    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] +
                       [p[k] if k in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print('done')
def action_transparentise(cohort_name, db_conn_file, cohort_doc_sql_template,
                          doc_ann_sql_template, doc_content_sql_template,
                          action_trans_update_sql_template, corpus_trans_file):
    """
    use actionable transparency model to create confidence value for each
    annotations; this method split all cohort documents into batches that are
    to processed sequentially (one batch at a time)
    :param cohort_name:
    :param db_conn_file:
    :param cohort_doc_sql_template:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param corpus_trans_file:
    :return:
    """
    docs = []
    dutil.query_data(cohort_doc_sql_template.format(cohort_name), docs,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    batch_size = 500
    batches = [docs[i:i + batch_size] for i in range(0, len(docs), batch_size)]
    nlp = tstg.load_mode('en')
    corpus_predictor = tssp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    # FIX: the batch counter previously reused the leftover loop index from
    # building the batches and was never reset to zero, so progress messages
    # were wrong; enumerate gives the correct numbering
    for i, batch in enumerate(batches):
        print('working on %s/%s batch' % (i, len(batches)))
        try:
            do_action_trans_docs(batch, nlp, doc_ann_sql_template,
                                 doc_content_sql_template,
                                 action_trans_update_sql_template,
                                 db_conn_file, corpus_predictor)
        except Exception as e:
            # best-effort per batch: log and continue with the next one
            print('error processing [%s]' % e)
    print('all anns transparentised')
def complete_sample_ann_data(key_anns, complete_sql, db_conn_file, container):
    """Enrich sampled annotations with their original string (and, when the
    row carries one, the transparency confidence) looked up from the
    database, then append the (key, anns) pair to the shared container.

    :param key_anns: (key, list-of-ann-dicts) pair
    :param complete_sql: SQL template with doc_id/start/end/concept placeholders
    :param db_conn_file: db connection setting file
    :param container: shared output list
    """
    key, anns = key_anns
    for ann in anns:
        first = ann['annotations'][0]
        params = {'doc_id': ann['id'],
                  'start': first['start'],
                  'end': first['end'],
                  'concept': first['concept']}
        rows = []
        dutil.query_data(complete_sql.format(**params), rows,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(rows) > 0:
            first['string_orig'] = rows[0]['string_orig']
            if 'action_trans' in rows[0]:
                first['confidence'] = rows[0]['action_trans']
    container.append([key, anns])
def extract_sample(pk_vals, concept, cui2concept, sample_sql_temp, dbcnn_file,
                   container, ontext_filter_fun=positive_patient_filter):
    """
    extract an sample
    :param pk_vals: primary key values identifying the doc
    :param concept: study concept name to look for
    :param cui2concept: CUI -> study concept name map
    :param sample_sql_temp: SQL template formatted with pk_vals
    :param dbcnn_file: db connection setting file
    :param container: shared output list
    :param ontext_filter_fun: context filter applied to unruled annotations
    :return:
    """
    rows = []
    db.query_data(sample_sql_temp.format(*[v for v in pk_vals]), rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    if len(rows) == 0:
        return
    r = rows[0]
    ann_doc = SemEHRAnnDoc()
    ann_doc.load(json.loads(r['anns']))
    for a in ann_doc.annotations:
        # only annotations whose CUI maps to the requested concept qualify
        if a.cui not in cui2concept or concept != cui2concept[a.cui]:
            continue
        is_correct = len(a.ruled_by) == 0
        if is_correct and ontext_filter_fun is not None:
            is_correct = ontext_filter_fun(a)
        if is_correct:
            container.append({'content': r['text'],
                              'doc_table': r['src_table'],
                              'doc_col': r['src_col'],
                              'id': '_'.join(pk_vals),
                              'annotations': [{'start': a.start,
                                               'end': a.end,
                                               'concept': a.cui,
                                               'string_orig': a.str}]})
            # one sample per doc is enough
            break
def do_one_doc(doc_id, model_factory, concept_mapping, mention_pattern, db_pool,
               sql_text_ptn, sql_ann_ptn, save_result_sql_ptn, update_doc_sql_ptn):
    """Predict phenotypes for a single document and persist the outcome.

    Loads the doc's annotations and full text from the pool, runs
    predict_doc_phenotypes, then either saves the per-phenotype counts or
    just marks the doc as processed when nothing was predicted.

    :param doc_id: dict of key values used to format the SQL templates
    """
    ann_rows = []
    du.query_data(sql_ann_ptn.format(**doc_id), pool=db_pool, container=ann_rows)
    if not ann_rows:
        logging.info('%s anns not found' % doc_id)
        return
    doc_anns = json.loads(ann_rows[0]['anns'])
    patient_id = ann_rows[0]['patient_id']

    text_rows = []
    du.query_data(sql_text_ptn.format(**doc_id), pool=db_pool, container=text_rows)
    if not text_rows:
        logging.info('%s text not found' % doc_id)
        return
    text = text_rows[0]['doc_content']

    p2count = predict_doc_phenotypes(str(doc_id), doc_anns, text,
                                     model_factory, concept_mapping,
                                     mention_pattern=mention_pattern)
    if p2count is not None:
        save_dict = doc_id.copy()
        save_dict['result'] = json.dumps(p2count)
        save_dict['patient_id'] = patient_id
        du.query_data(save_result_sql_ptn.format(**save_dict),
                      container=None, pool=db_pool)
        du.query_data(update_doc_sql_ptn.format(**doc_id),
                      container=None, pool=db_pool)
    else:
        # nothing predicted: still mark the doc done so it is not re-picked
        du.query_data(update_doc_sql_ptn.format(**doc_id),
                      container=None, pool=db_pool)
        logging.info('%s empty phenotypes' % doc_id)
    logging.info('%s done' % doc_id)
def populate_episode_study_table(study_analyzer, episode_data, out_path, cohort):
    """Count per-episode concept mentions in three time windows and write one
    TSV file per window.

    Relies on module-level globals not visible here: doc_concept_sql_cohort,
    db_connection, my_host/my_user/my_pwd/my_db/my_sock and count_eps_win -
    TODO confirm their definitions elsewhere in this module.

    :param study_analyzer: provides study_concepts (name + concept_closure)
    :param episode_data: list of episode dicts keyed by 'brcid'
    :param out_path: output folder for the per-window TSV files
    :param cohort: cohort id used in the doc-concept SQL
    """
    study_concepts = study_analyzer.study_concepts
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        print 'working on %s' % sc_key
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        patient_date_tuples = []
        if len(sc.concept_closure) > 0:
            data_sql = doc_concept_sql_cohort.format(**{'concepts': concept_list,
                                                        'cohort': cohort,
                                                        'extra_constrains': ''})
            print data_sql
            # mysql connection only when configured; otherwise None is passed
            dutil.query_data(data_sql, patient_date_tuples,
                             dbconn=dutil.get_mysqldb_connection(my_host, my_user,
                                                                 my_pwd, my_db, my_sock)
                             if db_connection == 'mysql' else None)
        # filter patient_date tuples using episode constraints
        for eps in episode_data:
            for row in patient_date_tuples:
                if eps['brcid'] == str(row['brcid']):
                    # lazily create the per-concept window counters
                    eps[sc.name] = {'win1': 0, 'win2': 0, 'win3': 0} \
                        if sc.name not in eps else eps[sc.name]
                    count_eps_win(eps, sc.name, row, 'win1')
                    count_eps_win(eps, sc.name, row, 'win2')
                    count_eps_win(eps, sc.name, row, 'win3')
    # assemble one row per episode, with a column per study concept,
    # tracked separately for each of the three windows
    rows = []
    headers = ['brcid'] + [sc.name for sc in study_concepts]
    for eps in episode_data:
        r = {'win1': [eps['brcid']], 'win2': [eps['brcid']], 'win3': [eps['brcid']]}
        for sc in study_concepts:
            if sc.name in eps:
                r['win1'].append(str(eps[sc.name]['win1']))
                r['win2'].append(str(eps[sc.name]['win2']))
                r['win3'].append(str(eps[sc.name]['win3']))
            else:
                r['win1'].append('0')
                r['win2'].append('0')
                r['win3'].append('0')
        rows.append(r)
    # one TSV per window
    for w in ['win1', 'win2', 'win3']:
        s = '\t'.join(headers) + '\n'
        for r in rows:
            s += '\t'.join(r[w]) + '\n'
        utils.save_string(s, out_path + '/weeks_eps' + w + '_control.tsv')
def download_docs(doc_ids, query, db_conn_setting, out_put_folder):
    """
    download clinical notes from EHR
    :param doc_ids: document ids to fetch
    :param query: SQL template with an {ids} placeholder
    :param db_conn_setting: db connection setting file
    :param out_put_folder: folder to write one .txt file per doc
    :return:
    """
    db_cnn = dutil.get_db_connection_by_setting(db_conn_setting)
    results = []
    q = query.format(**{'ids': ','.join(['\'%s\'' % did for did in doc_ids])})
    # FIX: the query was previously printed twice; once is enough
    # (also parenthesised so it works on both Python 2 and Python 3)
    print('querying [%s]' % q)
    dutil.query_data(q, results, db_cnn)
    for r in results:
        if r['textcontent'] is not None:
            # decode cp1252 and strip carriage returns before saving as utf-8
            utils.save_string(r['textcontent'].decode('cp1252').replace(chr(13), ' '),
                              join(out_put_folder, r['cn_doc_id'] + '.txt'),
                              encoding='utf-8')
def get_db_docs_for_converting(settings):
    """Query doc keys from the database and convert each doc's annotations
    into eHOST files via AnnConverter.

    :param settings: dict with 'sql', 'db_conn', 'sql_temp',
        'full_text_folder' and 'ann_folder'
    """
    sql = settings['sql']
    db_conn = settings['db_conn']
    doc_ann_sql_temp = settings['sql_temp']
    full_text_folder = settings['full_text_folder']
    ann_folder = settings['ann_folder']
    results = []
    dbutils.query_data(sql, results, dbutils.get_db_connection_by_setting(db_conn))
    # idiom: copy the rows directly instead of an element-by-element append loop
    ds = list(results)
    logging.info('total docs %s' % len(ds))
    for d in ds:
        AnnConverter.convert_text_ann_from_db(sql_temp=doc_ann_sql_temp,
                                              pks=d,
                                              db_conn=db_conn,
                                              full_text_folder=full_text_folder,
                                              ann_folder=ann_folder)
def db_doc_process(cnn, row, sql_template, pks, update_template, dbcnn_file,
                   text_reader, sa, ruler, update_status_template):
    """Post-process one doc's annotations with the study ruler and write back.

    Fetches the doc's anns, applies process_doc_rule, and either updates the
    serialised annotations (when concepts were found) or, if configured,
    marks the doc as having no concepts.

    :param cnn: worker-dedicated db connection used for the update queries
    :param row: dict holding this doc's primary key values
    :param sql_template: query template formatted with the pk values
    :param pks: primary key column names
    :param update_template: update template for the post-processed anns
    :param dbcnn_file: db connection setting file (for the read query)
    :param text_reader: full-text reader passed to process_doc_rule
    :param sa: study analyzer
    :param ruler: rule executor
    :param update_status_template: optional template to flag no-concept docs
    """
    sql = sql_template.format(*[row[k] for k in pks])
    rets = []
    db.query_data(sql, rets, db.get_db_connection_by_setting(dbcnn_file))
    if len(rets) > 0:
        anns = json.loads(fix_escaped_issue(rets[0]['anns']))
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        no_concepts = False
        if len(ann_doc.annotations) > 0:
            num_concepts = process_doc_rule(ann_doc, ruler, text_reader,
                                            [row[k] for k in pks], sa)
            if num_concepts > 0:
                # serialised anns first, then the pk values, matching the
                # positional placeholders of update_template
                update_query = update_template.format(*(
                    [db.escape_string(json.dumps(ann_doc.serialise_json()))] +
                    [row[k] for k in pks]))
                # logging.debug('update ann: %s' % update_query)
                db.query_data(update_query, None, dbconn=cnn)
                logging.info('ann %s updated' % row)
            else:
                no_concepts = True
        else:
            no_concepts = True
        if no_concepts and update_status_template is not None:
            # optionally mark the doc so it is not reprocessed
            q = update_status_template.format(*[row[k] for k in pks])
            db.query_data(q, None, dbconn=cnn)
            logging.debug('no concepts found/update %s' % q)
def extract_cohort_docs(self):
    """Extract documents for the loaded cohort in batches and save each
    doc's content to a file named by its doc id.

    Uses DSN-based ODBC settings when configured; batch size defaults to 50.
    """
    db_conf_file = self._cohort_conf
    db_conf = None
    if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
        # need dsn settings
        db_conf = self.populate_linux_odbc_setting()
        db_conf_file = None
        logging.info('using dsn %s' % db_conf['dsn'])
    query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
    file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
    out_put_folder = self._conf['out_put_folder']
    if len(self._patient_ids) == 0:
        logging.info('cohort is empty, has it been loaded?')
        return
    q_temp = self._conf['doc_query_temp']
    logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
    for idx in range(0, len(self._patient_ids), query_size):
        q = q_temp.format(**{'patient_ids': ",".join(
            ["'%s'" % p for p in self._patient_ids[idx:idx + query_size]])})
        # FIX: idx is a row offset; report the actual batch number
        logging.info('querying batch %s' % (idx // query_size + 1))
        logging.debug(q)
        docs = []
        db.query_data(q, docs,
                      db.get_db_connection_by_setting(db_conf_file, db_conf))
        for d in docs:
            # FIX: skip docs with no content - the sibling extractor guards
            # against None and save_string would fail on it here too
            if d['doc_content'] is None:
                continue
            utils.save_string(d['doc_content'],
                              join(out_put_folder, file_pattern % d['doc_id']))
    logging.info('query finished, docs saved to %s' % out_put_folder)
def collect_patient_morbidity_result(phenotypes, sql_result, db_conf, output_folder): columns = ['patient_id'] + phenotypes # 1. query all results, assuming the result size is not too BIG (<1m) and ordered by patient_id db_pool = du.get_mysql_pooling(db_conf, num=10) container = [] du.query_data(sql_result, pool=db_pool, container=container) data = {} phenotype2subtypes = {} phenotype2data = {} for phe in phenotypes: phenotype2data[phe] = [] cur_p = None cur_row = initial_morbidity_row(phenotypes) cur_phenotype_detail = None for d in container: if d['patient_id'] != cur_p: add_data_row(data, cur_p, cur_row, phenotypes, phenotype2data, cur_phenotype_detail) cur_p = d['patient_id'] cur_row = initial_morbidity_row(phenotypes) cur_phenotype_detail = initial_phenotype_details(phenotypes) logging.info('working on [%s]...' % cur_p) doc_result = json.loads(d['result']) for p in doc_result: cur_row[p] += doc_result[p]['freq'] add_phenotype_detail(cur_phenotype_detail, p, doc_result[p]['cui2freq'], phenotype2subtypes) add_data_row(data, cur_p, cur_row, phenotypes, phenotype2data, cur_phenotype_detail) df = pd.DataFrame(data) output_file = join(output_folder, 'icd_chapter_results.csv') df.to_csv(output_file, index=False) logging.info('data saved to %s' % output_file) for p in phenotypes: populate_subtype_output(phenotype2subtypes[p], phenotype2data[p], join(output_folder, '%s.csv' % p))
def do_action_trans_docs(docs, nlp, doc_ann_sql_template, doc_content_sql_template,
                         action_trans_update_sql_template, db_conn_file,
                         corpus_predictor):
    """
    do actionable transparency prediction on a batch of docs. this function is
    to supposed to be called in a single thread
    :param docs: list of dicts each carrying a 'docid'
    :param nlp: loaded NLP model
    :param doc_ann_sql_template: query template for a doc's annotations
    :param doc_content_sql_template: query template for a doc's content
    :param action_trans_update_sql_template: update template (acc, AnnId)
    :param db_conn_file: db connection setting file
    :param corpus_predictor: transparency predictor instance
    :return:
    """
    for doc_id in docs:
        doc_anns = []
        dutil.query_data(
            doc_ann_sql_template.format(doc_id['docid']), doc_anns,
            dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_anns = [{'s': int(ann['s']), 'e': int(ann['e']),
                     'AnnId': str(ann['AnnId']), 'signed_label': '',
                     'gt_label': '', 'action_trans': ann['action_trans']}
                    for ann in doc_anns]
        if len(doc_anns) == 0:
            continue
        if doc_anns[0]['action_trans'] is not None:
            # doc already processed: its first ann carries a transparency value
            # (parenthesised print works on both Python 2 and Python 3)
            print('found trans %s of first ann, skipping doc'
                  % doc_anns[0]['action_trans'])
            continue
        doc_container = []
        dutil.query_data(
            doc_content_sql_template.format(doc_id['docid']), doc_container,
            dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        ptns = tstg.doc_processing(nlp, unicode(doc_container[0]['content']),
                                   doc_anns, doc_id['docid'])
        for inst in ptns:
            # NOTE: 'predcit' is the (misspelled) method name exposed by the
            # predictor API - do not "fix" the spelling here
            acc = corpus_predictor.predcit(inst)
            anns = inst.annotations
            sql = action_trans_update_sql_template.format(
                **{'acc': acc, 'AnnId': anns[0]['AnnId']})
            dutil.query_data(
                sql, container=None,
                dbconn=dutil.get_db_connection_by_setting(db_conn_file))
def smp_export(patient_id, es, corpus_mapping, sql_temp, db_cnn):
    """
    structured medical profile extraction
    :param es: elasticsearch index
    :param patient_id:
    :param corpus_mapping: mapping used by parse_discharge_summary
    :param sql_temp: SQL template (patient_id, doc_id, smp placeholders)
    :param db_cnn: db connection setting file
    :return:
    """
    # parenthesised prints work on both Python 2 and Python 3
    print('indexing %s' % patient_id)
    ds_ids = mimicdao.get_summary_doc_by_patient(patient_id)
    for r in ds_ids:
        doc = es.get_doc_detail(r['row_id'])
        profile = parse_discharge_summary(doc['fulltext'], doc['anns'],
                                          corpus_mapping)
        # section-name -> section dict; renamed from `mp` to avoid shadowing
        # the `mp` module name used elsewhere in this file
        sec_map = {}
        for sec in profile:
            t = sec['section'] if sec['section'] != '' else 'basic'
            t = t.replace(' ', '_')
            sec_map[t] = sec
        file_name = '%s_%s.json' % (patient_id, r['row_id'])
        db.query_data(sql_temp.format(**{'patient_id': patient_id,
                                         'doc_id': r['row_id'],
                                         'smp': db.escape_string(json.dumps(sec_map))}),
                      None, dbconn=db.get_db_connection_by_setting(db_cnn))
        print('%s indexed' % file_name)
def get_concepts(output_file):
    """Build a patient-by-concept frequency table for autoimmune concepts.

    Curated-correct concepts come first in the output columns, followed by a
    '=sep=' marker and the non-curated non-empty concepts; an 'any' column
    accumulates the total across all concepts. Relies on module-level SQL
    templates (autoimmune_concepts_sql, patients_sql,
    autoimmune_sympton_freq_sql).

    :param output_file: destination TSV path
    """
    curated_mappings = utils.load_json_data('./resources/curated_mappings.json')
    autoimmune_concepts = []
    patients = []
    dutil.query_data(autoimmune_concepts_sql, autoimmune_concepts)
    # parenthesised prints work on both Python 2 and Python 3
    print('{} concepts read'.format(len(autoimmune_concepts)))
    dutil.query_data(patients_sql, patients)
    print(patients[0])
    # patient dic
    patient_dic = {}
    for p in patients:
        patient_dic[p['brcid']] = p
    non_empty_curated_concepts = []
    non_empty_not_curated_concepts = []
    empty_concepts = []
    for co in autoimmune_concepts:
        c = co['concept_name']
        sympton_freq_result = []
        print(autoimmune_sympton_freq_sql.format(c))
        # single quotes doubled for SQL string-literal escaping
        dutil.query_data(
            autoimmune_sympton_freq_sql.format(c.replace("'", "''")),
            sympton_freq_result)
        if len(sympton_freq_result) > 0:
            if c in curated_mappings and curated_mappings[c] is not None \
                    and curated_mappings[c] == 'correct':
                non_empty_curated_concepts.append(c)
            else:
                non_empty_not_curated_concepts.append(c)
        else:
            empty_concepts.append(c)
        for sf in sympton_freq_result:
            patient_dic[sf['brcid']][c] = sf['num']
            # running total across all concepts per patient
            patient_dic[sf['brcid']]['any'] = sf['num'] + \
                (patient_dic[sf['brcid']]['any']
                 if 'any' in patient_dic[sf['brcid']] else 0)
    p_attrs = ['brcid', 'primary_diag', 'diagnosis_date', 'dob',
               'gender_id', 'ethnicitycleaned']
    d_attrs = sorted(non_empty_curated_concepts)
    d_attrs = ['any'] + d_attrs + ['=sep='] + sorted(non_empty_not_curated_concepts)
    s = '\t'.join(p_attrs) + '\t' + '\t'.join(d_attrs) + '\n'
    for p in patients:
        s += '\t'.join([str(p[k]) for k in p_attrs]) + '\t' + '\t'.join(
            ['0' if c not in p else str(p[c]) for c in d_attrs]) + '\n'
    utils.save_string(s, output_file)
    print(json.dumps(empty_concepts))
def generate_result_in_one_iteration(cohort_name, study_analyzer, out_file,
                                     sample_size, sample_out_file,
                                     doc_to_brc_sql, brc_sql, anns_iter_sql,
                                     skip_term_sql, doc_content_sql,
                                     db_conn_file):
    """
    generate result in one iteration over all annotations. this is supposed
    to be much faster when working on large study concepts. But
    post-processing using rules not supported now
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param sample_size:
    :param sample_out_file:
    :param doc_to_brc_sql:
    :param brc_sql:
    :param anns_iter_sql:
    :param skip_term_sql:
    :param doc_content_sql:
    :param db_conn_file:
    :return:
    """
    # populate concept to anns maps
    sc2anns = {}
    for sc in study_analyzer.study_concepts:
        sc2anns[sc.name] = []

    # populate patient list
    print 'populating patient list...'
    patients = {}
    rows_container = []
    dutil.query_data(brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        patients[r['brcid']] = {'brcid': r['brcid']}

    # populate document id to patient id dictionary
    print 'populating doc to patient map...'
    rows_container = []
    dutil.query_data(doc_to_brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    doc2brc = {}
    for dp in rows_container:
        doc2brc[dp['doc_id']] = dp['brcid']

    # query annotations
    # NOTE(review): in the join below `+` binds tighter than the conditional,
    # so when extra_constrains is configured the skip-term constraint is
    # dropped from the SQL - likely an operator-precedence bug; confirm the
    # intended semantics before changing
    print 'iterating annotations...'
    rows_container = []
    dutil.query_data(
        anns_iter_sql.format(
            **{'cohort_id': cohort_name,
               'extra_constrains':
                   ' \n '.join(
                       [generate_skip_term_constrain(study_analyzer, skip_term_sql)]
                       + [] if (study_analyzer.study_options is None or
                                study_analyzer.study_options['extra_constrains'] is None)
                       else study_analyzer.study_options['extra_constrains'])}),
        rows_container,
        dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        concept_id = r['inst_uri']
        brcid = doc2brc[r['doc_id']] if r['doc_id'] in doc2brc else None
        if brcid is None:
            print 'doc %s not matched to a patient!!!' % r['doc_id']
            continue
        patient = patients[brcid] if brcid in patients else None
        if patient is None:
            print 'brc id %s not matched a patient!!!' % brcid
            continue
        # get matched study concepts: count the mention per patient and
        # remember the annotation for sampling later
        for sc in study_analyzer.study_concepts:
            if concept_id in sc.concept_closure:
                patient[sc.name] = (patient[sc.name] + 1) if sc.name in patient else 1
                sc2anns[sc.name].append({'ann_id': r['ann_id'],
                                         'doc_id': r['doc_id'],
                                         'concept_id': concept_id,
                                         'start': r['start_offset'],
                                         'end': r['end_offset']})

    # generate result table
    print 'generate result table...'
    concept_labels = sorted([k for k in sc2anns])
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    lines = []
    utils.multi_thread_tasking([patients[pid] for pid in patients], 40,
                               do_put_line,
                               args=[concept_labels, lines])
    s += '\n'.join(lines)
    utils.save_string(s, out_file)

    # generate sample annotations: reservoir-less random pick of up to
    # sample_size anns per concept (removes picked items to avoid repeats)
    term_to_docs = {}
    for concept in sc2anns:
        ann_ids = sc2anns[concept]
        sample_ids = []
        if len(ann_ids) <= sample_size:
            sample_ids = ann_ids
        else:
            for i in xrange(sample_size):
                index = random.randrange(len(ann_ids))
                sample_ids.append(ann_ids[index])
                del ann_ids[index]
        term_to_docs[concept] = sample_ids

    # query doc contents
    print 'populating term to sampled anns...'
    term_to_sampled = {}
    for term in term_to_docs:
        sample_ids = term_to_docs[term]
        if len(sample_ids) <= 0:
            continue
        sample_doc_ids = ['\'' + s['doc_id'] + '\'' for s in sample_ids]
        rows_container = []
        dutil.query_data(doc_content_sql.format(','.join(sample_doc_ids)),
                         rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_to_content = {}
        for r in rows_container:
            doc_to_content[r['doc_id']] = r['TextContent']
        term_sampled = []
        # NOTE: loop variable s shadows the result-table string above;
        # harmless since that string has already been saved
        for s in sample_ids:
            term_sampled.append({'id': s['doc_id'],
                                 'content': doc_to_content[s['doc_id']],
                                 'annotations': [{'start': s['start'],
                                                  'end': s['end'],
                                                  'concept': s['concept_id']}]})
        term_to_sampled[term] = term_sampled
    utils.save_json_array(convert_encoding(term_to_sampled, 'cp1252', 'utf-8'),
                          sample_out_file)
def random_extract_annotated_docs(cohort_name, study_analyzer, out_file,
                                  docs_by_term_sql, docs_by_cohort_sql,
                                  docs_by_ids_sql, sample_size=5):
    """Randomly sample up to sample_size annotated docs per study concept and
    save them (with their annotations) as a JSON array.

    In cohort-based mode (study option 'sample_non_hits' set) a single sample
    is drawn from the whole cohort and the concept loop stops after the first
    concept.

    :param cohort_name: cohort id used in the SQL templates
    :param study_analyzer: provides study_concepts and study_options
    :param out_file: output JSON path
    :param docs_by_term_sql: template: docs mentioning the concept closure
    :param docs_by_cohort_sql: template: docs of the whole cohort
    :param docs_by_ids_sql: template: contents+anns for the sampled doc ids
    :param sample_size: max docs sampled per concept
    """
    term_to_docs = {}
    study_concepts = study_analyzer.study_concepts
    # cohort-based sampling only when 'sample_non_hits' is explicitly truthy
    is_NOT_cohort_based = study_analyzer.study_options is None \
        or study_analyzer.study_options['sample_non_hits'] is None \
        or (not study_analyzer.study_options['sample_non_hits'])
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        doc_ids = []
        if len(sc.concept_closure) > 0:
            if is_NOT_cohort_based:
                # NOTE(review): `+` binds tighter than the conditional here, so
                # when extra_constrains is configured the skip-term constraint
                # is dropped - likely an operator-precedence bug; confirm
                # intended semantics before changing
                dutil.query_data(
                    docs_by_term_sql.format(
                        **{'concepts': concept_list,
                           'cohort_id': cohort_name,
                           'extra_constrains':
                               ' \n '.join(
                                   [generate_skip_term_constrain(study_analyzer)]
                                   + [] if (study_analyzer.study_options is None or
                                            study_analyzer.study_options['extra_constrains'] is None)
                                   else study_analyzer.study_options['extra_constrains'])}),
                    doc_ids)
            else:
                doc_sql = docs_by_cohort_sql.format(
                    **{'cohort_id': cohort_name,
                       'extra_constrains':
                           ' and '.join(
                               [generate_skip_term_constrain(study_analyzer)]
                               + [] if (study_analyzer.study_options is None or
                                        study_analyzer.study_options['extra_constrains'] is None)
                               else study_analyzer.study_options['extra_constrains'])})
                print doc_sql
                dutil.query_data(doc_sql, doc_ids)
        if len(doc_ids) > 0:
            # random pick without replacement (removes picked entries)
            sample_ids = []
            if len(doc_ids) <= sample_size:
                sample_ids = [r['CN_Doc_ID'] for r in doc_ids]
            else:
                for i in xrange(sample_size):
                    index = random.randrange(len(doc_ids))
                    sample_ids.append(doc_ids[index]['CN_Doc_ID'])
                    del doc_ids[index]
            doc_list = ', '.join(['\'%s\'' % d for d in sample_ids])
            docs = []
            doc_sample_sql = docs_by_ids_sql.format(
                **{'docs': doc_list,
                   'concepts': concept_list,
                   'extra_constrains': generate_skip_term_constrain(study_analyzer)})
            print doc_sample_sql
            dutil.query_data(doc_sample_sql, docs)
            # group the per-annotation rows into one object per document
            # (rows for the same doc are assumed adjacent in the result)
            doc_objs = []
            prev_doc_id = ''
            doc_obj = None
            for d in docs:
                if prev_doc_id != d['CN_Doc_ID']:
                    doc_obj = {'id': d['CN_Doc_ID'],
                               'content': d['TextContent'],
                               'annotations': [],
                               'doc_table': d['src_table'],
                               'doc_col': d['src_col']}
                    doc_objs.append(doc_obj)
                    prev_doc_id = d['CN_Doc_ID']
                doc_obj['annotations'].append({'start': d['start_offset'],
                                               'end': d['end_offset'],
                                               'concept': d['inst_uri']})
            term_to_docs[sc.name] = doc_objs
        if not is_NOT_cohort_based:
            # cohort-based sampling: one sample for the whole cohort
            break
    utils.save_json_array(term_to_docs, out_file)
    print 'done'
def load_all_docs():
    """Fetch every hep-C-positive document row from the CRIS view.

    Returns a list of row dicts with the columns TextContent, Date,
    src_table, src_col, BrcId and CN_Doc_ID.
    """
    doc_query = "select TextContent, Date, src_table, src_col, BrcId, CN_Doc_ID from sqlcris_user.KConnect.vw_hepcpos_docs"
    rows = []
    dutil.query_data(doc_query, rows)
    return rows
def get_doc_detail_by_id(doc_id, fulltext_date_by_doc_id):
    """Retrieve the full-text/date rows for a single document.

    :param doc_id: document identifier substituted into the SQL template
    :param fulltext_date_by_doc_id: SQL template with a ``{doc_id}`` placeholder
    :return: list of result-row dicts (possibly empty)
    """
    rows = []
    dutil.query_data(fulltext_date_by_doc_id.format(**{'doc_id': doc_id}), rows)
    return rows
def populate_patient_study_table_post_ruled(cohort_name, study_analyzer, out_file, rule_executor, sample_size,
                                            sample_out_file, ruled_ann_out_file,
                                            patients_sql, term_doc_anns_sql, skip_term_sql, db_conn_file,
                                            text_preprocessing=False):
    """Populate the patient/concept frequency table with rule-based post-filtering.

    Runs every mention annotation through a cascade of filters (skip-term list,
    original-string rules, post-processing rules, bio-yodie contextual labels);
    mentions that survive are counted per patient and per concept, and the
    results are written out as a TSV table plus JSON dumps of the positive and
    ruled-out annotations and a JS file of sampled positive documents.

    :param cohort_name: cohort identifier substituted into the SQL templates
    :param study_analyzer: provides study_concepts and study_options
    :param out_file: path of the TSV patient table (JSON dump uses out_file + "_json")
    :param rule_executor: rule engine exposing skip_terms, execute() and
        execute_original_string_rules()
    :param sample_size: maximum number of positive documents sampled per concept
    :param sample_out_file: path of the JS file carrying sampled documents
    :param ruled_ann_out_file: path of the JSON file of ruled-out annotations
    :param patients_sql: SQL template selecting the cohort patients
    :param term_doc_anns_sql: SQL template selecting annotations per concept
    :param skip_term_sql: SQL template used to build the skip-term constraint
    :param db_conn_file: DB connection settings file
    :param text_preprocessing: when True, preprocess the document text before
        rule execution
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p
    non_empty_concepts = []
    term_to_docs = {}
    ruled_anns = []
    positive_dumps = []
    skip_terms_list = [t.lower() for t in rule_executor.skip_terms]
    for sc in study_analyzer.study_concepts:
        positive_doc_anns = []
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        doc_anns = []
        if len(sc.concept_closure) > 0:
            # BUG FIX: the original wrote
            #   [skip_constraint] + [] if cond else extras
            # which parses as ([skip_constraint] + []) if cond else extras, so the
            # skip-term constraint was discarded whenever extra constraints were
            # configured. Parenthesize to always keep the skip-term constraint.
            constrain_list = [generate_skip_term_constrain(study_analyzer, skip_term_sql)] + \
                ([] if (study_analyzer.study_options is None
                        or study_analyzer.study_options['extra_constrains'] is None)
                 else study_analyzer.study_options['extra_constrains'])
            data_sql = term_doc_anns_sql.format(**{'concepts': concept_list,
                                                   'cohort_id': cohort_name,
                                                   'extra_constrains': ' \n '.join(constrain_list)})
            print(data_sql)
            dutil.query_data(data_sql, doc_anns,
                             dbconn=dutil.get_db_connection_by_setting(db_conn_file))
            if len(doc_anns) > 0:
                p_to_dfreq = {}
                counted_docs = set()
                for ann in doc_anns:
                    p = ann['brcid']
                    d = ann['CN_Doc_ID']
                    if d in counted_docs:
                        # count each document once per concept
                        continue
                    # NOTE: some result sets lack the string_orig column, hence
                    # the guarded access used consistently below
                    string_orig = ann['string_orig'] if 'string_orig' in ann else ''
                    ruled = False
                    case_instance = ''
                    rule = ''
                    # 1) skip-term list
                    if 'string_orig' in ann and ann['string_orig'].lower() in skip_terms_list:
                        ruled = True
                        rule = 'skip-term'
                        case_instance = ann['string_orig']
                    if not ruled:
                        # 2) original-string rules - not used now
                        ruled, case_instance = \
                            rule_executor.execute_original_string_rules(
                                ann['string_orig'] if 'string_orig' in ann
                                else ann['TextContent'][int(ann['start_offset']):int(ann['end_offset'])])
                        rule = 'original-string-rule'
                    if not ruled:
                        # 3) post-processing rules on the (optionally preprocessed) text
                        ruled, case_instance, rule = \
                            rule_executor.execute(
                                ann['TextContent'] if not text_preprocessing
                                else preprocessing_text_befor_rule_execution(ann['TextContent']),
                                int(ann['start_offset']), int(ann['end_offset']),
                                string_orig=ann['string_orig'] if 'string_orig' in ann else None)
                        rule = 'semehr ' + rule
                    if not ruled:
                        # 4) bio-yodie contextual labels: only affirmed, recent,
                        # patient-experienced mentions count as positive
                        if 'experiencer' in ann:
                            if ann['experiencer'].lower() != 'patient' or \
                                    ann['temporality'].lower() != 'recent' or \
                                    ann['negation'].lower() != 'affirmed':
                                ruled = True
                                case_instance = '\t'.join([ann['experiencer'],
                                                           ann['temporality'],
                                                           ann['negation']])
                                rule = 'yodie'
                    if ruled:
                        # ROBUSTNESS FIX: original accessed ann['string_orig']
                        # unguarded here (and in positive_dumps below), raising
                        # KeyError on result sets without that column
                        ruled_anns.append({'p': p, 'd': d, 'ruled': rule,
                                           's': ann['start_offset'],
                                           'e': ann['end_offset'],
                                           'c': ann['inst_uri'],
                                           'case-instance': case_instance,
                                           'string_orig': string_orig})
                    else:
                        counted_docs.add(d)
                        p_to_dfreq[p] = 1 if p not in p_to_dfreq else 1 + p_to_dfreq[p]
                        positive_doc_anns.append({'id': ann['CN_Doc_ID'],
                                                  'content': ann['TextContent'],
                                                  'annotations': [{'start': ann['start_offset'],
                                                                   'end': ann['end_offset'],
                                                                   'concept': ann['inst_uri'],
                                                                   'string_orig': string_orig}],
                                                  'doc_table': ann['src_table'],
                                                  'doc_col': ann['src_col']})
                        positive_dumps.append({'p': p, 'd': d,
                                               's': ann['start_offset'],
                                               'e': ann['end_offset'],
                                               'c': ann['inst_uri'],
                                               'string_orig': string_orig})
                if len(counted_docs) > 0:
                    non_empty_concepts.append(sc_key)
                    for p in p_to_dfreq:
                        id2p[p][sc_key] = str(p_to_dfreq[p])
                    # save up to sample_size positive docs per concept
                    if sample_size >= len(positive_doc_anns):
                        term_to_docs[sc_key] = positive_doc_anns
                    else:
                        term_to_docs[sc_key] = random.sample(positive_doc_anns, sample_size)
    # build the TSV patient table (list + join instead of quadratic s += ...)
    concept_labels = sorted(non_empty_concepts)
    lines = ['\t'.join(['brcid'] + concept_labels)]
    for p in patients:
        lines.append('\t'.join([p['brcid']] + [p[k] if k in p else '0' for k in concept_labels]))
    utils.save_string('\n'.join(lines) + '\n', out_file)
    utils.save_string('var sample_docs=' + json.dumps(convert_encoding(term_to_docs, 'cp1252', 'utf-8')),
                      sample_out_file)
    utils.save_json_array(convert_encoding(ruled_anns, 'cp1252', 'utf-8'), ruled_ann_out_file)
    utils.save_json_array(positive_dumps, out_file + "_json")
    print('done')