def merge_and_output(dir_path, cohort, default_results='hepc_results.json'):
    headers = ['all', 'positive', 'Negated', 'hypothetical', 'historical', 'Other', 'first_pos_time']
    results = {}
    for pid in cohort:
        results[pid] = {}
    c_results = utils.load_json_data(join(dir_path, default_results))
    for p in c_results:
        results[p['id']] = p
    for f in [f for f in listdir(dir_path) if isfile(join(dir_path, f))]:
        if f != default_results:
            c_results = utils.load_json_data(join(dir_path, f))
            d = f.replace('_results.json', '')
            print f
            if d not in headers:
                headers.append(d)
            for p in c_results:
                results[p['id']][d] = p['all']
    s = '\t'.join(['id'] + headers) + '\n'
    for pid in results:
        p = results[pid]
        row = [pid] + ['-' if h not in p else str(p[h]) for h in headers]
        s += '\t'.join(row) + '\n'
    utils.save_string(s, './valid_doc_files/merged_output_liverdiseases.tsv')
    print 'output generated'
def from_json(json_object, user):
    if json_object['id'] != user.id:
        raise Exception(f"Can't load user {user.id}: IDs don't match!")
    user.afk_mentions = load_json_data(json_object, 'afk_mentions', True)
    user.xp = load_json_data(json_object, 'xp', 0)
    user.last_login = load_json_data(json_object, 'last_login', None)
    user.last_daily_reward = load_json_data(json_object, 'last_daily_reward', None)
    if user.active_since is not None:
        user.last_active_xp = load_json_data(json_object, 'last_active_xp', None)
    user.muted = bool(load_json_data(json_object, 'muted', False))
    user.muted_until = load_json_data(json_object, 'muted_until', None)
    user.deaf = bool(load_json_data(json_object, 'deaf', False))
    user.deaf_until = load_json_data(json_object, 'deaf_until', None)
    if user.last_login is not None:
        user.last_login = datetime.datetime.strptime(
            user.last_login, '%a %b %d %H:%M:%S %Y')
    if user.last_daily_reward is not None:
        user.last_daily_reward = datetime.datetime.strptime(
            user.last_daily_reward, '%a %b %d %H:%M:%S %Y')
    if user.last_active_xp is not None:
        user.last_active_xp = datetime.datetime.strptime(
            user.last_active_xp, '%a %b %d %H:%M:%S %Y')
    if user.muted_until is not None:
        user.muted_until = datetime.datetime.strptime(
            user.muted_until, '%a %b %d %H:%M:%S %Y')
    if user.deaf_until is not None:
        user.deaf_until = datetime.datetime.strptime(
            user.deaf_until, '%a %b %d %H:%M:%S %Y')
    user.warnings = load_json_data(json_object, 'warnings', [])
    return user
def load(server):
    loaded_server = db.get_server(server.id)
    if loaded_server is None:
        print(f'no save found for {server.id}')
        return False
    if loaded_server['id'] != server.id:
        raise Exception(f"Can't load server {server.id}: IDs don't match!")
    server.lang = load_json_data(loaded_server, 'lang', cfg.get_value('SRV_DEFAULT_LANG'))
    server.bot_text_channel_name = load_json_data(
        loaded_server, 'bot_text_channel_name',
        cfg.get_value('SRV_DEFAULT_BOT_TEXT_CHANNEL_NAME'))
    server.log_text_channel_name = load_json_data(
        loaded_server, 'log_text_channel_name',
        cfg.get_value('SRV_DEFAULT_LOGS_TEXT_CHANNEL_NAME'))
    server.cmd_prefix = load_json_data(
        loaded_server, 'cmd_prefix', cfg.get_value('SRV_DEFAULT_CMD_PREFIX_NAME'))
    server.admin_logs = load_json_data(
        loaded_server, 'admin_logs',
        bool(cfg.get_value('SRV_DEFAULT_DISPLAY_ADMIN_LOGS')))
    server.group_perks = load_json_data(loaded_server, 'group_perks', {})
    server.use_accept_command = load_json_data(
        loaded_server, 'use_accept',
        bool(cfg.get_value('SRV_DEFAULT_USE_ACCEPT_COMMAND')))
    server.accept_rank = load_json_data(
        loaded_server, 'accept_rank', cfg.get_value('SRV_DEFAULT_ACCEPT_RANK'))
    for key, member in server.members.items():
        for json_member in load_json_data(loaded_server, 'members', []):
            if load_json_data(json_member, 'id', -1) == member.id:
                User.from_json(json_member, member)
                break
    return True
def dump_mention_detail(studies_folder, include_study_pattern, dump_tsv_file, dump_concept_file):
    reg_p = re.compile(include_study_pattern)
    rows = ['\t'.join(['concept', 'pt', 'doc', 's', 'e', 'label', 'ruled'])]
    c_group = {}
    for f in listdir(studies_folder):
        m = reg_p.match(f)
        if m is not None:
            ruled_file = join(studies_folder, f, 'ruled_anns.json')
            if isfile(ruled_file):
                # e.g. {"p": "pid", "s": 3356, "e": 3365, "d": "did", "case-instance": ["..."],
                #       "c": "C0000833", "string_orig": "abscesses",
                #       "ruled": "semehr hypothetical_filters.json"}
                ruleds = utils.load_json_data(ruled_file)
                for r in ruleds:
                    rows.append('\t'.join([
                        r['c'], r['p'], r['d'], str(r['s']), str(r['e']),
                        r['string_orig'], r['ruled']
                    ]))
                    increase_freq_on_dict(
                        c_group, r['c'], r['ruled'],
                        '-'.join([r['d'], str(r['s']), str(r['e'])]))
            pos_file = join(studies_folder, f, 'result.csv_json')
            if isfile(pos_file):
                # e.g. {"c": "C0000833", "e": 467, "d": "52773120", "string_orig": "abscess",
                #       "p": "10110421", "s": 460}
                poses = utils.load_json_data(pos_file)
                for r in poses:
                    rows.append('\t'.join([
                        r['c'], r['p'], r['d'], str(r['s']), str(r['e']),
                        r['string_orig'], ''
                    ]))
                    increase_freq_on_dict(
                        c_group, r['c'], 'pos',
                        '-'.join([r['d'], str(r['s']), str(r['e'])]))
    rule_headers = [
        'semehr negation_filters.json', 'semehr hypothetical_filters.json',
        'semehr not_mention_filters.json', 'semehr other_experiencer_filters.json',
        'semehr cris_document_filters.json', 'skip-term', 'semehr s_skin.json',
        'semehr s_karen.json', 'yodie', 'pos'
    ]
    c_rows = ['\t'.join(['concept'] + rule_headers)]
    for c in c_group:
        co = c_group[c]
        c_rows.append(
            '\t'.join([c] + [str(co[h]) if h in co else '0' for h in rule_headers]))
    utils.save_string('\n'.join(rows), dump_tsv_file)
    utils.save_string('\n'.join(c_rows), dump_concept_file)
    print 'dumped to %s' % dump_tsv_file
def do_phenotype_analysis(phenotype_result_file, c_map_file, output_folder):
    c_map = utils.load_json_data(c_map_file)
    p_map = utils.load_json_data(phenotype_result_file)
    # extract performances of phenotypes
    headers = ["posM", "hisM", "negM", "otherM", "wrongM"]
    rows = ['\t'.join(["phenotype"] + headers)]
    for p in p_map:
        v = p_map[p]['validation']
        if v is None or len(v) == 0:
            continue
        rows.append('\t'.join([p] + [str(v[h]) if h in v else '0' for h in headers]))
    utils.save_string('\n'.join(rows), join(output_folder, 'phenotype_performance.tsv'))
def load_rule_config(self, config_file):
    rule_config = utils.load_json_data(config_file)
    r_path = rule_config['rules_folder']
    print 'loading rules from [%s]' % r_path
    for rf in rule_config['active_rules']:
        for r in utils.load_json_data(join(r_path, rf)):
            self.add_filter_rule(r['offset'], r['regs'], rule_name=rf)
        print '%s loaded' % rf
    if 'osf_rules' in rule_config:
        for osf in rule_config['osf_rules']:
            self.add_original_string_filters(utils.load_json_data(join(r_path, osf)))
            print 'original string filters from [%s] loaded' % osf
    if 'skip_term_setting' in rule_config:
        self.skip_terms = utils.load_json_data(rule_config['skip_term_setting'])
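# A minimal sketch of the rule-config JSON consumed by load_rule_config above. Only the
# key names ('rules_folder', 'active_rules', 'osf_rules', 'skip_term_setting') come from
# the function itself; every folder and file name below is a hypothetical placeholder.
# Each file listed under 'active_rules' is expected to hold a list of objects carrying
# 'offset' and 'regs', as read by add_filter_rule.
example_rule_config = {
    "rules_folder": "./rules",
    "active_rules": ["negation_filters.json", "hypothetical_filters.json"],
    "osf_rules": ["original_string_filters.json"],
    "skip_term_setting": "./rules/skip_terms.json"
}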
def study(folder, episode_file, cohort, date_format='%d/%m/%Y %H:%M'):
    episodes = load_episode_data(episode_file, date_format=date_format)
    p, fn = split(folder)
    if isfile(join(folder, 'study_analyzer.pickle')):
        sa = StudyAnalyzer.deserialise(join(folder, 'study_analyzer.pickle'))
    else:
        sa = StudyAnalyzer(fn)
        if isfile(join(folder, 'exact_concepts_mappings.json')):
            concept_mappings = utils.load_json_data(join(folder, 'exact_concepts_mappings.json'))
            scs = []
            for t in concept_mappings:
                sc = StudyConcept(t, [t])
                t_c = {}
                t_c[t] = [concept_mappings[t]]
                sc.gen_concept_closure(term_concepts=t_c)
                scs.append(sc)
                print sc.concept_closure
            sa.study_concepts = scs
            sa.serialise(join(folder, 'study_analyzer.pickle'))
        else:
            concepts = utils.load_json_data(join(folder, 'study_concepts.json'))
            if len(concepts) > 0:
                scs = []
                for name in concepts:
                    scs.append(StudyConcept(name, concepts[name]))
                    print name, concepts[name]
                sa.study_concepts = scs
                sa.serialise(join(folder, 'study_analyzer.pickle'))

    # compute disjoint concepts
    sa.generate_exclusive_concepts()
    merged_mappings = {}
    for c in sa.study_concepts:
        for t in c.term_to_concept:
            all_concepts = list(c.concept_closure)
            if len(all_concepts) > 1:
                idx = 0
                for cid in all_concepts:
                    merged_mappings['(%s) %s (%s)' % (c.name, t, idx)] = {
                        'closure': len(all_concepts), 'mapped': cid}
                    idx += 1
            else:
                merged_mappings['(%s) %s' % (c.name, t)] = c.term_to_concept[t]
        print c.name, c.term_to_concept, c.concept_closure
        print json.dumps(list(c.concept_closure))
    print json.dumps(merged_mappings)
    print 'generating result table...'
    populate_episode_study_table(sa, episodes, './resources', cohort)
    print 'done'
def load_phenotype_def_into_db():
    db_cnf = '../../studies/COMOB_SD/dbcnn_input.json'
    p_def_file = './data/phenotype_defs.json'
    pd = utils.load_json_data(p_def_file)

    w_sql = """
    insert into tp_phenotype_concepts (phenotype_id, concept_id) values ('{pid}', '{cid}');
    """
    r_sql = """
    select * from tp_phenotypes
    """
    p_data = []
    dutil.query_data(r_sql, p_data, dutil.get_db_connection_by_setting(db_cnf))
    p2pid = {}
    for r in p_data:
        p2pid[r['phenotype_name']] = r['id']
    for p in pd:
        if p not in p2pid:
            print '%s not found in definition table' % p
            continue
        for c in pd[p]['concepts']:
            sql = w_sql.format(**{'pid': p2pid[p], 'cid': c})
            print 'executing [%s]' % sql
            dutil.query_data(sql, None, dbconn=dutil.get_db_connection_by_setting(db_cnf))
    print 'done'
def phenotype_counting(phenotype_def, concept_level_results, output_file):
    pd = utils.load_json_data(phenotype_def)
    npd = {}
    cd = utils.read_text_file(concept_level_results)
    c_headers = cd[0].split('\t')
    headers = [h for h in c_headers[2:len(c_headers) - 1]]
    for r in cd[1:]:
        arr = r.split('\t')
        c = arr[0]
        num_mention = arr[12]
        for p in pd:
            if c in pd[p]['concepts']:
                po = npd[p] if p in npd else {'freq': 0, 'p': p,
                                              'num_concepts': len(pd[p]['concepts'])}
                npd[p] = po
                po['freq'] += int(num_mention)
                for idx in xrange(2, len(arr) - 1):
                    h = headers[idx - 2]
                    po[h] = int(arr[idx]) if h not in po else (int(arr[idx]) + int(po[h]))
    rows = ['\t'.join(['phenotype', 'num concepts'] + headers + ['prevalence'])]
    for p in npd:
        po = npd[p]
        rows.append('\t'.join([p, str(po['num_concepts'])] +
                              [str(po[h]) for h in headers] +
                              [str(po['freq'])]))
    utils.save_string('\n'.join(rows), output_file)
def get_instance_by_setting_file(setting_file_path):
    setting = utils.load_json_data(setting_file_path)
    return SemEHRES.get_instance_by_setting(setting['es_host'],
                                            setting['es_index'],
                                            setting['es_doc_type'],
                                            setting['es_concept_type'],
                                            setting['es_patient_type'])
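# A sketch of the Elasticsearch setting file expected by get_instance_by_setting_file.
# Only the key names are taken from the code above; the host, index and type values are
# illustrative placeholders, not part of the original configuration.
example_es_setting = {
    "es_host": "localhost:9200",
    "es_index": "semehr_index",
    "es_doc_type": "doc",
    "es_concept_type": "concept",
    "es_patient_type": "patient"
}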
def main(args):
    # load dataset information from setup json file
    metad = utils.load_json_data(args.metadata_file)

    # generate data splits and keep them fixed for the whole project
    # MAKE SURE THIS IS ONLY CALLED ONCE
    splits_path = os.path.join(config.data_save_folder, 'data_splits.json')
    utils.create_data_split(metad, splits_path)

    mtracks = []
    for ky in metad.keys():
        mtrack = metad[ky]
        mtrack['filename'] = ky
        mtracks.append(mtrack)

    nmixes = len(metad.keys())
    print("{} mixes to be processed".format(nmixes))
    idx = 0
    Parallel(n_jobs=4, verbose=5)(
        delayed(utils.compute_features_mtrack)(
            mtrack, args.save_dir, args.wavmixes_path, idx
        ) for mtrack in mtracks)
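# One possible way to drive main() above from the command line. The attribute names
# (metadata_file, save_dir, wavmixes_path) mirror what the function reads from args;
# the flag layout and help strings here are illustrative assumptions only.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Compute features for all mixes.')
    parser.add_argument('metadata_file', type=str, help='JSON file describing the dataset.')
    parser.add_argument('save_dir', type=str, help='Folder to write computed features to.')
    parser.add_argument('wavmixes_path', type=str, help='Folder containing the mixed wav files.')
    main(parser.parse_args())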
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), _concept_mapping)
        d = cr.full_text_file_pattern % fk
        for ann in cr.annotations:
            # look up the label on the Concept2Mapping object (not the mapping file path)
            if ann.cui in _cm_obj.cui2label:
                lbl = _cm_obj.cui2label[ann.cui]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation,
                                      ann.temporality, ann.experiencer,
                                      'StudyName', lbl)
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
def run_learning(
        train_ann_dir, train_gold_dir, train_text_dir,
        test_ann_dir, test_gold_dir, test_text_dir, settings):
    log_level = 'DEBUG'
    log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
    logging.basicConfig(level='DEBUG', format=log_format)
    log_file = './settings/processing.log'
    logging.basicConfig(level=log_level, format=log_format)
    global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, \
        _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir
    global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj
    global _annotated_anns
    _annotated_anns = {}
    _min_sample_size = settings['min_sample_size']
    _ann_dir = train_ann_dir
    _gold_dir = train_gold_dir
    _test_ann_dir = test_ann_dir
    _test_gold_dir = test_gold_dir
    _gold_text_dir = train_text_dir
    _test_text_dir = test_text_dir
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings \
        else settings['gold_file_pattern']
    _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False
    _cm_obj = Concept2Mapping(_concept_mapping)

    # not using mention patterns for prediction as this is only an in-development feature
    mp_inst = None
    return do_learn_exp(settings['viz_file'],
                        num_dimensions=[50],
                        ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False,
                        separate_by_label=True,
                        conll_output_file=settings['conll_output_file'],
                        eHostGD=_eHostGD,
                        mention_pattern=mp_inst)
def run_study(folder_path, no_sql_filter=None):
    study_config = 'study.json' if no_sql_filter is None else 'study_no_filter.json'
    if isfile(join(folder_path, study_config)):
        r = utils.load_json_data(join(folder_path, study_config))
        retained_patients = None
        if 'query_patients_file' in r:
            retained_patients = []
            lines = utils.read_text_file(r['query_patients_file'])
            for l in lines:
                arr = l.split('\t')
                retained_patients.append(arr[0])
        study(folder_path, r['cohort'], r['sql_config'], r['db_conn'],
              concept_mapping.get_umls_client_inst(r['umls_key']),
              do_preprocessing=r['do_preprocessing'],
              rule_setting_file=r['rule_setting_file'],
              do_one_iter=r['do_one_iter'],
              sem_idx_setting_file=None if 'sem_idx_setting_file' not in r else r['sem_idx_setting_file'],
              concept_filter_file=None if 'concept_filter_file' not in r else r['concept_filter_file'],
              retained_patients_filter=retained_patients,
              filter_obj_setting=None if 'filter_obj_setting' not in r else r['filter_obj_setting'],
              do_disjoint_computing=True if 'do_disjoint' not in r else r['do_disjoint'],
              export_study_concept_only=False if 'export_study_concept' not in r else r['export_study_concept'])
    else:
        print '%s not found in the folder' % study_config
def doc_infer(settings):
    rules = PhenotypeRule.load_rules(settings['rule_file'])
    d2predicted = utils.load_json_data(settings['doc_nlp_results'])
    doc_labels_output = settings['doc_inference_output']
    s = ''
    doc_type2id = {}
    pids = []
    for d in d2predicted:
        m = re.match(r'Stroke_id_(\d+)(\.\d+){0,1}', d)
        pid = d
        if m is not None:
            pid = m.group(1)
        pids.append(pid)
        label_provs = PhenotypeRuleExecutor.apply_rules(d2predicted[d], rules)
        print(pid, d, label_provs)
        for lp in label_provs:
            if lp['label'] != '':
                s += '%s\t%s\n' % (pid, lp['label'])
                if lp['label'] not in doc_type2id:
                    doc_type2id[lp['label']] = []
                doc_type2id[lp['label']].append(pid)
    pids = list(set(pids))
    print(json.dumps(pids))
    utils.save_string(s, doc_labels_output)
    if 'patient_level_truth_tsv' in settings:
        doc_infer_with_ground_truth(settings['patient_level_truth_tsv'], pids, doc_type2id)
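# An illustrative settings dict for doc_infer. The keys ('rule_file', 'doc_nlp_results',
# 'doc_inference_output' and the optional 'patient_level_truth_tsv') are the ones the
# function reads; every path below is a hypothetical placeholder.
example_infer_settings = {
    "rule_file": "./settings/phenotype_rules.json",
    "doc_nlp_results": "./results/doc_nlp_results.json",
    "doc_inference_output": "./results/doc_labels.tsv",
    "patient_level_truth_tsv": "./data/patient_truth.tsv"
}
# doc_infer(example_infer_settings)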
def load_study_ruler(study_folder, rule_config_file, study_config='study.json'):
    sa = None
    if study_folder is not None and study_folder != '':
        r = utils.load_json_data(join(study_folder, study_config))
        ret = load_study_settings(
            study_folder,
            umls_instance=None,
            rule_setting_file=r['rule_setting_file'],
            concept_filter_file=None if 'concept_filter_file' not in r else r['concept_filter_file'],
            do_disjoint_computing=True if 'do_disjoint' not in r else r['do_disjoint'],
            export_study_concept_only=False if 'export_study_concept' not in r else r['export_study_concept'])
        sa = ret['study_analyzer']
        ruler = ret['ruler']
    else:
        logging.info('no study configuration provided, applying rules to all annotations...')
        ruler = load_ruler(rule_config_file)
    return {'sa': sa, 'ruler': ruler}
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file, new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [
            f for f in listdir(dd) if isfile(join(dd, f)) and f.endswith('.lst')
        ]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n',
                          join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
def gen_box_plot_statistics(folder):
    files = [f for f in listdir(folder) if isfile(join(folder, f))]
    d_dicts = {}
    for f in files:
        ds_name = f[:f.rfind('.')]
        dd = utils.load_json_data(join(folder, f))
        for v in dd:
            if dd[v] is None:
                continue
            if v not in d_dicts:
                d_dicts[v] = {
                    'label': [
                        'min', 'max', 'median', 'q3', 'q1-min', 'q1',
                        'median-q1', 'q3-median', 'max-q3'
                    ]
                }
            v_min = dd[v][0]
            q1 = dd[v][1]
            median = dd[v][2]
            q3 = dd[v][3]
            v_max = dd[v][4]
            d_dicts[v][ds_name] = [
                v_min, v_max, median, q3, q1 - v_min, q1,
                median - q1, q3 - median, v_max - q3
            ]
    for v in d_dicts:
        df = pd.DataFrame(d_dicts[v])
        print('%s\n' % v)
        print(df.head(10))
        print('\n\n')
def regenerate_manual_mapped_concepts(tsv, closure_file):
    selected_concepts = set()
    c2l = {}
    for l in utils.read_text_file(tsv):
        arr = l.split('\t')
        selected_concepts.add(arr[1])
        c2l[arr[1]] = arr[0]
    t2closure = utils.load_json_data(closure_file)
    mapped_concepts = []
    map = {}
    v_map = {}
    for t in t2closure:
        disjoint_list = list(set(t2closure[t]) & selected_concepts)
        if len(disjoint_list) > 0:
            mapped_concepts += disjoint_list
            map[t] = {
                "tc": {
                    "closure": len(disjoint_list),
                    "mapped": disjoint_list[0]
                },
                "concepts": disjoint_list
            }
            v_map[t] = [('%s [%s]' % (c2l[c], c)) for c in disjoint_list]
    print json.dumps(map)
    print selected_concepts - set(mapped_concepts)
    print json.dumps(v_map)
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir)
        if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk), _concept_mapping)
        d = fk
        for ann in cr.annotations:
            if ann.cui in _cm_obj.concept2label:
                lbl = _cm_obj.concept2label[ann.cui][0]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation,
                                      ann.temporality, ann.experiencer,
                                      'StudyName', lbl)
                if ann.negation != 'Affirmed' or len(ann.ruled_by) > 0:
                    continue
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
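# Illustrative settings for direct_nlp_prediction (both copies of the function above read
# the same keys). Only the key names are taken from the code; the paths are placeholders.
example_prediction_settings = {
    "test_ann_dir": "./anns/test",
    "test_fulltext_dir": "./texts/test",
    "concept_mapping_file": "./settings/concept_mapping.json",
    "learning_model_dir": "./models",
    "entity_types_file": "./settings/entity_types.txt",
    "ignore_mapping_file": "./settings/ignore_mappings.json"
}
# doc2predicted = direct_nlp_prediction(example_prediction_settings)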
def convert_text_ann_from_files(full_text_folder, ann_folder, output_folder,
                                full_text_file_pattern='(%s).txt',
                                ann_file_pattern='se_ann_%s.json',
                                output_file_pattern='%s.txt.knowtator.xml',
                                ann_to_convert=None):
    text_files = [
        f for f in listdir(full_text_folder) if isfile(join(full_text_folder, f))
    ]
    p = re.compile(full_text_file_pattern)
    for f in text_files:
        logging.info('working on [%s]' % f)
        m = p.match(f)
        if m is not None:
            fk = m.group(1)
            text = utils.read_text_file_as_string(join(full_text_folder, f))
            anns = utils.load_json_data(join(ann_folder, ann_file_pattern % fk))
            xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, fk),
                                        full_text=text,
                                        ann_to_convert=ann_to_convert)
            utils.save_string(xml, join(output_folder, output_file_pattern % fk))
            utils.save_string(text.replace('\r', ' '), join(full_text_folder, f))
            logging.info('doc [%s] done' % fk)
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='inputs', type=str, nargs="+",
                        help="Input files (JSON) for SPR1 splits.")
    parser.add_argument('-o', dest='output_dir', type=str, required=True,
                        help="Output directory.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    pd.options.display.float_format = '{:.2f}'.format
    for fname in args.inputs:
        log.info("Converting %s", fname)
        source_records = list(utils.load_json_data(fname))
        converted_records = (convert_record(r) for r in tqdm(source_records))
        stats = utils.EdgeProbingDatasetStats()
        converted_records = stats.passthrough(converted_records)
        target_fname = os.path.join(args.output_dir, os.path.basename(fname))
        utils.write_json_data(target_fname, converted_records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())
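# Example invocation for the SPR1 conversion script above; the script and split file
# names are illustrative assumptions, only the -i/-o flags come from the parser:
#   python convert_spr1.py -i spr1_train.json spr1_dev.json -o ./probing/spr1
# One or more SPR1 JSON splits are read via -i and the converted edge-probing records are
# written under the -o directory using the same base file names.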
def generate_all_queries():
    concepts = utils.load_json_data('./resources/autoimmune-concepts.json')
    concept2queries = {}
    for c in concepts:
        concept2queries[c] = generate_prospector_query(concepts[c])
        print '%s done' % c
    utils.save_json_array(concept2queries, './resources/mimir_queries.json')
def get_db_connection_by_setting(setting_file=None, setting_obj=None):
    if setting_file is not None:
        settings = imutil.load_json_data(setting_file)
    else:
        settings = setting_obj

    if 'db_type' in settings and settings['db_type'] == 'mysql_socket':
        return get_mysqldb_connection(settings['server'], settings['user'],
                                      settings['password'], settings['database'],
                                      settings['mysql_sock_file'])
    elif 'db_type' in settings and settings['db_type'] == 'mysql':
        return get_mysqldb_host_connection(settings['server'], settings['user'],
                                           settings['password'], settings['database'])

    if 'trusted_connection' in settings:
        con_string = 'driver=%s;server=%s;trusted_connection=yes;DATABASE=%s;' % (
            settings['driver'], settings['server'], settings['database'])
    elif 'dsn' in settings:
        con_string = 'DSN=%s;UID=%s;PWD=%s;DATABASE=%s;' % (
            settings['dsn'], settings['user'], settings['password'], settings['database'])
    else:
        con_string = 'driver=%s;server=%s;UID=%s;PWD=%s;DATABASE=%s;' % (
            settings['driver'], settings['server'], settings['user'],
            settings['password'], settings['database'])
    # print pyodbc.drivers()
    cnxn = pyodbc.connect(con_string)
    cursor = cnxn.cursor()
    return {'cnxn': cnxn, 'cursor': cursor}
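# Two illustrative setting objects for get_db_connection_by_setting: one hitting the MySQL
# branch and one hitting the plain ODBC branch. Key names follow the code above; all values
# (host, credentials, driver name) are placeholders, not real connection details.
example_mysql_settings = {
    "db_type": "mysql",
    "server": "db.example.org",
    "user": "semehr",
    "password": "CHANGE_ME",
    "database": "clinical_db"
}
example_odbc_settings = {
    "driver": "{ODBC Driver 17 for SQL Server}",
    "server": "db.example.org",
    "user": "semehr",
    "password": "CHANGE_ME",
    "database": "clinical_db"
}
# conn = get_db_connection_by_setting(setting_obj=example_odbc_settings)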
def phenotype_prevalence(phenotype_with_prev, output_file):
    pd = utils.load_json_data(phenotype_with_prev)
    utils.save_string(
        '\n'.join([
            '\t'.join([p, str(pd[p]['prevalence']), str(len(pd[p]['concepts']))])
            for p in pd
        ]), output_file)
def output_phenotypes(phenotype_file, phenotype_performance, c_map_file, output_file):
    p = utils.load_json_data(phenotype_file)
    c_map = utils.load_json_data(c_map_file)
    new_p = {}
    p_lines = utils.read_text_file(phenotype_performance)
    for l in p_lines[1:]:
        arr = l.split('\t')
        new_p[arr[0]] = p[arr[0]]
        pt = new_p[arr[0]]
        concepts = pt['concepts']
        pt['concepts'] = {}
        pt['prevalence'] = 0
        for c in concepts:
            pt['concepts'][c] = 0 if c not in c_map else c_map[c]['freq']
            pt['prevalence'] += pt['concepts'][c]
    utils.save_json_array(new_p, output_file)
    print 'new data saved to %s' % output_file
def analyse_doc_anns_file(ann_doc_path, rule_executor, text_reader, output_folder,
                          fn_pattern='se_ann_%s.json',
                          es_inst=None, es_output_index=None, es_output_doc='doc',
                          study_analyzer=None):
    p, fn = split(ann_doc_path)
    file_key = splitext(fn)[0]
    json_doc = utils.load_json_data(ann_doc_path)
    return analyse_doc_anns(json_doc, file_key, rule_executor, text_reader,
                            output_folder, fn_pattern,
                            es_inst, es_output_index, es_output_doc,
                            study_analyzer)
def load_data(self):
    if isfile(self._job_file):
        d = utils.load_json_data(self._job_file)
        self._end_time_point = d['end_time_point']
        self._start_time_point = d['start_time_point']
        self._last_status = d['last_status']
    else:
        self._end_time_point = datetime.datetime.now().strftime(self._dfmt)
        self._start_time_point = datetime.date(2000, 1, 1).strftime(self._dfmt)
        self._last_status = JobStatus.STATUS_UNKNOWN
def align_mapped_concepts(map_file, disorder_file):
    concept_map = utils.load_json_data(map_file)
    disorders = [d.strip() for d in utils.read_text_file(disorder_file)]
    exact_mapped = {}
    for d in disorders:
        if d in concept_map:
            exact_mapped[d] = concept_map[d]
        else:
            exact_mapped[d] = ""
    print json.dumps(exact_mapped)
def mapping_headings(heading_stats_file, output_file, freq_threshold=1000):
    heading_freq = utils.load_json_data(heading_stats_file)
    sorted_top_k_headings = sorted([(h, heading_freq[h]) for h in heading_freq],
                                   key=lambda x: -x[1])[:freq_threshold]
    s = ''
    for r in sorted_top_k_headings[:500]:
        s += '%s\t%s\n' % (r[0], r[1])
    utils.save_string(s, './top500heading_discharge_summary.txt')
    utils.save_json_array(sorted_top_k_headings, output_file)