def merge_and_output(dir_path, cohort, default_results='hepc_results.json'):
    headers = ['all', 'positive', 'Negated', 'hypothetical', 'historical', 'Other', 'first_pos_time']

    results = {}
    for pid in cohort:
        results[pid] = {}
    c_results = utils.load_json_data(join(dir_path, default_results))
    for p in c_results:
        results[p['id']] = p
    for f in [f for f in listdir(dir_path) if isfile(join(dir_path, f))]:
        if f != default_results:
            c_results = utils.load_json_data(join(dir_path, f))
            d = f.replace('_results.json', '')
            print(f)
            if d not in headers:
                headers.append(d)
            for p in c_results:
                results[p['id']][d] = p['all']

    s = '\t'.join(['id'] + headers) + '\n'
    for pid in results:
        p = results[pid]
        row = [pid] + ['-' if h not in p else str(p[h]) for h in headers]
        s += '\t'.join(row) + '\n'
    utils.save_string(s, './valid_doc_files/merged_output_liverdiseases.tsv')
    print('output generated')
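
Nearly all of the examples on this page go through a small `utils` module for JSON and text I/O. Its implementation is not shown here; a minimal sketch of the file-based helpers these snippets appear to assume (UTF-8 text, standard-library json) could be:

import codecs
import json


def load_json_data(file_path):
    # parse a JSON file and return the resulting dict/list
    with codecs.open(file_path, encoding='utf-8') as f:
        return json.load(f)


def save_json_array(obj, file_path):
    # serialise a dict/list back to a JSON file
    with codecs.open(file_path, 'w', encoding='utf-8') as f:
        json.dump(obj, f)


def save_string(s, file_path):
    # write a plain string to a file
    with codecs.open(file_path, 'w', encoding='utf-8') as f:
        f.write(s)


def read_text_file(file_path):
    # return the file's lines with trailing newlines stripped
    with codecs.open(file_path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]
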
Example #2
 def from_json(json_object, user):
     if json_object['id'] != user.id:
         raise Exception(f"Can't load user {user.id} : IDs don't match !")
     user.afk_mentions = load_json_data(json_object, 'afk_mentions', True)
     user.xp = load_json_data(json_object, 'xp', 0)
     user.last_login = load_json_data(json_object, 'last_login', None)
     user.last_daily_reward = load_json_data(json_object,
                                             'last_daily_reward', None)
     if user.active_since is not None:
         user.last_active_xp = load_json_data(json_object, 'last_active_xp',
                                              None)
     user.muted = bool(load_json_data(json_object, 'muted', False))
     user.muted_until = load_json_data(json_object, 'muted_until', None)
     user.deaf = bool(load_json_data(json_object, 'deaf', False))
     user.deaf_until = load_json_data(json_object, 'deaf_until', None)
     if user.last_login is not None:
         user.last_login = datetime.datetime.strptime(
             user.last_login, '%a %b %d %H:%M:%S %Y')
     if user.last_daily_reward is not None:
         user.last_daily_reward = datetime.datetime.strptime(
             user.last_daily_reward, '%a %b %d %H:%M:%S %Y')
     if user.last_active_xp is not None:
         user.last_active_xp = datetime.datetime.strptime(
             user.last_active_xp, '%a %b %d %H:%M:%S %Y')
     if user.muted_until is not None:
         user.muted_until = datetime.datetime.strptime(
             user.muted_until, '%a %b %d %H:%M:%S %Y')
     if user.deaf_until is not None:
         user.deaf_until = datetime.datetime.strptime(
             user.deaf_until, '%a %b %d %H:%M:%S %Y')
     user.warnings = load_json_data(json_object, 'warnings', [])
     return user
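
Examples #2 and #3 use a different load_json_data from the file-based helper above: here it reads a single key from an already-parsed dict and falls back to a default. A minimal sketch of that assumed helper:

def load_json_data(json_object, key, default=None):
    # return json_object[key] when the key is present, otherwise the default
    if json_object is not None and key in json_object:
        return json_object[key]
    return default
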
Example #3
 def load(server):
     loaded_server = db.get_server(server.id)
     if loaded_server is None:
         print(f'no save found for {server.id}')
         return False
     if loaded_server['id'] != server.id:
         raise Exception(
             f"Can't load server {server.id} : IDs don't match !")
     server.lang = load_json_data(loaded_server, 'lang',
                                  cfg.get_value('SRV_DEFAULT_LANG'))
     server.bot_text_channel_name = load_json_data(
         loaded_server, 'bot_text_channel_name',
         cfg.get_value('SRV_DEFAULT_BOT_TEXT_CHANNEL_NAME'))
     server.log_text_channel_name = load_json_data(
         loaded_server, 'log_text_channel_name',
         cfg.get_value('SRV_DEFAULT_LOGS_TEXT_CHANNEL_NAME'))
     server.cmd_prefix = load_json_data(
         loaded_server, 'cmd_prefix',
         cfg.get_value('SRV_DEFAULT_CMD_PREFIX_NAME'))
     server.admin_logs = load_json_data(
         loaded_server, 'admin_logs',
         bool(cfg.get_value('SRV_DEFAULT_DISPLAY_ADMIN_LOGS')))
     server.group_perks = load_json_data(loaded_server, 'group_perks', {})
     server.use_accept_command = load_json_data(
         loaded_server, 'use_accept',
         bool(cfg.get_value('SRV_DEFAULT_USE_ACCEPT_COMMAND')))
     server.accept_rank = load_json_data(
         loaded_server, 'accept_rank',
         cfg.get_value('SRV_DEFAULT_ACCEPT_RANK'))
     for key, member in server.members.items():
         for json_member in load_json_data(loaded_server, 'members', []):
             if load_json_data(json_member, 'id', -1) == member.id:
                 User.from_json(json_member, member)
                 break
     return True
def dump_mention_detail(studies_folder, include_study_pattern, dump_tsv_file,
                        dump_concept_file):
    reg_p = re.compile(include_study_pattern)
    rows = ['\t'.join(['concept', 'pt', 'doc', 's', 'e', 'label', 'ruled'])]
    c_group = {}
    for f in listdir(studies_folder):
        m = reg_p.match(f)
        if m is not None:
            ruled_file = join(studies_folder, f, 'ruled_anns.json')
            if isfile(ruled_file):
                # {"p": "pid", "s": 3356, "e": 3365, "d": "did", "case-instance": ["xxx"],
                # "c": "C0000833", "string_orig": "abscesses",
                # "ruled": "semehr hypothetical_filters.json"}
                ruleds = utils.load_json_data(ruled_file)
                for r in ruleds:
                    rows.append('\t'.join([
                        r['c'], r['p'], r['d'],
                        str(r['s']),
                        str(r['e']), r['string_orig'], r['ruled']
                    ]))
                    increase_freq_on_dict(
                        c_group, r['c'], r['ruled'],
                        '-'.join([r['d'], str(r['s']),
                                  str(r['e'])]))
            pos_file = join(studies_folder, f, 'result.csv_json')
            if isfile(pos_file):
                # {"c": "C0000833", "e": 467, "d": "52773120", "string_orig": "abscess", "p": "10110421", "s": 460}
                poses = utils.load_json_data(pos_file)
                for r in poses:
                    rows.append('\t'.join([
                        r['c'], r['p'], r['d'],
                        str(r['s']),
                        str(r['e']), r['string_orig'], ''
                    ]))
                    increase_freq_on_dict(
                        c_group, r['c'], 'pos',
                        '-'.join([r['d'], str(r['s']),
                                  str(r['e'])]))

    rule_headers = [
        'semehr negation_filters.json', 'semehr hypothetical_filters.json',
        'semehr not_mention_filters.json',
        'semehr other_experiencer_filters.json',
        'semehr cris_document_filters.json', 'skip-term', 'semehr s_skin.json',
        'semehr s_karen.json', 'yodie', 'pos'
    ]
    c_rows = ['\t'.join(['concept'] + rule_headers)]
    for c in c_group:
        co = c_group[c]
        c_rows.append(
            '\t'.join([c] +
                      [str(co[h]) if h in co else '0' for h in rule_headers]))
    utils.save_string('\n'.join(rows), dump_tsv_file)
    utils.save_string('\n'.join(c_rows), dump_concept_file)
    print('dumped to %s' % dump_tsv_file)
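
increase_freq_on_dict is not defined in this snippet. From the calls above, one plausible reading is that it counts, per concept and per rule (or 'pos'), the number of distinct mention instances identified by the doc-start-end key. A sketch under that assumption:

def increase_freq_on_dict(c_group, concept, rule, instance_key):
    # count each (doc, start, end) instance once per concept/rule pair
    co = c_group.setdefault(concept, {})
    seen = co.setdefault('_instances', set())
    if (rule, instance_key) not in seen:
        seen.add((rule, instance_key))
        co[rule] = co.get(rule, 0) + 1
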
Example #5
def do_phenotype_analysis(phenotype_result_file, c_map_file, output_folder):
    c_map = utils.load_json_data(c_map_file)
    p_map = utils.load_json_data(phenotype_result_file)
    # extract performances of phenotypes
    headers = ["posM", "hisM", "negM", "otherM", "wrongM"]
    rows = ['\t'.join(["phenotype"] + headers)]
    for p in p_map:
        v = p_map[p]['validation']
        if v is None or len(v) == 0:
            continue
        rows.append('\t'.join([p] + [str(v[h]) if h in v else '0' for h in headers]))
    utils.save_string('\n'.join(rows), join(output_folder, 'phenotype_performance.tsv'))
Example #6
 def load_rule_config(self, config_file):
     rule_config = utils.load_json_data(config_file)
     r_path = rule_config['rules_folder']
     print('loading rules from [%s]' % r_path)
     for rf in rule_config['active_rules']:
         for r in utils.load_json_data(join(r_path, rf)):
             self.add_filter_rule(r['offset'], r['regs'], rule_name=rf)
         print('%s loaded' % rf)
     if 'osf_rules' in rule_config:
         for osf in rule_config['osf_rules']:
             self.add_original_string_filters(utils.load_json_data(join(r_path, osf)))
             print('original string filters from [%s] loaded' % osf)
     if 'skip_term_setting' in rule_config:
         self.skip_terms = utils.load_json_data(rule_config['skip_term_setting'])
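
The keys read above imply the shape of the rule configuration file; a hypothetical config (shown as the parsed dict, with made-up file names) might look like:

rule_config = {
    'rules_folder': './rules',
    # each active rule file is a JSON list of {"offset": ..., "regs": [...]} objects
    'active_rules': ['negation_filters.json', 'hypothetical_filters.json'],
    # optional: original-string filter files, also resolved against rules_folder
    'osf_rules': ['s_skin.json'],
    # optional: full path to a JSON file of terms to skip
    'skip_term_setting': './rules/skip_terms.json'
}
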
Example #7
def study(folder, episode_file, cohort, date_format='%d/%m/%Y %H:%M'):
    episodes = load_episode_data(episode_file, date_format=date_format)
    p, fn = split(folder)
    if isfile(join(folder, 'study_analyzer.pickle')):
        sa = StudyAnalyzer.deserialise(join(folder, 'study_analyzer.pickle'))
    else:
        sa = StudyAnalyzer(fn)
        if isfile(join(folder, 'exact_concepts_mappings.json')):
            concept_mappings = utils.load_json_data(join(folder, 'exact_concepts_mappings.json'))
            scs = []
            for t in concept_mappings:
                sc = StudyConcept(t, [t])
                t_c = {}
                t_c[t] = [concept_mappings[t]]
                sc.gen_concept_closure(term_concepts=t_c)
                scs.append(sc)
                print(sc.concept_closure)
            sa.study_concepts = scs
            sa.serialise(join(folder, 'study_analyzer.pickle'))
        else:
            concepts = utils.load_json_data(join(folder, 'study_concepts.json'))
            scs = []
            if len(concepts) > 0:
                for name in concepts:
                    scs.append(StudyConcept(name, concepts[name]))
                    print(name, concepts[name])
            sa.study_concepts = scs
            sa.serialise(join(folder, 'study_analyzer.pickle'))

    # compute disjoint concepts
    sa.generate_exclusive_concepts()
    merged_mappings = {}
    for c in sa.study_concepts:
        for t in c.term_to_concept:
            all_concepts = list(c.concept_closure)
            if len(all_concepts) > 1:
                idx = 0
                for cid in all_concepts:
                    merged_mappings['(%s) %s (%s)' % (c.name, t, idx)] = {'closure': len(all_concepts), 'mapped': cid}
                    idx += 1
            else:
                merged_mappings['(%s) %s' % (c.name, t)] = c.term_to_concept[t]
        print(c.name, c.term_to_concept, c.concept_closure)
        print(json.dumps(list(c.concept_closure)))
    print(json.dumps(merged_mappings))
    print('generating result table...')
    populate_episode_study_table(sa, episodes, './resources', cohort)
    print('done')
Example #8
def load_phenotype_def_into_db():
    db_cnf = '../../studies/COMOB_SD/dbcnn_input.json'
    p_def_file = './data/phenotype_defs.json'
    pd = utils.load_json_data(p_def_file)
    w_sql = """
    insert into tp_phenotype_concepts (phenotype_id, concept_id) values 
    ('{pid}', '{cid}');
    """
    r_sql = """
    select * from tp_phenotypes
    """
    p_data = []
    dutil.query_data(r_sql, p_data, dutil.get_db_connection_by_setting(db_cnf))
    p2pid = {}
    for r in p_data:
        p2pid[r['phenotype_name']] = r['id']
    for p in pd:
        if p not in p2pid:
            print('%s not found in definition table' % p)
            continue
        for c in pd[p]['concepts']:
            sql = w_sql.format(**{'pid': p2pid[p], 'cid': c})
            print('executing [%s]' % sql)
            dutil.query_data(sql, None, dbconn=dutil.get_db_connection_by_setting(db_cnf))
    print('done')
Example #9
def phenotype_counting(phenotype_def, concept_level_results, output_file):
    pd = utils.load_json_data(phenotype_def)
    npd = {}
    cd = utils.read_text_file(concept_level_results)
    c_headers = cd[0].split('\t')
    headers = [h for h in c_headers[2:len(c_headers) - 1]]
    for r in cd[1:]:
        arr = r.split('\t')
        c = arr[0]
        num_mention = arr[12]
        for p in pd:
            if c in pd[p]['concepts']:
                po = npd[p] if p in npd else {'freq': 0, 'p': p,
                                              'num_concepts': len(pd[p]['concepts'])}
                npd[p] = po
                po['freq'] += int(num_mention)
                for idx in range(2, len(arr) - 1):
                    h = headers[idx - 2]
                    po[h] = int(arr[idx]) if h not in po else (int(arr[idx]) + int(po[h]))

    rows = ['\t'.join(['phenotype', 'num concepts'] + headers + ['prevalence'])]
    for p in npd:
        po = npd[p]
        rows.append('\t'.join([p, str(po['num_concepts'])] + [str(po[h]) for h in headers] + [str(po['freq'])]))
    utils.save_string('\n'.join(rows), output_file)
Example #10
 def get_instance_by_setting_file(setting_file_path):
     setting = utils.load_json_data(setting_file_path)
     return SemEHRES.get_instance_by_setting(setting['es_host'],
                                             setting['es_index'],
                                             setting['es_doc_type'],
                                             setting['es_concept_type'],
                                             setting['es_patient_type'])
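
get_instance_by_setting_file only needs a JSON settings file carrying the Elasticsearch connection details read above; a hypothetical example (shown as the parsed dict):

setting = {
    'es_host': 'localhost:9200',       # Elasticsearch host (hypothetical)
    'es_index': 'semehr_docs',         # index name (hypothetical)
    'es_doc_type': 'doc',
    'es_concept_type': 'ctx_concept',  # hypothetical type names
    'es_patient_type': 'patient'
}
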
def main(args):

    # load dataset information from setup json file
    metad = utils.load_json_data(args.metadata_file)

    # generate data splits and keep them fixed for the whole project
    # MAKE SURE THIS IS ONLY CALLED ONCE
    splits_path = os.path.join(config.data_save_folder, 'data_splits.json')
    utils.create_data_split(metad, splits_path)

    mtracks = []
    for ky in metad.keys():
        mtrack = metad[ky]
        mtrack['filename'] = ky
        mtracks.append(mtrack)


    nmixes = len(metad.keys())
    print("{} mixes to be processed".format(nmixes))
    idx = 0

    Parallel(n_jobs=4, verbose=5)(
            delayed(utils.compute_features_mtrack)(
                mtrack, args.save_dir, args.wavmixes_path, idx
            ) for mtrack in mtracks)
Example #12
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk),
                                   _concept_mapping)
        d = cr.full_text_file_pattern % fk
        for ann in cr.annotations:
            if ann.cui in _cm_obj.cui2label:
                lbl = _cm_obj.cui2label[ann.cui]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end,
                                      ann.negation, ann.temporality,
                                      ann.experiencer, 'StudyName', lbl)
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
Example #13
def run_learning(
        train_ann_dir, train_gold_dir, train_text_dir,
        test_ann_dir, test_gold_dir, test_text_dir,
        settings):
    log_level = 'DEBUG'
    log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
    logging.basicConfig(level=log_level, format=log_format)
    global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir
    global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj
    global _annotated_anns
    _annotated_anns = {}
    _min_sample_size = settings['min_sample_size']
    _ann_dir = train_ann_dir
    _gold_dir = train_gold_dir
    _test_ann_dir = test_ann_dir
    _test_gold_dir = test_gold_dir
    _gold_text_dir = train_text_dir
    _test_text_dir = test_text_dir
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern']
    _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False
    _cm_obj = Concept2Mapping(_concept_mapping)

    # not using mention patterns for prediction as this is only an in-development feature
    mp_inst = None
    return do_learn_exp(settings['viz_file'],
                        num_dimensions=[50],
                        ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False,
                        separate_by_label=True,
                        conll_output_file=settings['conll_output_file'], eHostGD=_eHostGD, mention_pattern=mp_inst)
Example #14
def run_study(folder_path, no_sql_filter=None):
    study_config = 'study.json' if no_sql_filter is None else 'study_no_filter.json'
    if isfile(join(folder_path, study_config)):
        r = utils.load_json_data(join(folder_path, study_config))
        retained_patients = None
        if 'query_patients_file' in r:
            retained_patients = []
            lines = utils.read_text_file(r['query_patients_file'])
            for l in lines:
                arr = l.split('\t')
                retained_patients.append(arr[0])

        study(folder_path,
              r['cohort'],
              r['sql_config'],
              r['db_conn'],
              concept_mapping.get_umls_client_inst(r['umls_key']),
              do_preprocessing=r['do_preprocessing'],
              rule_setting_file=r['rule_setting_file'],
              do_one_iter=r['do_one_iter'],
              sem_idx_setting_file=None if 'sem_idx_setting_file' not in r else
              r['sem_idx_setting_file'],
              concept_filter_file=None
              if 'concept_filter_file' not in r else r['concept_filter_file'],
              retained_patients_filter=retained_patients,
              filter_obj_setting=None
              if 'filter_obj_setting' not in r else r['filter_obj_setting'],
              do_disjoint_computing=True
              if 'do_disjoint' not in r else r['do_disjoint'],
              export_study_concept_only=False if 'export_study_concept'
              not in r else r['export_study_concept'])
    else:
        print('study.json not found in the folder')
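
run_study reads everything it passes to study() from study.json (or study_no_filter.json) in the study folder. A hypothetical configuration covering the keys consumed above; only the first seven are strictly required, the rest default as coded:

study_config = {
    # required keys (hypothetical values throughout)
    'cohort': 'my_cohort',
    'sql_config': './sql_templates.json',
    'db_conn': './db_connection.json',
    'umls_key': 'YOUR-UMLS-API-KEY',
    'do_preprocessing': False,
    'rule_setting_file': './rule_config.json',
    'do_one_iter': True,
    # optional keys, defaulted in the code above when absent
    'query_patients_file': './patients.tsv',  # patient id in the first tab-separated column
    'sem_idx_setting_file': None,
    'concept_filter_file': None,
    'filter_obj_setting': None,
    'do_disjoint': True,
    'export_study_concept': False
}
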
Example #15
def doc_infer(settings):
    rules = PhenotypeRule.load_rules(settings['rule_file'])
    d2predicted = utils.load_json_data(settings['doc_nlp_results'])
    doc_labels_output = settings['doc_inference_output']
    s = ''
    doc_type2id = {}
    pids = []
    for d in d2predicted:
        m = re.match(r'Stroke\_id\_(\d+)(\.\d+){0,1}', d)
        pid = d
        if m is not None:
            pid = m.group(1)
            pids.append(pid)
        label_provs = PhenotypeRuleExecutor.apply_rules(d2predicted[d], rules)
        print(pid, d, label_provs)
        for lp in label_provs:
            if lp['label'] != '':
                s += '%s\t%s\n' % (pid, lp['label'])
                if lp['label'] not in doc_type2id:
                    doc_type2id[lp['label']] = []
                doc_type2id[lp['label']].append(pid)

    pids = list(set(pids))
    print(json.dumps(pids))
    utils.save_string(s, doc_labels_output)
    if 'patient_level_truth_tsv' in settings:
        doc_infer_with_ground_truth(settings['patient_level_truth_tsv'], pids, doc_type2id)
Example #16
def load_study_ruler(study_folder,
                     rule_config_file,
                     study_config='study.json'):
    sa = None
    if study_folder is not None and study_folder != '':
        r = utils.load_json_data(join(study_folder, study_config))

        ret = load_study_settings(
            study_folder,
            umls_instance=None,
            rule_setting_file=r['rule_setting_file'],
            concept_filter_file=None
            if 'concept_filter_file' not in r else r['concept_filter_file'],
            do_disjoint_computing=True
            if 'do_disjoint' not in r else r['do_disjoint'],
            export_study_concept_only=False
            if 'export_study_concept' not in r else r['export_study_concept'])
        sa = ret['study_analyzer']
        ruler = ret['ruler']
    else:
        logging.info(
            'no study configuration provided, applying rules to all annotations...'
        )
        ruler = load_ruler(rule_config_file)
    return {'sa': sa, 'ruler': ruler}
Example #17
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file,
                              new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [
            f for f in listdir(dd)
            if isfile(join(dd, f)) and f.endswith('.lst')
        ]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n',
                          join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
def gen_box_plot_statistics(folder):
    files = [f for f in listdir(folder) if isfile(join(folder, f))]
    d_dicts = {}
    for f in files:
        ds_name = f[:f.rfind('.')]
        dd = utils.load_json_data(join(folder, f))
        for v in dd:
            if dd[v] is None:
                continue
            if v not in d_dicts:
                d_dicts[v] = {
                    'label': [
                        'min', 'max', 'median', 'q3', 'q1-min', 'q1',
                        'median-q1', 'q3-median', 'max-q3'
                    ]
                }
            v_min = dd[v][0]
            q1 = dd[v][1]
            median = dd[v][2]
            q3 = dd[v][3]
            v_max = dd[v][4]
            d_dicts[v][ds_name] = [
                v_min, v_max, median, q3, q1 - v_min, q1, median - q1,
                q3 - median, v_max - q3
            ]
    for v in d_dicts:
        df = pd.DataFrame(d_dicts[v])
        print('%s\n' % v)
        print(df.head(10))
        print('\n\n')
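
gen_box_plot_statistics expects every JSON file in the folder to map a variable name to a five-number summary in the order [min, q1, median, q3, max]; a small hypothetical input illustrating that layout:

# hypothetical contents of one file in the folder (e.g. cohort_a.json) once parsed
dd = {
    'age': [18, 34, 47, 61, 93],            # [min, q1, median, q3, max]
    'length_of_stay': [1, 2, 5, 9, 40]
}
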
Example #19
def regenerate_manual_mapped_concepts(tsv, closure_file):
    selected_concepts = set()
    c2l = {}
    for l in utils.read_text_file(tsv):
        arr = l.split('\t')
        selected_concepts.add(arr[1])
        c2l[arr[1]] = arr[0]
    t2closure = utils.load_json_data(closure_file)
    mapped_concepts = []
    map = {}
    v_map = {}
    for t in t2closure:
        disjoint_list = list(set(t2closure[t]) & selected_concepts)
        if len(disjoint_list) > 0:
            mapped_concepts += disjoint_list
            map[t] = {
                "tc": {
                    "closure": len(disjoint_list),
                    "mapped": disjoint_list[0]
                },
                "concepts": disjoint_list
            }
            v_map[t] = [('%s [%s]' % (c2l[c], c)) for c in disjoint_list]
    print(json.dumps(map))
    print(selected_concepts - set(mapped_concepts))
    print(json.dumps(v_map))
Example #20
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir)
        if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk),
                                   _concept_mapping)
        d = fk
        for ann in cr.annotations:
            if ann.cui in _cm_obj.concept2label:
                lbl = _cm_obj.concept2label[ann.cui][0]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end,
                                      ann.negation, ann.temporality,
                                      ann.experiencer, 'StudyName', lbl)
                if ann.negation != 'Affirmed' or len(ann.ruled_by) > 0:
                    continue
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
Example #21
 def convert_text_ann_from_files(full_text_folder,
                                 ann_folder,
                                 output_folder,
                                 full_text_file_pattern='(%s).txt',
                                 ann_file_pattern='se_ann_%s.json',
                                 output_file_pattern='%s.txt.knowtator.xml',
                                 ann_to_convert=None):
     text_files = [
         f for f in listdir(full_text_folder)
         if isfile(join(full_text_folder, f))
     ]
     p = re.compile(full_text_file_pattern)
     for f in text_files:
         logging.info('working on [%s]' % f)
         m = p.match(f)
         if m is not None:
             fk = m.group(1)
             text = utils.read_text_file_as_string(join(
                 full_text_folder, f))
             anns = utils.load_json_data(
                 join(ann_folder, ann_file_pattern % fk))
             xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, fk),
                                         full_text=text,
                                         ann_to_convert=ann_to_convert)
             utils.save_string(
                 xml, join(output_folder, output_file_pattern % fk))
             utils.save_string(text.replace('\r', ' '),
                               join(full_text_folder, f))
             logging.info('doc [%s] done' % fk)
Example #22
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        dest='inputs',
                        type=str,
                        nargs="+",
                        help="Input files (JSON) for SPR1 splits.")
    parser.add_argument('-o',
                        dest='output_dir',
                        type=str,
                        required=True,
                        help="Output directory.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    pd.options.display.float_format = '{:.2f}'.format
    for fname in args.inputs:
        log.info("Converting %s", fname)
        source_records = list(utils.load_json_data(fname))
        converted_records = (convert_record(r) for r in tqdm(source_records))
        stats = utils.EdgeProbingDatasetStats()
        converted_records = stats.passthrough(converted_records)
        target_fname = os.path.join(args.output_dir, os.path.basename(fname))
        utils.write_json_data(target_fname, converted_records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())
Example #23
def generate_all_queries():
    concepts = utils.load_json_data('./resources/autoimmune-concepts.json')
    concept2queries = {}
    for c in concepts:
        concept2queries[c] = generate_prospector_query(concepts[c])
        print('%s done' % c)
    utils.save_json_array(concept2queries, './resources/mimir_queries.json')
Example #24
def get_db_connection_by_setting(setting_file=None, setting_obj=None):
    if setting_file is not None:
        settings = imutil.load_json_data(setting_file)
    else:
        settings = setting_obj
    if 'db_type' in settings and settings['db_type'] == 'mysql_socket':
        return get_mysqldb_connection(settings['server'], settings['user'],
                                      settings['password'],
                                      settings['database'],
                                      settings['mysql_sock_file'])
    elif 'db_type' in settings and settings['db_type'] == 'mysql':
        return get_mysqldb_host_connection(settings['server'],
                                           settings['user'],
                                           settings['password'],
                                           settings['database'])

    if 'trusted_connection' in settings:
        con_string = 'driver=%s;server=%s;trusted_connection=yes;DATABASE=%s;' % (
            settings['driver'], settings['server'], settings['database'])
    elif 'dsn' in settings:
        con_string = 'DSN=%s;UID=%s;PWD=%s;DATABASE=%s;' % (
            settings['dsn'], settings['user'], settings['password'],
            settings['database'])
    else:
        con_string = 'driver=%s;server=%s;UID=%s;PWD=%s;DATABASE=%s;' % (
            settings['driver'], settings['server'], settings['user'],
            settings['password'], settings['database'])
    # print pyodbc.drivers()
    cnxn = pyodbc.connect(con_string)
    cursor = cnxn.cursor()
    return {'cnxn': cnxn, 'cursor': cursor}
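
A hedged usage sketch for get_db_connection_by_setting: the settings dict (or JSON file) picks the backend, and anything without a recognised db_type falls through to the pyodbc branch, which returns a {'cnxn': ..., 'cursor': ...} pair. Both examples below use hypothetical values:

# SQL Server over ODBC with a trusted connection (pyodbc branch)
mssql_settings = {
    'driver': '{ODBC Driver 17 for SQL Server}',
    'server': 'myserver',
    'database': 'mydb',
    'trusted_connection': 'yes'
}
conn = get_db_connection_by_setting(setting_obj=mssql_settings)
conn['cursor'].execute('select 1')

# MySQL over a host connection (mysql branch)
mysql_settings = {
    'db_type': 'mysql',
    'server': 'localhost',
    'user': 'semehr',
    'password': 'secret',
    'database': 'semehr_db'
}
mysql_conn = get_db_connection_by_setting(setting_obj=mysql_settings)
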
def phenotype_prevalence(phenotype_with_prev, output_file):
    pd = utils.load_json_data(phenotype_with_prev)
    utils.save_string(
        '\n'.join([
            '\t'.join(
                [p, str(pd[p]['prevalence']),
                 str(len(pd[p]['concepts']))]) for p in pd
        ]), output_file)
Example #26
def output_phenotypes(phenotype_file, phenotype_performance, c_map_file, output_file):
    p = utils.load_json_data(phenotype_file)
    c_map = utils.load_json_data(c_map_file)
    new_p = {}
    p_lines = utils.read_text_file(phenotype_performance)
    for l in p_lines[1:]:
        arr = l.split('\t')
        new_p[arr[0]] = p[arr[0]]
        pt = new_p[arr[0]]
        concepts = pt['concepts']
        pt['concepts'] = {}
        pt['prevalence'] = 0
        for c in concepts:
            pt['concepts'][c] = 0 if c not in c_map else c_map[c]['freq']
            pt['prevalence'] += pt['concepts'][c]
    utils.save_json_array(new_p, output_file)
    print('new data saved to %s' % output_file)
Example #27
def analyse_doc_anns_file(ann_doc_path, rule_executor, text_reader, output_folder,
                          fn_pattern='se_ann_%s.json', es_inst=None, es_output_index=None, es_output_doc='doc',
                          study_analyzer=None):
    p, fn = split(ann_doc_path)
    file_key = splitext(fn)[0]
    json_doc = utils.load_json_data(ann_doc_path)
    return analyse_doc_anns(json_doc, file_key, rule_executor, text_reader, output_folder,
                            fn_pattern, es_inst, es_output_index, es_output_doc,
                            study_analyzer)
 def load_data(self):
     if isfile(self._job_file):
         d = utils.load_json_data(self._job_file)
         self._end_time_point = d['end_time_point']
         self._start_time_point = d['start_time_point']
         self._last_status = d['last_status']
     else:
         self._end_time_point = datetime.datetime.now().strftime(self._dfmt)
         self._start_time_point = datetime.date(2000, 1, 1).strftime(self._dfmt)
         self._last_status = JobStatus.STATUS_UNKNOWN
Example #29
def align_mapped_concepts(map_file, disorder_file):
    concept_map = utils.load_json_data(map_file)
    disorders = [d.strip() for d in utils.read_text_file(disorder_file)]
    exact_mapped = {}
    for d in disorders:
        if d in concept_map:
            exact_mapped[d] = concept_map[d]
        else:
            exact_mapped[d] = ""
    print(json.dumps(exact_mapped))
Example #30
def mapping_headings(heading_stats_file, output_file, freq_threshold=1000):
    heading_freq = utils.load_json_data(heading_stats_file)
    sorted_top_k_headings = sorted([(h, heading_freq[h])
                                    for h in heading_freq],
                                   key=lambda x: -x[1])[:freq_threshold]
    s = ''
    for r in sorted_top_k_headings[:500]:
        s += '%s\t%s\n' % (r[0], r[1])
    utils.save_string(s, './top500heading_discharge_summary.txt')
    utils.save_json_array(sorted_top_k_headings, output_file)