def analyse_discharge_summaries(es, q, doc_type='eprdoc',
                                full_text_field='fulltext',
                                reg_exp=r'^([^\n\:]+)\:$',
                                output_file='../resources/wrappers/section_freqs.json'):
    """
    iterate all discharge summaries and create the section dictionary for
    the corpus (EHR system)
    :param es:
    :param q:
    :param doc_type:
    :param full_text_field
    :param reg_exp
    :param output_file
    :return:
    """
    scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500)
    container = []
    utils.multi_thread_tasking_it(scroll_obj, 10, do_query_analysis, args=[container, full_text_field, reg_exp])
    print 'search finished. merging sections...'
    sec_freq = {}
    for ss in container:
        for s in ss:
            sec_freq[s] = 1 if s not in sec_freq else 1 + sec_freq[s]
    utils.save_json_array(sec_freq, output_file)
    print json.dumps(sec_freq)
    print 'done'
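
# Hedged usage sketch: the client factory below (SemEHRES.get_instance, also used
# in query_liver_diseases further down) and the query string / output path are
# assumptions for illustration, not values from the original project.
def _example_analyse_discharge_summaries():
    es_client = SemEHRES.get_instance()
    analyse_discharge_summaries(es_client,
                                q='discharge summary',
                                doc_type='eprdoc',
                                output_file='./section_freqs.json')
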
def generate_all_queries():
    concepts = utils.load_json_data('./resources/autoimmune-concepts.json')
    concept2queries = {}
    for c in concepts:
        concept2queries[c] = generate_prospector_query(concepts[c])
        print '%s done' % c
    utils.save_json_array(concept2queries, './resources/mimir_queries.json')
def analyse_doc_anns(json_doc, file_key, rule_executor, text_reader, output_folder,
                     fn_pattern='se_ann_%s.json', es_inst=None, es_output_index=None, es_output_doc='doc',
                     study_analyzer=None, contextualised_concept_index='semehr_ctx_concepts',
                     ctx_doc_type='ctx_concept'):
    ann_doc = SemEHRAnnDoc()
    ann_doc.load(json_doc, file_key=file_key)
    read_obj = text_reader.read_full_text(ann_doc.file_key)
    patient_id = None
    if isinstance(read_obj, dict):
        text = read_obj['text']
        patient_id = read_obj['pid']
    else:
        text = read_obj
    if text is None:
        logging.error('file [%s] full text not found' % ann_doc.file_key)
        return
    reader = WrapperTextReader(text)
    process_doc_rule(ann_doc, rule_executor, reader, None, study_analyzer)
    if es_inst is None:
        utils.save_json_array(ann_doc.serialise_json(), join(output_folder, fn_pattern % ann_doc.file_key))
    else:
        data = ann_doc.serialise_json()
        data['doc_id'] = file_key
        data['patient_id'] = patient_id
        es_inst.index_new_doc(index=es_output_index, doc_type=es_output_doc,
                              data=data, doc_id=file_key)
        # index contextualised concepts
        if contextualised_concept_index is not None:
            for ann in data['annotations']:
                index_ctx_concept(ann, contextualised_concept_index, ctx_doc_type, es_inst)

    return ann_doc.serialise_json()
def patient_level_analysis(complete_anns_file, output_file):
    lines = utils.read_text_file(complete_anns_file)
    pos_condition2patients = {}
    patient2conditions = {}
    positive_labels = ['posM', 'hisM']
    indexable_labels = ['posM', 'hisM', 'negM']
    for l in lines:
        arr = l.split('\t')
        label = arr[2]
        condition = arr[3]
        pid = arr[8]
        if label in positive_labels:
            pos_condition2patients[condition] = [pid] if condition not in pos_condition2patients else \
                pos_condition2patients[condition] + [pid]
        if label in indexable_labels:
            pd = patient2conditions[pid] if pid in patient2conditions else {}
            patient2conditions[pid] = pd
            if label in pd:
                pd[label].append(condition)
                pd[label] = list(set(pd[label]))
            else:
                pd[label] = [condition]
    utils.save_json_array(
        {
            'p2c': patient2conditions,
            'c2p': pos_condition2patients
        }, output_file)
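
# Worked example of the tab-separated rows consumed above; the column meanings
# are inferred from the indices used (2 = contextual label, 3 = condition,
# 8 = patient id) and the other columns are hypothetical placeholders.
def _example_patient_level_row():
    row = '\t'.join(['x', 'doc_001', 'posM', 'diabetes', 'x', 'x', 'x', 'x', 'patient_42'])
    arr = row.split('\t')
    print('label=%s, condition=%s, pid=%s' % (arr[2], arr[3], arr[8]))
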
def collect_result(self, output_file, graph_file_path):
    files = [
        f for f in listdir(self._doc_pth) if isfile(join(self._doc_pth, f))
    ]
    f_did = []
    for f in files:
        sr = re.search(self._did_pattern, f, re.IGNORECASE)
        if sr:
            f_did.append((f, sr.group(1)))
    results = []
    logging.info('collecting results ...')
    utils.multi_thread_tasking(
        lst=f_did,
        num_threads=10,
        process_func=DocCohort.collect_doc_anns_by_types,
        args=[self._doc_pth, self.collect_semantic_types, results])
    logging.info('total anns collected %s' % len(results))
    ret = {'concepts': {}, 'p2c': {}}
    for r in results:
        if r['d'] in self._d2p:
            p = self._d2p[r['d']]
            if p not in ret['p2c']:
                ret['p2c'][p] = {}
            pd = ret['p2c'][p]
            if r['cui'] not in ret['concepts']:
                ret['concepts'][r['cui']] = r['pref']
            if r['cui'] not in pd:
                pd[r['cui']] = 1
            else:
                pd[r['cui']] += 1
        else:
            logging.error('doc %s not in cohort map' % r['d'])
    utils.save_json_array(ret, output_file)
    utils.save_json_array(DocCohort.result_to_graph(ret), graph_file_path)
    logging.info('result collected')
def parse_es_docs(
    es,
    q,
    writing_es_host,
    writing_index_name,
    writing_doc_type,
    doc_type='eprdoc',
    full_text_field='fulltext',
    output_file='../resources/wrappers/sen_data_extracted.json',
    failed_docs_file='../resources/wrappers/sen_failed_docs.json',
):
    writing_es = Elasticsearch([writing_es_host], verify_certs=False)
    # scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500)
    ret_count, docs = es.search(doc_type, q, offset=0, size=30)
    container = []
    failed_docs = []
    print 'anonymising... %s, %s' % (len(docs), ','.join(
        [d['_id'] for d in docs]))
    utils.multi_thread_tasking_it(docs,
                                  1,
                                  do_doc_anonymisation,
                                  args=[
                                      writing_es, writing_index_name,
                                      writing_doc_type, full_text_field,
                                      container, failed_docs
                                  ])
    print 'anonymisation finished. saving results...'
    utils.save_json_array(container, output_file)
    utils.save_json_array(failed_docs, failed_docs_file)
    print 'done'
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file,
                              new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [
            f for f in listdir(dd)
            if isfile(join(dd, f)) and f.endswith('.lst')
        ]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n',
                          join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
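
# Hedged usage sketch: all paths are hypothetical; each dictionary folder is
# expected to contain .lst term-list files, which are merged per file name.
def _example_merge_mappings():
    merge_mappings_dictionary(map_files=['./map_a.json', './map_b.json'],
                              dict_dirs=['./dicts_a', './dicts_b'],
                              new_map_file='./merged_map.json',
                              new_dict_folder='./merged_dicts')
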
def predict_exp(corpus_trans_file, ann_file, cache_file, output_file):
    # initialise pattern instances from documents
    if not isfile(cache_file):
        # load labelled data
        ann_lines = utils.read_text_file(ann_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        doc_to_pt = {}
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[1]
            doc_to_pt[doc_id] = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs',
                                   '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({
                's': int(l[2]),
                'e': int(l[3]),
                'signed_label': l[4],
                'gt_label': l[5]
            })
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multi-threaded processing of the labelled docs
        print 'processing docs...'
        utils.multi_thread_tasking(doc_anns,
                                   30,
                                   do_process_labelled_doc,
                                   args=[ptn_insts])
        jl.dump({'insts': ptn_insts, 'doc_to_pt': doc_to_pt}, cache_file)
    else:
        cached = jl.load(cache_file)
        ptn_insts = cached['insts']
        doc_to_pt = cached['doc_to_pt']

    cp = sp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    ret = []
    for inst in ptn_insts:
        print 'predicting [%s]...' % inst.sentence
        acc = cp.predcit(inst)
        print 'accuracy: %s' % acc
        ann = inst.annotations[0]
        ret.append(
            (doc_to_pt[inst.doc_id], inst.doc_id, str(ann['s']), str(ann['e']),
             ann['signed_label'], ann['gt_label'], str(acc)))
    s = []
    for r in ret:
        s.append(u'\t'.join(r))
    print u'\n'.join(s)
    utils.save_json_array(ret, output_file)
    return ret
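
# Hedged usage sketch: the file paths are hypothetical. The annotation file is
# expected to be tab-separated with the columns read above
# (patient id, doc id, start, end, signed label, gold label).
def _example_predict_exp():
    return predict_exp(corpus_trans_file='./corpus_model.pickle',
                       ann_file='./labelled_anns.tsv',
                       cache_file='./pattern_insts_cache.jl',
                       output_file='./predictions.json')
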
def query_liver_diseases(concepts, prefix, patient_filter, d2time):
    es = SemEHRES.get_instance()
    results, docs = es.summary_patients_by_concepts(concepts,
                                                    filter_func=None,
                                                    args=[d2time],
                                                    patient_filters=patient_filter,
                                                    data_collection_func=first_time_collector)
    utils.save_json_array(results, './addiction_res/%s_results.json' % prefix)
    utils.save_json_array(docs, './valid_doc_files/%s_valid_docs.json' % prefix)
def mapping_headings(heading_stats_file, output_file, freq_threshold=1000):
    heading_freq = utils.load_json_data(heading_stats_file)
    sorted_top_k_headings = sorted([(h, heading_freq[h])
                                    for h in heading_freq],
                                   key=lambda x: -x[1])[:freq_threshold]
    s = ''
    for r in sorted_top_k_headings[:500]:
        s += '%s\t%s\n' % (r[0], r[1])
    utils.save_string(s, './top500heading_discharge_summary.txt')
    utils.save_json_array(sorted_top_k_headings, output_file)
def compute_all_subconcepts(concepts, file_path):
    c_to_subs = {}
    umls = UMLSAPI(_umls_api_key)
    container = []
    utils.multi_thread_tasking(concepts,
                               10,
                               do_compute_subconcept,
                               args=[umls, container])
    for p in container:
        c_to_subs[p[0]] = p[1]
    utils.save_json_array(c_to_subs, file_path)
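
# Hedged usage sketch: the CUIs are placeholders; a valid UMLS API key is
# assumed to be available via the module-level _umls_api_key used above.
def _example_compute_subconcepts():
    compute_all_subconcepts(['C0011849', 'C0004096'],
                            './concept_to_subconcepts.json')
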
def test_models_and_ensemble(model_files, x, weights=None, outcome='death', threshold=0.5, result_csv=None,
                             severity_conf=None, generate_figs=False, auc_fig_file=None,
                             calibration_fig_file=None, event_rate=None, nri_json=None):
    """
    do tests on individual models and also ensemble methods
    :param event_rate:
    :param model_files:
    :param x:
    :param weights:
    :param outcome:
    :param threshold:
    :param result_csv:
    :param severity_conf: severity configuration for setting weights on the alignments between model
    outcomes and what to predict
    :param generate_figs: generate figs or not
    :param auc_fig_file: roc curve figure output file
    :param calibration_fig_file: calibration figure output file
    :param nri_json: output JSON file to save the NRI results to
    :return:
    """
    data = {}
    ve = me.BasicEnsembler()
    y_list = []
    predicted_list = []
    models = []
    for idx in range(len(model_files)):
        mf = model_files[idx]
        m = load_model(mf)
        models.append(m)
        y, pred = test_single_model(m, x, outcome=outcome, threshold=threshold)
        y_list.append(y)
        predicted_list.append(pred)
        ve.add_model(m, 1 if weights is None else weights[idx])
        # results['{0}\n({1})'.format(m.id, m.model_type)] = result

    ve.mode = me.VoteMode.competence_fusion
    y, pred = test_ensemble(ve, x, threshold=threshold, outcome=outcome, severity_conf=severity_conf,
                            generate_figs=generate_figs)
    y_list.append(y)
    predicted_list.append(pred)
    results, nri_result = eval.evaluate_pipeline(y_list, predicted_list, model_names=[m.id for m in models] + ['ensemble model'],
                                     threshold=threshold,
                                     figs=generate_figs, outcome=outcome, auc_fig_file=auc_fig_file,
                                     calibration_fig_file=calibration_fig_file,
                                     event_rate=event_rate)
    model_labels = ['{0}\n({1})'.format(m.id, m.model_type) for m in models] + ['ensemble model']
    for idx in range(len(model_labels)):
        data[model_labels[idx]] = {}
        for k in results:
            data[model_labels[idx]][k] = results[k][idx]
    result_df = eval.format_result(data)
    if result_csv is not None:
        result_df.to_csv(result_csv, sep='\t', index=False)
    if nri_json is not None:
        utils.save_json_array(nri_result, nri_json)
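
# Hedged usage sketch: model paths and the evaluation data `x` are hypothetical;
# their exact format depends on load_model/test_single_model defined elsewhere.
def _example_test_ensemble(x):
    test_models_and_ensemble(model_files=['./model_a.joblib', './model_b.joblib'],
                             x=x,
                             weights=[1, 2],
                             outcome='death',
                             threshold=0.5,
                             result_csv='./ensemble_results.tsv',
                             nri_json='./nri_results.json')
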
def extend_manual_mappings(mapping_file, new_mapping_file):
    umls = get_umls_client_inst('./resources/HW_UMLS_KEY.txt')
    m = utils.load_json_data(mapping_file)
    for k in m:
        logging.info('working on %s' % k)
        new_concepts = [] + m[k]['concepts']
        for c in m[k]['concepts']:
            new_concepts += umls.transitive_narrower(c)
        m[k]['concepts'] = list(set(new_concepts))
    logging.info('saving new results to %s' % new_mapping_file)
    utils.save_json_array(m, new_mapping_file)
def encode_doc_anns(d2anns, ann_ctx_file=None):
    ann_context_list = []
    for d in d2anns:
        print('getting %s' % d)
        doc = get_es_instance().get(d2anns[d][0]['index'],
                                    d,
                                    doc_type=_es_doc_type)
        ann_context_list += extract_text(doc['_source']['fulltext'], d2anns[d])
    if ann_ctx_file is not None:
        utils.save_json_array(ann_context_list, ann_ctx_file)
        print('annotation context results saved to %s' % ann_ctx_file)
    return ann_context_list
def extract_study_phenotypes(study_folder, output_file, exclude_filter=None):
    reg_p = re.compile(exclude_filter) if exclude_filter is not None else None
    all_phenotype_concepts = {}
    for f in listdir(study_folder):
        if reg_p is not None:
            m = reg_p.match(f)
            if m is not None:
                print '%s matched [%s], skipped' % (f, m)
                continue
        folder = join(study_folder, f)
        if isdir(folder):
            print 'inspecting %s ...' % folder
            if isfile(join(folder, 'study_analyzer.pickle')):
                sa = StudyAnalyzer.deserialise(
                    join(folder, 'study_analyzer.pickle'))
                for c in sa.study_concepts:
                    if c.name in all_phenotype_concepts:
                        all_phenotype_concepts[c.name]['freq'] += 1
                    else:
                        all_phenotype_concepts[c.name] = {
                            "phenotype": c.name,
                            "concepts": list(c.concept_closure),
                            "subtypes": [{"phenotype": t,
                                          "concept": c.term_to_concept[t]['mapped']}
                                         for t in c.term_to_concept],
                            "freq": 1
                        }
                    # for t in c.term_to_concept:
                    #     if t in all_phenotype_concepts:
                    #         all_phenotype_concepts[t]['freq'] = all_phenotype_concepts[t]['freq'] + 1
                    #     else:
                    #         all_phenotype_concepts[t] = {"phenotype": t,
                    #                                      "concepts": [c.term_to_concept[t]['mapped']]
                    #                                      if c.term_to_concept[t]['closure'] == 0 else
                    #                                      list(set(list(c.concept_closure) +
                    #                                               [c.term_to_concept[t]['mapped']])),
                    #                                      "freq": 1}
    print 'total phenotypes %s' % len(all_phenotype_concepts)
    if len(all_phenotype_concepts) > 0:
        utils.save_json_array(all_phenotype_concepts, output_file)
        print 'saved to %s' % output_file
    else:
        print 'no data found'
def icd10_mapping_convert(json_file, output_json):
    c2concepts = utils.load_json_data(json_file)
    result = {}
    for c in c2concepts:
        r = {
            "tc": {
                "closure": len(c2concepts[c]),
                "mapped": c2concepts[c][0]
            },
            "concepts": c2concepts[c]
        }
        result[c] = r
    utils.save_json_array(result, output_json)
    logging.info('all done')
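
# Worked example of the conversion above (the concept ids are placeholders):
# an input entry {"Asthma": ["C0004096", "C0038218"]} becomes
# {"Asthma": {"tc": {"closure": 2, "mapped": "C0004096"},
#             "concepts": ["C0004096", "C0038218"]}}.
def _example_icd10_convert():
    utils.save_json_array({'Asthma': ['C0004096', 'C0038218']}, './c2concepts.json')
    icd10_mapping_convert('./c2concepts.json', './c2concepts_converted.json')
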
def break_down_study_concepts(scs, umls, new_mapping_file):
    mmc = {}
    for sc in scs:
        cui = sc.term_to_concept[sc.terms[0]]['mapped']
        m = {"tc": {"closure": 1, "mapped": cui}, "concepts": [cui]}
        mmc[sc.name] = m

        c2n = get_concepts_names(umls, list(sc.concept_closure))

        for c in sc.concept_closure:
            if c != cui:
                # for each single concept create a studyconcept
                mc = {"tc": {"closure": 1, "mapped": c}, "concepts": [c]}
                mmc[c2n[c]] = mc
    utils.save_json_array(mmc, new_mapping_file)
def export_pickled_study_concept_2_flat_json(pickle_file, output_file):
    if isfile(pickle_file):
        obj = {}
        sa = StudyAnalyzer.deserialise(pickle_file)
        for sc in sa.study_concepts:
            for t in sc.term_to_concepts:
                for c in sc.term_to_concepts[t]['closure']:
                    obj[c] = {
                        "tc": {
                            "closure": 1,
                            "mapped": c
                        },
                        "concepts": [c]
                    }

        utils.save_json_array(obj, output_file)
        print 'flat json saved to %s' % output_file
def predict_to_eHOST_results(predict_setting):
    ss = StrokeSettings(predict_setting)
    if 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'direct_nlp':
        logging.info('predicting with direct nlp...')
        predicted_results = direct_nlp_prediction(ss.settings)
    elif 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'hybrid':
        predicted_results = hybrid_prediciton(ss.settings)
    else:
        logging.info('predicting...')
        predicted_results = predict(ss.settings)
    output_eHOST_format(predicted_results, ss.settings['output_folder'])
    logging.info('results saved to %s' % ss.settings['output_folder'])
    if 'output_file' in ss.settings:
        d2ann = {}
        for d in predicted_results:
            d2ann[d] = [{'label': t['label'], 'ann': t['ann'].to_dict()} for t in predicted_results[d]]
        utils.save_json_array(d2ann, ss.settings['output_file'])
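
# Hedged sketch of the settings keys read above; only the keys visible in this
# function are shown, the real StrokeSettings file may need more. Paths are
# hypothetical.
_example_predict_settings = {
    'predict_mode': 'direct_nlp',            # or 'hybrid'; any other value falls back to predict()
    'output_folder': './ehost_output',       # where output_eHOST_format writes its results
    'output_file': './predicted_anns.json'   # optional; triggers the JSON dump of label/ann dicts
}
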
def output_phenotypes(phenotype_file, phenotype_performance, c_map_file, output_file):
    p = utils.load_json_data(phenotype_file)
    c_map = utils.load_json_data(c_map_file)
    new_p = {}
    p_lines = utils.read_text_file(phenotype_performance)
    for l in p_lines[1:]:
        arr = l.split('\t')
        new_p[arr[0]] = p[arr[0]]
        pt = new_p[arr[0]]
        concepts = pt['concepts']
        pt['concepts'] = {}
        pt['prevalence'] = 0
        for c in concepts:
            pt['concepts'][c] = 0 if c not in c_map else c_map[c]['freq']
            pt['prevalence'] += pt['concepts'][c]
    utils.save_json_array(new_p, output_file)
    print 'new data saved to %s' % output_file
def add_concept_level_freqs(data_folder, c_map_file):
    reg_p = re.compile(".*annotations\\.csv")
    c_map = utils.load_json_data(c_map_file)
    for f in listdir(data_folder):
        if reg_p is not None:
            m = reg_p.match(f)
            if m is not None:
                print '%s matched, reading...' % f
                lines = utils.read_text_file(join(data_folder, f))
                for l in lines:
                    arr = l.split('\t')
                    if arr[0] not in c_map:
                        continue
                    if 'freq' not in c_map[arr[0]]:
                        c_map[arr[0]]['freq'] = 0
                    c_map[arr[0]]['freq'] += int(arr[1])
    utils.save_json_array(c_map, c_map_file)
def populate_phenotype_validation_results(phenotype_def_file,
                                          complete_validation_file, c_map_file,
                                          output_file):
    c_map = populate_concept_level_performance(complete_validation_file, c_map_file)
    phenotypes = utils.load_json_data(phenotype_def_file)
    for p_name in phenotypes:
        p = phenotypes[p_name]
        p['validation'] = {}
        for c in p['concepts']:
            if c not in c_map:
                continue
            for label in c_map[c]:
                if label in p['validation']:
                    p['validation'][label] += c_map[c][label]
                else:
                    p['validation'][label] = c_map[c][label]
    utils.save_json_array(phenotypes, output_file)
    print 'done'
def get_what_is_changing(ann_folder,
                         text_folder,
                         output_file,
                         eHostAnnFile=True):
    """
    get what is getting better/worse
    :param ann_folder:
    :param text_folder:
    :param output_file:
    :return:
    """
    nlp = rr.get_nlp_instance()
    files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))]
    type2abstractions = {}
    for f in files:
        anns = []
        text_file = join(text_folder, f[0:-14])
        if eHostAnnFile:
            d = eHostAnnDoc(join(ann_folder, f))
            anns = d.get_ess_entities(no_context=True)
        else:
            d = eHostGenedDoc(join(ann_folder, f))
            anns = d.get_ess_entities()
        if len(anns) == 0:
            logging.info('anns is empty for [{:s}]'.format(f))
        text = utils.read_text_file_as_string(join(text_folder, f[0:-14]),
                                              encoding='cp1252')
        sents = rr.get_sentences_as_anns(nlp, text)
        for ann in anns:
            for s in sents:
                if ann.overlap(s):
                    abss = rr.AbstractedSentence(1)
                    abss.text = s.str
                    result = abss.get_abstaction_by_pos(
                        abss.locate_pos(ann.str), nlp)
                    if result is None:
                        logging.info('%s not found in %s' % (ann.str, f))
                        continue
                    type = ann.label
                    if type not in type2abstractions:
                        type2abstractions[type] = []
                    type2abstractions[type].append(result.to_dict())
    logging.debug(type2abstractions)
    utils.save_json_array(type2abstractions, output_file)
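
# Hedged usage sketch: folders and output path are hypothetical; annotation
# file names are assumed to be the text file name plus a 14-character suffix
# (e.g. '.knowtator.xml'), which is what the f[0:-14] slicing above relies on.
def _example_what_is_changing():
    get_what_is_changing(ann_folder='./ann_files',
                         text_folder='./text_files',
                         output_file='./type_to_abstractions.json',
                         eHostAnnFile=True)
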
def encode_text(ann_ctxs, word_to_index_file=None):
    # Tokenize the sentences into words
    tokenized_sentences = [
        nltk.word_tokenize(' '.join(ctx['prev'] + ctx['next']).lower())
        for ctx in ann_ctxs
    ]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print "Found %d unique words tokens." % len(word_freq.items())

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size - len(ann_type_tokens))
    index_to_word = [x[0] for x in vocab]
    index_to_word += ann_type_tokens
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    if word_to_index_file is not None:
        utils.save_json_array(word_to_index, word_to_index_file)
    return word_to_index
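
# Hedged usage sketch: the contexts are toy placeholders with the 'prev'/'next'
# shape the function expects; the module-level vocabulary_size and
# ann_type_tokens referenced above are assumed to be configured.
def _example_encode_text():
    contexts = [{'prev': ['patient', 'denies'], 'next': ['chest', 'pain']},
                {'prev': ['history', 'of'], 'next': ['asthma']}]
    return encode_text(contexts, word_to_index_file='./word_to_index.json')
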
def analyse_doc_anns(ann_doc_path,
                     rule_executor,
                     text_reader,
                     output_folder,
                     fn_pattern='se_ann_%s.json',
                     study_analyzer=None):
    p, fn = split(ann_doc_path)
    file_key = fn[:fn.index('.')]
    json_doc = utils.load_json_data(ann_doc_path)
    ann_doc = SemEHRAnnDoc()
    ann_doc.load(json_doc, file_key=file_key)
    text = text_reader.read_full_text(ann_doc.file_key)
    if text is None:
        logging.error('file [%s] full text not found' % ann_doc.file_key)
        return
    reader = WrapperTextReader(text)
    process_doc_rule(ann_doc, rule_executor, reader, None, study_analyzer)
    utils.save_json_array(ann_doc.serialise_json(),
                          join(output_folder, fn_pattern % ann_doc.file_key))
    return ann_doc.serialise_json()
def parse_disease_phenotypes(disease_phenotype_csv, disease_model_json):
    lines = utils.read_text_file(disease_phenotype_csv)
    dis_to_data = {}
    for l in lines[1:]:
        arr = l.split(',')
        lv4_id = arr[4]
        lv4_disease = arr[5]
        hpo_label = arr[7]
        hpo_id = arr[8]
        test = arr[9]
        test_id = arr[10]
        dis_data = []
        if lv4_disease in dis_to_data:
            dis_data = dis_to_data[lv4_disease]
        else:
            dis_to_data[lv4_disease] = dis_data
        if len(test.strip()) > 0:
            dis_data.append({'test': test, 'test_id': test_id})
        else:
            dis_data.append({'hpo_label': hpo_label, 'hpo_id': hpo_id})
    utils.save_json_array(dis_to_data, disease_model_json)
def populate_concept_level_performance(complete_validation_file, c_map_file):
    if isfile(c_map_file):
        return utils.load_json_data(c_map_file)
    lines = utils.read_text_file(complete_validation_file)
    concept2label = {}
    for l in lines[1:]:
        arr = l.split('\t')
        label = arr[2]
        concept = arr[8]
        c_map = None
        if concept not in concept2label:
            c_map = {}
            concept2label[concept] = c_map
        else:
            c_map = concept2label[concept]
        if label not in c_map:
            c_map[label] = 1
        else:
            c_map[label] += 1
    utils.save_json_array(concept2label, c_map_file)
    return concept2label
def convert_csv_annotations(csv_file,
                            text_folder,
                            ann_folder,
                            mapping_file,
                            annotated_anns_file,
                            id_pattern='%s-%s',
                            ann_file_pattern='%s.txt.knowtator.xml'):
    with open(csv_file, newline='') as cf:
        reader = csv.DictReader(cf)
        label2concepts = {}
        d2annotated_anns = {}
        for r in reader:
            d2annotated_anns[r['doc_id'] + ".txt"] = [{
                's': r['start'],
                'e': r['end']
            }]
            if r['Skip Document'] != 'Yes':
                utils.save_string(r['text'],
                                  join(text_folder, r['doc_id'] + ".txt"))
                elem_annotations = ET.Element("annotations")
                elem_annotations.set('textSource', r['doc_id'])
                mention_id = id_pattern % (r['doc_id'], 0)
                if r['Correct'] == 'Yes' and r['Negation'] == 'NOT Negated':
                    AnnConverter.create_elem_ann(elem_annotations,
                                                 mention_id, r['start'],
                                                 r['end'],
                                                 r['string_orig'],
                                                 r['icd10-ch'])
                xml = ET.tostring(elem_annotations,
                                  encoding='unicode',
                                  method='xml')
                utils.save_string(
                    xml, join(ann_folder, ann_file_pattern % r['doc_id']))
                if r['icd10-ch'] not in label2concepts:
                    label2concepts[r['icd10-ch']] = []
                if r['cui'] not in label2concepts[r['icd10-ch']]:
                    label2concepts[r['icd10-ch']].append(r['cui'])
        utils.save_json_array(label2concepts, mapping_file)
        utils.save_json_array(d2annotated_anns, annotated_anns_file)
def query_all_concepts():
    total = 0
    docs = []
    concept2query = utils.load_json_data('./resources/mimir_queries.json')
    for c in concept2query:
        print 'querying %s' % c
        r = query_mimir('postQuery', {'queryString': concept2query[c]})
        qid = get_xml_data(r, 'm:data/m:queryId', mimir_ns)
        print 'query id: %s' % qid

        r = query_mimir('documentsCount', {'queryId': qid})
        document_count = get_xml_data(r, 'm:data/m:value', mimir_ns)
        print 'documentCount: %s' % document_count
        if document_count != '':
            document_count = int(document_count)
            if document_count > 0:
                total += document_count
                docs.append(
                    random_pick_results(c, qid, document_count,
                                        min(5, document_count)))
                print 'random picked %s' % c
    utils.save_json_array(docs, './samples/samples.json')
    print 'total docs: %s' % total
def encode_ann_ctx(dic_file, ann_ctx_file, output_file=None):
    word_to_index = utils.load_json_data(dic_file)
    ann_ctxs = utils.load_json_data(ann_ctx_file)
    encoded = []
    for ann in ann_ctxs:
        encoded.append({
            'prev': [
                word_to_index[w.lower()]
                for w in nltk.word_tokenize(' '.join(ann['prev']).lower())
            ],
            'next': [
                word_to_index[w.lower()]
                for w in nltk.word_tokenize(' '.join(ann['next']).lower())
            ],
            'label': ann['label'],
            'annId': ann['annId'],
            'label_encoded': word_to_index[ann['label']]
        })
    if output_file is not None:
        utils.save_json_array(encoded, output_file)
    return encoded