Пример #1
0
def calculate_case_statistics():
    """Print per-case text statistics as table rows.

    A header line is printed first. Then, for every entry of
    PatientCase.ALL in sorted order, one row is printed whose cells are
    joined by ' & ' and followed by a trailing double backslash
    (LaTeX-style row terminator). The cells are: the case code, the
    number of newline-separated lines in the case text, the number of
    whitespace-separated tokens found among the stopwords, the size of
    the case vector, and the number of vector keys found among the
    medical terms. A final empty line is printed after the rows.
    """
    stopwords = data.get_stopwords()
    medical_terms = data.get_medical_terms()
    print("Case | Lines | Stopwords | Terms | Medical terms")
    for code, case in sorted(PatientCase.ALL.items()):
        line_count = len(case.text.split('\n'))
        stopword_count = sum(1 for token in case.text.split()
                             if token in stopwords)
        term_count = len(case.vector)
        medical_count = sum(1 for key in case.vector.keys()
                            if key in medical_terms)
        cells = [code, str(line_count), str(stopword_count),
                 str(term_count), str(medical_count)]
        print(' & '.join(cells) + r' \\')
    print()
Пример #2
0
def main(script):
    """Populate the data objects, then emit every table and statistic.

    The 'script' argument is accepted but not used by this function.
    """
    # All objects must be populated before anything is generated
    data.main()

    # LaTeX table listing every stopword, in sorted order
    stopword_entries = sorted(data.get_stopwords())
    _generate_columned_table(stopword_entries, 6,
                             'stopwords', 'Norwegian stopwords')

    # LaTeX table listing every medical term, in sorted order
    medical_term_entries = sorted(data.get_medical_terms())
    _generate_columned_table(medical_term_entries, 3,
                             'medicalterms', 'Medical terms')

    # Remaining outputs, in the same order as before
    generate_cases_table()
    calculate_chapter_statistics()
    calculate_case_statistics()
Пример #3
0
def parse_case_file(path, stopwords=None):
    """Read the case file at 'path' and create a PatientCase from it.

    Every line is stripped and split on single spaces; tokens whose
    lowercase form is in 'stopwords' are dropped and the rest are joined
    with single spaces again. Lines that end up empty are skipped, and a
    single trailing period is removed from the remaining ones.

    The case is created as PatientCase(code, text), where 'code' is the
    file's base name without its extension and with every occurrence of
    'case' removed, and 'text' is the kept lines joined by newlines.
    Nothing is returned.

    'stopwords' is any container supporting 'in'. When omitted (None),
    get_stopwords() is called at call time, so defining this function no
    longer triggers the stopword lookup.
    """
    if stopwords is None:
        stopwords = get_stopwords()

    text = []
    with open(path) as f:
        # Iterate the file lazily instead of materializing readlines()
        for raw_line in f:
            line = ' '.join(word for word in raw_line.strip().split(' ')
                            if word.lower() not in stopwords)
            if not line:
                continue
            if line.endswith('.'):
                line = line[:-1]  # Remove period from queries
            text.append(line)

    filename = os.path.splitext(os.path.basename(path))[0]
    PatientCase(filename.replace('case', ''), '\n'.join(text))
Пример #4
0
from math import log

from whoosh.index import create_in, open_dir, exists_in
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.qparser import QueryParser, OrGroup

from data import ATC, ICD, PatientCase, Therapy, populate_all, get_stopwords


# Directory in which the whoosh index is stored
INDEX_DIR = 'whooshindex'


# Analyzer that uses the project's stopwords as its stop list
ANALYZER = StandardAnalyzer(stoplist=get_stopwords())


# Whoosh schema for ATC codes; both fields are stored
ATC_SCHEMA = Schema(
    code=ID(stored=True),
    title=TEXT(stored=True),
)


# Whoosh schema for ICD10 codes; 'label' and 'description' are
# analyzed with ANALYZER, and 'code', 'short' and 'label' are stored
ICD_SCHEMA = Schema(
    code=ID(stored=True),
    short=ID(stored=True),
    label=TEXT(stored=True, analyzer=ANALYZER),
    type=TEXT,
    icpc2_code=ID,
    icpc2_label=TEXT,
    synonyms=TEXT,
    terms=TEXT,
    inclusions=TEXT,
    exclusions=TEXT,
    description=TEXT(analyzer=ANALYZER),
)