예제 #1
0
def main(args, debug=False):
    """Summarise the non-text fields of a data set and write them to CSV.

    Args:
        args: Object exposing the attributes ``data_path``, ``spec``,
            ``to`` and ``bysite`` (presumably parsed command-line
            arguments -- confirm against the caller).
        debug: When true, a notice is printed and only the first 10
            selected fields are processed.

    Nothing is returned; the output is the CSV file written to ``args.to``.
    """
    max_fields = None
    if debug:
        max_fields = 10
        print('!!! Debugging - will only check first {} fields'.format(
            max_fields))

    # Read every option up front so a missing attribute fails before any
    # loading work starts.
    source, spec_file, destination, by_site = (
        args.data_path, args.spec, args.to, args.bysite)

    spec = load_spec(spec_file)
    spec_frame = pd.DataFrame(spec).T

    ccd = CCD(source, spec)

    # Only fields whose spec entry declares one of these datatypes are
    # checked; a limit of None in the slice keeps all of them.
    checked_datatypes = ['numeric', 'list', 'list / logical', 'Logical']
    selected = [
        code for code, item in spec.items()
        if item['Datatype'] in checked_datatypes
    ][:max_fields]

    # One result per selected field.
    frames = [
        row_generator(code, ccd=ccd, spec=spec, by=by_site, verbose=True)
        for code in selected
    ]

    # Stack the per-field results, then attach the rest of the data spec.
    summary = pd.merge(pd.concat(frames), spec_frame, on='NHICcode')

    for gap_col in ('gap_period', 'gap_start', 'gap_stop'):
        summary[gap_col] = to_decimal_hours(summary[gap_col])

    # Columns to export, in output order.
    export_cols = (
        "NHICcode site_id dataItem level count nunique n pct "
        "min 25% 50% 75% max mean std "
        "coerced_values miss_by_episode gap_period gap_start gap_stop"
    ).split()
    summary[export_cols].to_csv(destination)
예제 #2
0
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

import sqlite3

from inspectEHR.utils import load_spec
from inspectEHR.CCD import CCD
from inspectEHR.data_classes import DataRaw, ContMixin, CatMixin


# Alternative data source, currently disabled:
# ccd = CCD(os.path.join('data-raw', 'anon_public_da1000.JSON'), random_sites=True)
refs = load_spec(os.path.join('data-raw', 'N_DataItems.yml'))

# TODO (2017-07-14): add a _load_from_sqlite method (i.e. connect to sqlite).
# sqlite3 is already imported at the top of the file.
conn = sqlite3.connect('ccd.db')
c = conn.cursor()

# TODO: once connected, add a _build_df that just queries sqlite and, given
# an NHICcode and byvar, builds the df for item1d or item2d.
nhic_code = "NIHR_HIC_ICU_0108"
# NOTE(review): this string-formatted query is not used by the read below,
# which binds nhic_code as a parameter instead.
sql = "SELECT * FROM item_tb WHERE NHICcode == '{}'".format(nhic_code)
df = pd.read_sql_query(
    sql='SELECT * FROM item_tb WHERE NHICcode = ?',
    con=conn,
    params=(nhic_code, ),
)
# Bare expressions: they only show output when run interactively.
df.shape
df.columns
# Narrow the frame to the four columns used from here on.
df = df.loc[:, ['site_id', 'episode_id', 'value', 'time']]