def main(args, debug=False):
    """Build per-field result frames for the non-text spec items and save them as CSV.

    Args:
        args: Object exposing ``data_path``, ``spec``, ``to`` and ``bysite``
            attributes (presumably a parsed command-line namespace - confirm
            against the caller).
        debug: When true, only the first 10 selected fields are processed and
            a notice is printed.

    Returns:
        None. The selected columns of the merged results are written to the
        path given by ``args.to``.
    """
    max_fields = 10 if debug else None
    if debug:
        print('!!! Debugging - will only check first {} fields'.format(
            max_fields))

    # Read every option up front, in the same order as before, so a missing
    # attribute still fails before any data is loaded.
    data_path = args.data_path
    spec_path = args.spec
    results_path = args.to
    bysite = args.bysite

    spec = load_spec(spec_path)
    spec_df = pd.DataFrame(spec).T
    ccd = CCD(data_path, spec)

    # Only items whose 'Datatype' is one of these are checked.
    non_text_fields = ['numeric', 'list', 'list / logical', 'Logical']
    selected_codes = [
        code for code, item in spec.items()
        if item['Datatype'] in non_text_fields
    ]
    # A slice bound of None keeps every selected field.
    fields = selected_codes[:max_fields]

    # One frame per field, stacked into a single frame below.
    frames = [
        row_generator(code, ccd=ccd, spec=spec, by=bysite, verbose=True)
        for code in fields
    ]
    stacked = pd.concat(frames)

    # Attach the remaining spec columns to each result row.
    results = pd.merge(stacked, spec_df, on='NHICcode')

    for gap_col in ('gap_period', 'gap_start', 'gap_stop'):
        results[gap_col] = to_decimal_hours(results[gap_col])

    col_order = "NHICcode site_id dataItem level count nunique n pct min 25% 50% 75% max mean std coerced_values miss_by_episode gap_period gap_start gap_stop".split()
    ordered = results[col_order]
    ordered.to_csv(results_path)
# Scratch script: loads the data-item spec, opens the local SQLite database
# 'ccd.db', and pulls every row of `item_tb` for a single NHIC code into a
# pandas DataFrame restricted to four columns.
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
import sqlite3

from inspectEHR.utils import load_spec
from inspectEHR.CCD import CCD
from inspectEHR.data_classes import DataRaw, ContMixin, CatMixin

# Disabled: building the CCD object directly from the JSON extract.
# ccd = CCD(os.path.join('data-raw', 'anon_public_da1000.JSON'), random_sites=True)
refs = load_spec(os.path.join('data-raw', 'N_DataItems.yml'))

# - [ ] @TODO: (2017-07-14)
# - need a _load_from_sqlite method (i.e. connect to sqlite)
# NOTE(review): sqlite3 is already imported at the top; this second import is
# redundant.
import sqlite3
conn = sqlite3.connect('ccd.db')
# NOTE(review): the connection is not closed anywhere in the visible code.
c = conn.cursor()

# - then, once connected, need to extract a _build_df (which just queries sqlite)
# given NHICcode and byvar, build df for item1d or item2d
nhic_code = "NIHR_HIC_ICU_0108"
# NOTE(review): `sql` is built by string formatting but never used in the
# visible code; the query below binds the code as a parameter instead.
sql = "SELECT * FROM item_tb WHERE NHICcode == '{}'".format(nhic_code)
df = pd.read_sql_query('SELECT * FROM item_tb WHERE NHICcode = ?', conn, params=(nhic_code, ))
# Bare expressions: they only display something in an interactive session and
# have no effect when the file runs as a script.
df.shape
df.columns
# Keep only the four columns named here.
df = df[['site_id', 'episode_id', 'value', 'time']]