# -*- coding: utf-8 -*-
"""MIMIC dataset handling with full dataset
"""
# Author: Yue Zhao <*****@*****.**>
# License: BSD 2 clause

import os
import sys
import pandas as pd
import json
from joblib import Parallel, delayed

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
# FIX: use the __file__ variable, not the string literal "__file__".
# os.path.dirname("__file__") always returns '' and therefore resolved
# the parent path against the current working directory rather than
# against this file's location.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from pyhealth.data.base_mimic import parallel_parse_tables
from pyhealth.utils.utility_parallel import unfold_parallel
from pyhealth.utils.utility_parallel import partition_estimators
from pyhealth.utils.utility import read_csv_to_df

import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

if __name__ == "__main__":
    n_jobs = 6  # number of parallel jobs
    duration = 21600  # time window (in seconds) for episode generation
    selection_method = 'last'

    # change this to your mimic full data location
    mimic_data_loc = 'D:\\mimic-iii-clinical-database-1.4'

    save_dir = os.path.join('outputs', 'mimic')
    # make saving directory if needed; exist_ok avoids the race between
    # an isdir() check and makedirs()
    os.makedirs(save_dir, exist_ok=True)
'icd9_dgns_cd_8', 'icd9_dgns_cd_9', 'icd9_dgns_cd_10', ] patient_data_loc = 'cms_patient_data.json' patient_list_loc = 'cms_patient_list.json' valid_data_list = [] # keep tracking the stored data valid_id_list = [] # keep tracking a list of patient IDs# valid_sequence_list = [] # read in tables patient_df = read_csv_to_df( os.path.join('data', 'cms-sample-1', 'DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv')) patient_id_list = patient_df['desynpuf_id'].tolist() # change the format of the date patient_df['dob'] = pd.to_datetime(patient_df['bene_birth_dt'], format='%Y%m%d') event_df = read_csv_to_df( os.path.join('data', 'cms-sample-1', 'DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv')) event_df['icd9_prcdr_cd_1'] = event_df['icd9_prcdr_cd_1'].astype('Int64') event_df['icd9_prcdr_cd_1'] = event_df['icd9_prcdr_cd_1'].astype(str) # change the format of the date event_df['clm_from_dt'] = pd.to_datetime(event_df['clm_from_dt'], format='%Y%m%d')
# -*- coding: utf-8 -*-
"""MIMIC dataset handling using parallelization on demo data
"""
# Author: Yue Zhao <*****@*****.**>
# License: BSD 2 clause

import os
import sys
import pandas as pd
import json
from joblib import Parallel, delayed

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
# FIX: use the __file__ variable, not the string literal "__file__".
# os.path.dirname("__file__") always returns '', so the parent path was
# resolved against the current working directory instead of this file's
# location.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from pyhealth.data.base_mimic import parallel_parse_tables
from pyhealth.utils.utility_parallel import unfold_parallel
from pyhealth.utils.utility_parallel import partition_estimators
from pyhealth.utils.utility import read_csv_to_df
from pyhealth.utils.utility import make_dirs_if_not_exists

import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

if __name__ == "__main__":
    n_jobs = 4  # number of parallel jobs
    duration = 21600  # time window (in seconds) for episode generation
    selection_method = 'last'

    save_dir = os.path.join('outputs', 'mimic_demo', 'raw')
    make_dirs_if_not_exists(save_dir)

    patient_data_loc = os.path.join(save_dir, 'patient_data_demo.json')

    valid_data_list = []  # keep tracking the stored data
# -*- coding: utf-8 -*-
"""MIMIC dataset handling with single thread on demo data
"""
# Author: Yue Zhao <*****@*****.**>
# License: BSD 2 clause

import os
import sys
import time
import pandas as pd
from tqdm import tqdm
import json

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
# FIX: use the __file__ variable, not the string literal "__file__".
# os.path.dirname("__file__") is always '' and therefore resolved the
# parent path against the current working directory instead of this
# file's location.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from pyhealth.data.base_mimic import MIMIC_Data
from pyhealth.utils.utility import read_csv_to_df
from pyhealth.utils.utility import make_dirs_if_not_exists

import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

if __name__ == "__main__":
    # NOTE(review): n_jobs is kept for consistency with the parallel
    # variant of this script; the single-thread flow does not use it.
    n_jobs = 4  # number of parallel jobs
    duration = 21600  # time window (in seconds) for episode generation
    selection_method = 'last'

    save_dir = os.path.join('outputs', 'mimic_demo', 'raw')
    make_dirs_if_not_exists(save_dir)

    patient_data_loc = os.path.join(save_dir, 'patient_data_demo.json')

    valid_data_list = []  # keep tracking the stored data
    valid_id_list = []  # keep tracking a list of patient IDs
def test_01_flow(self):
    """End-to-end flow on the demo MIMIC-III data: parse the tables in
    parallel, build per-patient episode data, and dump it to JSON at
    ``self.patient_data_loc``.
    """
    n_jobs = 2  # number of parallel jobs
    n_samples = 10  # number of patients to process
    duration = 21600  # time window (in seconds) for episode generation
    selection_method = 'last'

    # make saving directory if needed
    if not os.path.isdir(self.save_dir):
        os.makedirs(self.save_dir)

    patient_data_loc = 'patient_data.json'
    patient_list_loc = 'patient_list.json'

    valid_data_list = []  # keep tracking the stored data
    valid_id_list = []  # keep tracking a list of patient IDs

    # key variables to track in the episode
    var_list = [
        'Capillary refill rate',
        'Diastolic blood pressure',
        'Fraction inspired oxygen',
        'Glascow coma scale eye opening',
        'Glascow coma scale motor response',
        'Glascow coma scale total',
        'Glascow coma scale verbal response',
        'Glucose',
        'Heart Rate',
        'Height',
        'Mean blood pressure',
        'Oxygen saturation',
        'Respiratory rate',
        'Systolic blood pressure',
        'Temperature',
        'Weight',
        'pH',
    ]
    # enforce and convert to lower case
    var_list = [item.lower() for item in var_list]

    # removed triplicated debug print of os.getcwd() and the trailing
    # dump of the full patient data list

    # map raw MIMIC item ids to the curated variable names above
    event_mapping_df = read_csv_to_df(
        os.path.join('examples', 'data_generation', 'resources',
                     'itemid_to_variable_map.csv'))
    event_mapping_df['level2'] = event_mapping_df['level2'].str.lower()
    key_df = event_mapping_df[event_mapping_df['level2'].isin(var_list)]
    key_items = key_df['itemid'].tolist()

    #################################################################
    # read in tables
    patient_df = read_csv_to_df(
        os.path.join('examples', 'data_generation', 'data',
                     'mimic-iii-clinical-database-demo-1.4',
                     'PATIENTS.csv'))
    patient_id_list = patient_df['subject_id'].tolist()

    admission_df = read_csv_to_df(
        os.path.join('examples', 'data_generation', 'data',
                     'mimic-iii-clinical-database-demo-1.4',
                     'ADMISSIONS.csv'))

    icu_df = read_csv_to_df(
        os.path.join('examples', 'data_generation', 'data',
                     'mimic-iii-clinical-database-demo-1.4',
                     'ICUSTAYS.csv'))

    # because MIMIC's header is in upper case; however, demo code does not
    # events_vars = [item.upper() for item in events_vars]
    events_vars = [
        'subject_id',
        'hadm_id',
        'icustay_id',
        'itemid',
        'charttime',
        'value',
        'valueuom',
    ]

    # define datatype to reduce the memory cost
    dtypes_dict = {
        'subject_id': 'int32',
        'hadm_id': 'int32',
        'icustay_id': 'object',
        'itemid': 'int32',
        'charttime': 'object',
        'value': 'object',
        'valueuom': 'object',
    }

    event_df = read_csv_to_df(
        os.path.join('examples', 'data_generation', 'data',
                     'mimic-iii-clinical-database-demo-1.4',
                     'CHARTEVENTS.csv'),
        usecols=events_vars,
        dtype=dtypes_dict,
        low_memory=True)
    # only keep the events we are interested in
    event_df = event_df[event_df['itemid'].isin(key_items)]

    oevent_df = read_csv_to_df(
        os.path.join('examples', 'data_generation', 'data',
                     'mimic-iii-clinical-database-demo-1.4',
                     'OUTPUTEVENTS.csv'),
        usecols=events_vars,
        dtype=dtypes_dict,
        low_memory=True)
    # only keep the events we are interested in
    oevent_df = oevent_df[oevent_df['itemid'].isin(key_items)]

    event_df = pd.concat([event_df, oevent_df])
    event_df['charttime'] = pd.to_datetime(event_df['charttime'])

    # Start data processing: split the first n_samples patients across
    # the workers (partition_estimators may also adjust n_jobs)
    n_patients_list, starts, n_jobs = partition_estimators(
        n_samples, n_jobs=n_jobs)

    all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
        delayed(parallel_parse_tables)(
            patient_id_list=patient_id_list[starts[i]:starts[i + 1]],
            patient_df=patient_df,
            admission_df=admission_df,
            icu_df=icu_df,
            event_df=event_df,
            event_mapping_df=event_mapping_df,
            duration=duration,
            selection_method=selection_method,
            var_list=var_list,
            save_dir=self.save_dir)
        for i in range(n_jobs))

    # transpose [(data, ids), ...] -> ([data...], [ids...]) and flatten
    all_results = list(map(list, zip(*all_results)))
    valid_data_list = unfold_parallel(all_results[0], n_jobs)
    valid_id_list = unfold_parallel(all_results[1], n_jobs)

    patient_data_list = [p.data for p in valid_data_list]

    with open(self.patient_data_loc, 'w') as outfile:
        json.dump(patient_data_list, outfile)
# key variables to track in the episode var_list = [ 'Capillary refill rate', 'Diastolic blood pressure', 'Fraction inspired oxygen', 'Glascow coma scale eye opening', 'Glascow coma scale motor response', 'Glascow coma scale total', 'Glascow coma scale verbal response', 'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure', 'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure', 'Temperature', 'Weight', 'pH' ] # enforce and convert to lower case var_list = [item.lower() for item in var_list] event_mapping_df = read_csv_to_df( os.path.join('resources', 'itemid_to_variable_map.csv')) event_mapping_df['level2'] = event_mapping_df['level2'].str.lower() key_df = event_mapping_df[event_mapping_df['level2'].isin(var_list)] key_items = key_df['itemid'].tolist() ################################################################# # read in tables patient_df = read_csv_to_df(os.path.join(mimic_data_loc, 'PATIENTS.csv.gz')) patient_id_list = patient_df['subject_id'].tolist() admission_df = read_csv_to_df( os.path.join(mimic_data_loc, 'ADMISSIONS.csv.gz')) icu_df = read_csv_to_df(os.path.join(mimic_data_loc, 'ICUSTAYS.csv.gz'))