def clean_dgfa():
    """Load CASEDDGFA.csv and clean it.

    Drops ID/text columns that are not used downstream, then nulls any
    value outside the {'0', '1'} code set in the binary flag columns.

    Returns:
        pd.DataFrame: cleaned DGFA table.
    """
    fn = 'CASEDDGFA.csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_dgfa = pd.read_csv(read_file_path, encoding='utf8')
    # Single drop call (was two separate drops of the same kind).
    df_dgfa = df_dgfa.drop([
        'IPROTOCOL_ID',
        'HDMT_ID', 'PCVAMT_ID', 'POMT_ID', 'UA_ID', 'UAMT_ID', 'URMT_ID',
        'SMC_NM', 'SMY_NM', 'SMCP_ID', 'PTIAMT_ID', 'HCY_NM', 'HCMT_ID',
        'HTY_NM', 'HTMT_ID', 'DMY_NM', 'DMMT_ID', 'PADMT_ID', 'CA_TX',
        'OT_ID', 'OT_TX', 'THISHC_ID', 'THISHY_ID', 'THISDI_ID', 'IGUID_FT'
    ], axis=1)
    # All of these are binary-coded flags; anything outside {'0', '1'}
    # becomes missing.  (Was 16 copy-pasted statements.)
    binary_cols = [
        'HD_ID', 'PCVA_ID', 'PCVACI_ID', 'PCVACH_ID', 'PO_ID', 'UR_ID',
        'SM_ID', 'PTIA_ID', 'HC_ID', 'HCHT_ID', 'HCHC_ID', 'HT_ID',
        'DM_ID', 'PAD_ID', 'AL_ID', 'CA_ID'
    ]
    for col in binary_cols:
        df_dgfa.loc[out_of_range(df_dgfa[col], ['0', '1']), col] = np.nan
    return df_dgfa
def clean_dbmrs():
    """Load the denormalized CASEDBMRS table and null out-of-range codes.

    Each Barthel-index item has a fixed set of valid scores; any other
    value is treated as missing.

    Returns:
        pd.DataFrame: cleaned BMRS table.
    """
    fn = 'CASEDBMRS(denormalized).csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_dbmrs = pd.read_csv(read_file_path, encoding='utf8')
    # Column -> valid code set (was 11 copy-pasted statements).
    valid_codes = {
        'Feeding': ['0', '5', '10'],
        'Transfers': ['0', '5', '10', '15'],
        'Bathing': ['0', '5'],
        'Toilet_use': ['0', '5', '10'],
        'Grooming': ['0', '5'],
        'Mobility': ['0', '5', '10', '15'],
        'Stairs': ['0', '5', '10'],
        'Dressing': ['0', '5', '10'],
        'Bowel_control': ['0', '5', '10'],
        'Bladder_control': ['0', '5', '10'],
        'discharged_mrs': ['0', '1', '2', '3', '4', '5', '6'],
    }
    for col, valid in valid_codes.items():
        df_dbmrs.loc[out_of_range(df_dbmrs[col], valid), col] = np.nan
    return df_dbmrs
def de_casedfahi():
    """Denormalize CASEDFAHI: one output row per (ICASE_ID, IDCASE_ID).

    Each input row carries one family-history disease code (FAHIID_ID);
    the matching FH_* column is filled from PARENTS_CD/BRSI_CD via
    get_hist_value().

    NOTE(review): another de_casedfahi definition appears in this file;
    in Python the later definition shadows the earlier — confirm which
    one is meant to be active.
    """
    patients_dic = {}
    title = ['ICASE_ID', 'IDCASE_ID', 'FH_HBP', 'FH_DB', 'FH_HD', 'FH_ST']
    diseace_code = {'1': 'FH_HBP', '2': 'FH_DB', '3': 'FH_HD', '4': 'FH_ST'}
    read_file_path = gu.get_file_path('CASEDFAHI.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            # Create the patient's blank record on first sight, then fill
            # the column for this row's disease code (setdefault replaces
            # the duplicated if/else init-and-assign logic).
            p_dic = patients_dic.setdefault(combind_id, {
                'ICASE_ID': row['ICASE_ID'],
                'IDCASE_ID': row['IDCASE_ID'],
                'FH_HBP': '', 'FH_DB': '', 'FH_HD': '', 'FH_ST': ''
            })
            # An unknown FAHIID_ID yields key None (stored but never
            # saved, since None is not in `title`) — same as original.
            key = diseace_code.get(row['FAHIID_ID'])
            p_dic[key] = get_hist_value(row['PARENTS_CD'], row['BRSI_CD'])
    gu.save_array_to_csv('CASEDFAHI(denormalized)', title, patients_dic,
                         under_raw=True)
def de_casedbmrs():
    """Denormalize CASEDBMRS: one output row per (ICASE_ID, IDCASE_ID).

    BID_NM identifies the Barthel-index item (codes '1.00'..'11.00');
    its value BOTV_NM is stored in the matching column.
    """
    patients_dic = {}
    bid_code = {
        '1.00': 'Feeding', '2.00': 'Transfers', '3.00': 'Bathing',
        '4.00': 'Toilet_use', '5.00': 'Grooming', '6.00': 'Mobility',
        '7.00': 'Stairs', '8.00': 'Dressing', '9.00': 'Bowel_control',
        '10.00': 'Bladder_control', '11.00': 'discharged_mrs'
    }
    # Output columns follow the bid_code insertion order.
    title = ['ICASE_ID', 'IDCASE_ID'] + list(bid_code.values())
    read_file_path = gu.get_file_path('CASEDBMRS.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            # setdefault replaces the duplicated if/else init-and-assign
            # logic of the original.
            p_dic = patients_dic.setdefault(combind_id, {
                'ICASE_ID': row['ICASE_ID'],
                'IDCASE_ID': row['IDCASE_ID'],
                **{col: '' for col in bid_code.values()}
            })
            p_dic[bid_code.get(row['BID_NM'])] = row['BOTV_NM']
    gu.save_array_to_csv('CASEDBMRS(denormalized)', title, patients_dic,
                         under_raw=True)
def clean_mcase():
    """Load CASEMCASE.csv and clean the demographic columns.

    Drops identifying columns and recodes GENDER_TX to '0' (F) / '1' (M);
    any remaining non-binary value becomes NaN.

    Returns:
        pd.DataFrame: cleaned MCASE table.
    """
    fn = 'CASEMCASE.csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_mcase = pd.read_csv(read_file_path, encoding='utf8')
    df_mcase = df_mcase.drop(['IPROTOCOL_ID', 'CNAME_TX', 'CID_ID'], axis=1)
    df_mcase['GENDER_TX'] = df_mcase['GENDER_TX'].replace({'F': '0', 'M': '1'})
    # np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling.
    df_mcase['GENDER_TX'] = df_mcase['GENDER_TX'].replace(
        to_replace=r"[^0-1]", value=np.nan, regex=True)
    return df_mcase
def clean_ctmr():
    """Load the denormalized CASEDCTMR table and recode N/Y flags to 0/1.

    Returns:
        pd.DataFrame: CTMR table with the flag columns recoded.
    """
    read_file_path = gu.get_file_path('CASEDCTMR(denormalized).csv',
                                      under_raw=True)
    df_ctmr = pd.read_csv(read_file_path, encoding='utf8')
    # Everything after the two leading ID columns is an N/Y flag.
    df_ctmr.iloc[:, 2:] = df_ctmr.iloc[:, 2:].replace({'N': '0', 'Y': '1'})
    return df_ctmr
def de_casedrfur():
    """Denormalize CASEDRFUR follow-ups into one row per patient.

    RFUR_NM is the follow-up month (1/3/6/12); VERS_FL, VEIHD_FL and
    MRS_TX are spread into month-suffixed columns.

    NOTE(review): another de_casedrfur definition appears in this file;
    in Python the later definition shadows the earlier — confirm which
    one is meant to be active.
    """
    patients_dic = {}
    months = ('1', '3', '6', '12')
    title = (['ICASE_ID', 'IDCASE_ID']
             + ['VERS_' + mth for mth in months]
             + ['VEIHD_' + mth for mth in months]
             + ['MRS_' + mth for mth in months])
    read_file_path = gu.get_file_path('CASEDRFUR.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            rfur_nm = row['RFUR_NM']
            # setdefault replaces the duplicated if/else init-and-assign
            # logic of the original.
            p_dic = patients_dic.setdefault(combind_id, {
                'ICASE_ID': row['ICASE_ID'],
                'IDCASE_ID': row['IDCASE_ID'],
                **{col: '' for col in title[2:]}
            })
            p_dic['VERS_' + rfur_nm] = row['VERS_FL']
            p_dic['VEIHD_' + rfur_nm] = row['VEIHD_FL']
            p_dic['MRS_' + rfur_nm] = row['MRS_TX']
    gu.save_array_to_csv('CASEDRFUR(denormalized)', title, patients_dic,
                         under_raw=True)
def clean_rfur():
    """Load the denormalized CASEDRFUR table and normalize codes.

    N/Y flags become '0'/'1'; any mRS value outside 0-6 becomes NaN.

    Returns:
        pd.DataFrame: cleaned follow-up table.
    """
    fn = 'CASEDRFUR(denormalized).csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_rfur = pd.read_csv(read_file_path, encoding='utf8')
    rfur_cols = [
        'VERS_1', 'VERS_3', 'VERS_6', 'VERS_12',
        'VEIHD_1', 'VEIHD_3', 'VEIHD_6', 'VEIHD_12'
    ]
    df_rfur[rfur_cols] = df_rfur[rfur_cols].replace({'N': '0', 'Y': '1'})
    # Same validity check for each follow-up month (was 4 copy-pasted
    # statements).
    mrs_valid = ['0', '1', '2', '3', '4', '5', '6']
    for col in ('MRS_1', 'MRS_3', 'MRS_6', 'MRS_12'):
        df_rfur.loc[out_of_range(df_rfur[col], mrs_valid), col] = np.nan
    return df_rfur
def clean_fahi():
    """Load the denormalized CASEDFAHI table (no cleaning applied yet).

    Returns:
        pd.DataFrame: the family-history table as read from disk.
    """
    read_file_path = gu.get_file_path('CASEDFAHI(denormalized).csv',
                                      under_raw=True)
    return pd.read_csv(read_file_path, encoding='utf8')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler

from tools import genral_utils as gu
# NOTE: plot_utils was previously imported twice; the duplicate was removed.
from visualization import plot_utils as pu

if __name__ == '__main__':
    # -- Load Data
    df = pd.read_csv(gu.get_file_path('NIH_data.csv', under_raw=False),
                     encoding='utf8')
    n = 'Discharge: NIHSS Total'
    b = 'Discharge: Barthel Scale Total'
    m = 'Discharge: Rankin Score'
    # Keep only the three outcome columns, drop deaths (mRS == 6) and
    # incomplete rows.
    df_nbm = df[[n, b, m]]
    df_nbm = df_nbm[df_nbm[m] != 6]
    df_nbm = df_nbm.dropna()
    # -- Plot (alternative plots kept for reference)
    # df_nbm[[m, b]].boxplot(column=[b], by=m)
    # fig = plt.figure(figsize=(15, 5))
    # pu.bubble_plot(df_nbm[[m, b]], [m, b])
    # pu.violin_plot(df_nbm[[m, n]])
    # df_nbm[[n, b]].boxplot(column=[b], by=n)
    pu.scatt_plot(df_nbm)
    plt.show()
def de_casedfahi():
    """Denormalize CASEDFAHI: one output row per (ICASE_ID, IDCASE_ID).

    Each input row carries one family-history disease code (FAHIID_ID,
    read as e.g. '1.00' and normalized to '1'); PARENTS_CD and BRSI_CD
    go to the matching FAHIID_PARENTS_n / FAHIID_BRSI_n columns.

    The original implementation read the CSV twice (one pass for the
    PARENTS columns, one for the BRSI columns) and merged the two dicts;
    a single pass over the file produces the same merged records.
    """
    parents_code = {
        '1': 'FAHIID_PARENTS_1', '2': 'FAHIID_PARENTS_2',
        '3': 'FAHIID_PARENTS_3', '4': 'FAHIID_PARENTS_4'
    }
    brsi_code = {
        '1': 'FAHIID_BRSI_1', '2': 'FAHIID_BRSI_2',
        '3': 'FAHIID_BRSI_3', '4': 'FAHIID_BRSI_4'
    }
    title = (['ICASE_ID', 'IDCASE_ID']
             + list(parents_code.values())
             + list(brsi_code.values()))
    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDFAHI.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8',
              errors='ignore') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            # FAHIID_ID arrives as e.g. '1.00'; normalize to '1'.
            fahiid_id = str(int(float(row['FAHIID_ID'])))
            p_dic = patients_dic.setdefault(combind_id, {
                'ICASE_ID': row['ICASE_ID'],
                'IDCASE_ID': row['IDCASE_ID'],
                **{col: '' for col in title[2:]}
            })
            # .get keeps the original tolerance for unknown ids: the value
            # is stored under key None, which is never written out since
            # None is not in `title`.
            p_dic[parents_code.get(fahiid_id)] = row['PARENTS_CD']
            p_dic[brsi_code.get(fahiid_id)] = row['BRSI_CD']
    gu.save_array_to_csv('CASEDFAHI(denormalized)', title, patients_dic,
                         under_raw=True)
def de_casedrfur():
    """Denormalize CASEDRFUR: one output row per (ICASE_ID, IDCASE_ID).

    Every per-visit field is spread into four month-suffixed columns
    (RFUR_NM in {1, 3, 6, 12}).  The original spelled all 25 fields x 4
    months out by hand; the field list plus a loop produces identical
    records and column order.
    """
    # Per-visit fields, in the output column order.
    fields = [
        'FSTATUS_ID', 'RFUR_DT', 'LOCATION_ID', 'TORG_ID', 'FLU_ID',
        'FLUORG_ID', 'FLUORG_TX', 'FLURESULT_TX', 'DEATH_DT', 'DEATH_ID',
        'DEATHSK_ID', 'DEATHO_TX', 'VE_ID', 'VERS_FL', 'VERSCICH_ID',
        'VERS_DT', 'VERSORG_ID', 'VEIHD_FL', 'VEIHD_ID', 'VEIHD_DT',
        'VEIHDORG_ID', 'MRS_TX', 'TORG_TX', 'VERSORG_TX', 'VEIHDORG_TX'
    ]
    months = ('1', '3', '6', '12')
    title = ['ICASE_ID', 'IDCASE_ID'] + [
        f + '_' + mth for mth in months for f in fields
    ]
    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDRFUR.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8',
              errors='replace') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            # RFUR_NM arrives as e.g. '3.00'; normalize to '3'.
            rfur_nm = str(int(float(row['RFUR_NM'])))
            p_dic = patients_dic.setdefault(combind_id, {
                'ICASE_ID': row['ICASE_ID'],
                'IDCASE_ID': row['IDCASE_ID'],
                **{col: '' for col in title[2:]}
            })
            for f in fields:
                p_dic[f + '_' + rfur_nm] = row[f]
    gu.save_array_to_csv('CASEDRFUR(denormalized)', title, patients_dic,
                         under_raw=True)
# t-SNE visualization of the reduced-dimension dataset.
n_class = 2
if n_class == 2:
    id_data, x_data, y_data = genral_utils.get_poor_god(
        'wholeset_Jim_nomissing_validated.csv')
    fn = 'reduced_dimension_30_2c'
else:
    id_data, x_data, y_data = genral_utils.get_individual(
        'wholeset_Jim_nomissing_validated.csv')
    fn = 'reduced_dimension_30_individual'
# calculation (kept for reference — regenerates the cached CSV below)
# x_data_train = genral_utils.scale(x_data)
# t_sne = TSNE(n_components=2, perplexity=30).fit_transform(x_data_train)
# df = pd.DataFrame(t_sne, columns=['x', 'y'])
# df['p'] = y_data.values
# genral_utils.save_dataframe_to_csv(df, fn)
df = pd.read_csv(genral_utils.get_file_path(fn + '.csv', under_raw=False),
                 encoding='utf8')
plt.figure()
# DataFrame.ix was removed in pandas 1.0; use positional .iloc instead.
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=df.iloc[:, 2], s=0.1,
            cmap=plt.cm.get_cmap("jet", n_class))
plt.colorbar(ticks=range(n_class))
plt.title('t-SNE 2D visualization of Taiwan stoke registry data')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.savefig("t-sne.png", dpi=300)
plt.show()
def clean_case():
    """Load CASEDCASE.csv and clean it.

    Steps: drop unused columns, replace 'NULL' with NaN, null and
    mean-impute numeric outliers, null out-of-range coded values, then
    normalize all *_FL flag columns via replace_flg().

    Returns:
        pd.DataFrame: cleaned CASE table.
    """
    fn = 'CASEDCASE.csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_case = pd.read_csv(read_file_path, encoding='utf8')
    # Dropping unused columns (duplicate 'IPROTOCOL_ID' entry removed).
    df_case = df_case.drop([
        'IPROTOCOL_ID', 'ORG_ID', 'CSTATUS_ID', 'DCTYPE24_ID',
        'PATIENT_ID', 'INPUT_NM', 'AGE_NM', 'EDU_ID', 'PRO_ID', 'PROOT_TX',
        'ITOWN_ID', 'ADDR_TX', 'TELH_TX', 'TELP_TX', 'TELF_TX', 'FTITLE_TX',
        'CASEMEMO_TX', 'IH_FL', 'IH_DT', 'OH_DT', 'ONSETH_NM', 'ONSETM_NM',
        'ONSET_FL', 'OT_DT', 'OTTIH_NM', 'OTTIM_NM', 'OT_FL', 'FLOOK_DT',
        'FLOOKH_NM', 'FLOOKM_NM', 'FLOOK_FL', 'FCT_DT', 'FCTH_NM', 'FCTM_NM',
        'FCTOH_FL', 'IVTPATH_ID', 'IVTPATH_FL', 'IVTPAAH_FL', 'IVTPA_DT',
        'IVTPAH_NM', 'IVTPAM_NM', 'NIVTPA1_FL', 'NIVTPA2_FL', 'NIVTPA3_FL',
        'NIVTPA4_FL', 'NIVTPA5_FL', 'NIVTPA6_FL', 'NIVTPA7_FL', 'NIVTPA8_FL',
        'NIVTPA9_FL', 'NIVTPA10_FL', 'NIVTPA11_FL', 'NIVTPA99_FL',
        'NIVTPA99_TX', 'ICDO_TX', 'TOASTSCAT_TX', 'TOASTSO_FL', 'TOASTSO_TX',
        'CSAHO_TX', 'THD_ID', 'THDO_FL', 'THDOO_FL', 'THDOO_TX', 'TRM_ID',
        'TRMEN_ID', 'TRMOT_FL', 'TRMOT_TX', 'OM_ID', 'OM_FL', 'OMAND_ID',
        'OMLI_ID', 'OMLIOT_FL', 'OMLIOT_TX', 'OMLIOT2_FL', 'OMLIOT2_TX',
        'AM_FL', 'AMLIOT_FL', 'AMLIOT_TX', 'AMLIOT2_FL', 'AMLIOT2_TX',
        'COM_ID', 'COMO_TX', 'DET_ID', 'DETO_TX', 'DETO_FL', 'OFFD_DT',
        'OFFD_ID', 'OFFD_TX', 'OFFDTORG_ID', 'OFFDTORG_TX', 'OFFRE_DT',
        'NIHSIN_DT', 'NIHSINTI_TX', 'NIHSINH_NM', 'NIHSINM_NM', 'NIHSOT_DT',
        'NIHSOTTI_TX', 'NIHSOTH_NM', 'NIHSOTM_NM', 'BRS_DT', 'CT_DT',
        'CTTI_TX', 'CTH_NM', 'CTM_NM', 'CTO_TX', 'MRI_DT', 'MRITI_TX',
        'MRIH_NM', 'MRIM_NM', 'MRIO_TX', 'ECG_ID', 'ECGO_FL', 'ECGO_TX',
        'CREATE_DT', 'CREATESTAFF_ID', 'SYSUPD_DT', 'SYSUPDSTAFF_ID',
        'MODIFY_NM', 'IGUID_FT', 'DETHOH_FL', 'OMAD_FL', 'OMAD_ID'
    ], axis=1)
    # Replace NULL with NaN.
    # BUG FIX: DataFrame.replace is not in-place; the original discarded
    # the result, so 'NULL' strings survived into later steps.
    df_case = df_case.replace('NULL', np.nan)
    # Replace outliers with the column mean.
    # (Duplicate entries in the original list removed; OMWA_TX looks like
    # a text column in a numeric list — TODO confirm it belongs here.)
    outlier_cols = [
        'HEIGHT_NM', 'WEIGHT_NM', 'SBP_NM', 'DBP_NM', 'BT_NM', 'HR_NM',
        'RR_NM', 'HB_NM', 'HCT_NM', 'PLATELET_NM', 'WBC_NM', 'PTT1_NM',
        'PTT2_NM', 'PTINR_NM', 'ER_NM', 'BUN_NM', 'CRE_NM', 'ALB_NM',
        'CRP_NM', 'HBAC_NM', 'AC_NM', 'UA_NM', 'TCHO_NM', 'TG_NM',
        'HDL_NM', 'LDL_NM', 'GOT_NM', 'GPT_NM', 'OMWA_TX'
    ]
    # 999.9 is a sentinel for "not measured".
    df_case[outlier_cols] = df_case[outlier_cols].replace(999.9, np.nan)
    for col in outlier_cols:
        df_case.loc[outliers_iqr(df_case[col]), col] = np.nan
    df_case[outlier_cols] = df_case[outlier_cols].apply(pd.to_numeric,
                                                        errors='coerce')
    # NOTE(review): sklearn.preprocessing.Imputer was removed in
    # scikit-learn 0.22 (successor: sklearn.impute.SimpleImputer) —
    # confirm the pinned sklearn version.
    df_case[outlier_cols] = Imputer(missing_values=np.nan,
                                    strategy='mean',
                                    axis=0).fit_transform(
                                        df_case[outlier_cols])
    # Replace un-coded values with NaN: column -> valid code set
    # (was 25 copy-pasted statements).
    code_ranges = {
        'OPC_ID': ['1', '2', '3'],
        'GCSE_NM': ['1', '2', '3', '4', '5', '6'],
        'GCSV_NM': ['1', '2', '3', '4', '5', '6'],
        'GCSM_NM': ['1', '2', '3', '4', '5', '6'],
        'ICD_ID': ['1', '2', '3', '4', '99'],
        'ICDTIA_ID': ['1', '2'],
        'TOAST_ID': ['1', '2', '3', '4', '5'],
        'TOASTU_ID': ['1', '2', '3'],
        'CICH_ID': ['1', '2'],
        'CSAH_ID': ['1', '2', '3', '4'],
        'TRMOP_ID': ['1', '2', '3', '4', '5'],
        'OFF_ID': ['1', '2', '3'],
        'OFFDT_ID': ['1', '2', '3', '4', '5'],
        'CD_ID': ['0', '1', '2'],
        'CDR_ID': ['1', '2', '3', '4'],
        'CDL_ID': ['1', '2', '3', '4'],
        'TCCS_ID': ['0', '1'],
        'TCCSR_ID': ['1', '2', '3'],
        'TCCSL_ID': ['1', '2', '3'],
        'TCCSBA_ID': ['1', '2', '3'],
        'MCDR_ID': ['1', '2', '3'],
        'MCDL_ID': ['1', '2', '3'],
        'MCDBA_ID': ['1', '2', '3'],
        'MCDRI_ID': ['1', '2', '3'],
        'MCDLI_ID': ['1', '2', '3'],
    }
    for col, valid in code_ranges.items():
        df_case.loc[out_of_range(df_case[col], valid), col] = np.nan
    # df_case.loc[df_case['ICD_TX'].apply(not_icd), 'ICD_TX'] = np.nan
    # Flag column groups, normalized together via replace_flg().
    toas_cols = [
        'TOASTLE_FL', 'TOASTLI_FL', 'TOASTSCE_FL', 'TOASTSMO_FL',
        'TOASTSRA_FL', 'TOASTSDI_FL', 'TOASTSMI_FL', 'TOASTSANTIP_FL',
        'TOASTSAU_FL', 'TOASTSHY_FL', 'TOASTSPR_FL', 'TOASTSANTIT_FL',
        'TOASTSHO_FL', 'TOASTSHYS_FL', 'TOASTSCA_FL'
    ]
    thd_cols = [
        'THDA_FL', 'THDH_FL', 'THDI_FL', 'THDAM_FL', 'THDV_FL', 'THDE_FL',
        'THDM_FL', 'THDR_FL', 'THDP_FL'
    ]
    trm_cols = [
        'TRMAN_FL', 'TRMAS_FL', 'TRMTI_FL', 'TRMHE_FL', 'TRMWA_FL',
        'TRMIA_FL', 'TRMFO_FL', 'TRMTA_FL', 'TRMSD_FL', 'TRMRE_FL',
        'TRMEN_FL', 'TRMAG_FL', 'TRMCL_FL', 'TRMPL_FL', 'TRMLM_FL',
        'TRMIV_FL', 'TRMVE_FL', 'TRMNG_FL', 'TRMDY_FL', 'TRMICU_FL',
        'TRMSM_FL', 'TRMED_FL', 'TRMOP_FL'
    ]
    om_cols = [
        'OMAS_FL', 'OMAG_FL', 'OMTI_FL', 'OMCL_FL', 'OMWA_FL', 'OMPL_FL',
        'OMANH_FL', 'OMAND_FL', 'OMORA_FL', 'OMINS_FL', 'OMLI_FL',
        'OMST_FL', 'OMNS_FL'
    ]
    am_cols = [
        'AMAS_FL', 'AMAG_FL', 'AMTI_FL', 'AMCL_FL', 'AMWA_FL', 'AMPL_FL',
        'AMANH_FL', 'AMAND_FL', 'AMLI_FL'
    ]
    com_cols = [
        'COMPN_FL', 'COMUT_FL', 'COMUG_FL', 'COMPR_FL', 'COMPU_FL',
        'COMAC_FL', 'COMSE_FL', 'COMDE_FL', 'COMO_FL'
    ]
    det_cols = [
        'DETST_FL', 'DETHE_FL', 'DETHO_FL', 'DETHA_FL', 'DETVA_FL',
        'DETRE_FL', 'DETME_FL'
    ]
    cm_cols = ['CT_FL', 'MRI_FL']
    ecg_cols = ['ECGL_FL', 'ECGA_FL', 'ECGQ_FL']
    mcd_cold = ['MCD_ID', 'MRA_FL', 'CTA_FL', 'DSA_FL']
    all_cols = (toas_cols + thd_cols + trm_cols + om_cols + am_cols +
                com_cols + det_cols + cm_cols + ecg_cols + mcd_cold)
    df_case[all_cols] = replace_flg(df_case[all_cols], all_cols)
    return df_case
# plt.scatter(df_0.ix[:, 0], df_0.ix[:, 1], c='blue', s=0.1, label='Good')
# plt.scatter(df_1.ix[:, 0], df_1.ix[:, 1], c='red', s=0.1, label='Poor')
# plt.title('t-SNE 2D visualization of 90-day stroke mRS outcome')
# plt.rcParams["legend.markerscale"] = 10
# plt.legend()
# plt.xlabel('t-SNE 1')
# plt.ylabel('t-SNE 2')
# plt.savefig("t-sne.png", dpi=300)
# plt.show()

# BI v.s NIHSS: compute the two total scores from their item columns.
b = 'bi_total'
n = 'nihss_total'
m = 'discharged_mrs'

# TSR data
df_3m = pd.read_csv(gu.get_file_path('wholeset_Jim_nomissing.csv',
                                     under_raw=False),
                    encoding='utf8')
barthel_items = [
    'Feeding', 'Transfers', 'Bathing', 'Toilet_use', 'Grooming',
    'Mobility', 'Stairs', 'Dressing', 'Bowel_control', 'Bladder_control'
]
nihss_items = [
    'NIHS_1a_out', 'NIHS_1b_out', 'NIHS_1c_out', 'NIHS_2_out',
    'NIHS_3_out', 'NIHS_4_out', 'NIHS_5aL_out', 'NIHS_5bR_out',
    'NIHS_6aL_out', 'NIHS_6bR_out', 'NIHS_7_out', 'NIHS_8_out',
    'NIHS_9_out', 'NIHS_10_out', 'NIHS_11_out'
]
# Row-wise totals over the item columns.
df_3m[b] = pd.DataFrame(np.sum(df_3m[barthel_items], axis=1))
df_3m[n] = pd.DataFrame(np.sum(df_3m[nihss_items], axis=1))
def de_casednihs():
    """Denormalize CASEDNIHS.csv into one row per patient admission.

    The raw file has one row per NIHSS item measurement (identified by
    NID_NM); this pivots those rows so each (ICASE_ID, IDCASE_ID) pair
    becomes a single record with an admission (``*_in``, from NINV_NM) and
    discharge (``*_out``, from NOTV_NM) column per NIHSS item.

    Side effect: writes 'CASEDNIHS(denormalized)' under the raw data
    directory via ``gu.save_array_to_csv``. Returns None.
    """
    # Raw NID_NM code -> output column stem, in NIHSS item order.
    test_code = {
        '1.10': 'NIHS_1a', '1.20': 'NIHS_1b', '1.30': 'NIHS_1c',
        '2.00': 'NIHS_2', '3.00': 'NIHS_3', '4.00': 'NIHS_4',
        '5.10': 'NIHS_5aL', '5.20': 'NIHS_5bR',
        '6.10': 'NIHS_6aL', '6.20': 'NIHS_6bR',
        '7.00': 'NIHS_7', '8.00': 'NIHS_8', '9.00': 'NIHS_9',
        '10.00': 'NIHS_10', '11.00': 'NIHS_11'
    }
    # Output header: ids, then all *_in columns, then all *_out columns
    # (dict insertion order keeps the NIHSS item order above).
    title = (['ICASE_ID', 'IDCASE_ID']
             + [stem + '_in' for stem in test_code.values()]
             + [stem + '_out' for stem in test_code.values()])
    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDNIHS.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            if combind_id not in patients_dic:
                # First record for this admission: start all score columns
                # blank so missing items stay '' in the output CSV.
                p_dic = dict.fromkeys(title, '')
                p_dic['ICASE_ID'] = row['ICASE_ID']
                p_dic['IDCASE_ID'] = row['IDCASE_ID']
                patients_dic[combind_id] = p_dic
            key = test_code.get(row['NID_NM'])
            # NOTE(review): an unrecognized NID_NM leaves key=None and raises
            # TypeError on concatenation — identical to the original behavior.
            patients_dic[combind_id][key + '_in'] = row['NINV_NM']
            patients_dic[combind_id][key + '_out'] = row['NOTV_NM']
    gu.save_array_to_csv('CASEDNIHS(denormalized)', title, patients_dic,
                         under_raw=True)
def clean_nihs():
    """Load the denormalized NIHSS table and blank out-of-range scores.

    Each NIHSS item has a fixed set of valid score values; any cell outside
    that set — checked via the module-level ``out_of_range`` helper — is
    replaced with NaN, for both the admission (``*_in``) and discharge
    (``*_out``) column of the item.

    Returns:
        pandas.DataFrame: the cleaned NIHSS table.
    """
    # Valid score strings per NIHSS item stem; the same set applies to the
    # _in and _out columns of that item.
    valid_scores = {
        'NIHS_1a': ['0', '1', '2', '3'],
        'NIHS_1b': ['0', '1', '2'],
        'NIHS_1c': ['0', '1', '2'],
        'NIHS_2': ['0', '1', '2'],
        'NIHS_3': ['0', '1', '2', '3'],
        'NIHS_4': ['0', '1', '2', '3'],
        'NIHS_5aL': ['0', '1', '2', '3', '4'],
        'NIHS_5bR': ['0', '1', '2', '3', '4'],
        'NIHS_6aL': ['0', '1', '2', '3', '4'],
        'NIHS_6bR': ['0', '1', '2', '3', '4'],
        'NIHS_7': ['0', '1', '2'],
        'NIHS_8': ['0', '1', '2'],
        'NIHS_9': ['0', '1', '2', '3'],
        'NIHS_10': ['0', '1', '2'],
        'NIHS_11': ['0', '1', '2'],
    }
    fn = 'CASEDNIHS(denormalized).csv'
    read_file_path = gu.get_file_path(fn, under_raw=True)
    df_nihs = pd.read_csv(read_file_path, encoding='utf8')
    for stem, valid in valid_scores.items():
        for suffix in ('_in', '_out'):
            col = stem + suffix
            df_nihs.loc[out_of_range(df_nihs[col], valid), col] = np.nan
    return df_nihs
def de_casedctmr():
    """Denormalize CASEDCTMR.csv into one row per patient admission.

    The raw file has one row per brain-region imaging finding (identified
    by CTMRIID_NM); this pivots those rows so each (ICASE_ID, IDCASE_ID)
    pair becomes one record holding the CT right/left and MRI right/left
    flags of every region. Region code '11' ('Old_stroke') uses the
    ``_ctci/_ctch/_mrici/_mrich`` suffixes instead of the plain
    right/left suffixes, exactly as in the raw-schema convention.

    Side effect: writes 'CASEDCTMR(denormalized)' under the raw data
    directory via ``gu.save_array_to_csv``. Returns None.
    """
    # Raw CTMRIID_NM code -> output column stem, in raw-schema order.
    cm_code = {
        '1': 'cortical_ACA', '2': 'cortical_MCA', '3': 'subcortical_ACA',
        '4': 'subcortical_MCA', '5': 'PCA_cortex', '6': 'thalamus',
        '7': 'brainstem', '8': 'cerebellum', '9': 'Watershed',
        '10': 'Hemorrhagic_infarct', '11': 'Old_stroke'
    }

    def suffixes(code):
        # One suffix per raw flag column, in the order
        # (CTRIGHT_FL, CTLEFT_FL, MRIRIGHT_FL, MRILEFT_FL).
        # 'Old_stroke' (code '11') carries its own suffix set.
        if code == '11':
            return ('_ctci', '_ctch', '_mrici', '_mrich')
        return ('_ctr', '_ctl', '_mrir', '_mril')

    # Output header: ids, then one group of columns per flag position,
    # each group covering all regions in cm_code order — this reproduces
    # the original hand-written column order exactly.
    title = ['ICASE_ID', 'IDCASE_ID']
    for group in range(4):
        title += [stem + suffixes(code)[group]
                  for code, stem in cm_code.items()]

    patients_dic = {}
    read_file_path = gu.get_file_path('CASEDCTMR.csv', under_raw=True)
    with open(read_file_path, 'r', encoding='utf8') as csvfile:
        for row in csv.DictReader(csvfile):
            combind_id = row['ICASE_ID'] + row['IDCASE_ID']
            if combind_id not in patients_dic:
                # First record for this admission: all flag columns blank.
                p_dic = dict.fromkeys(title, '')
                p_dic['ICASE_ID'] = row['ICASE_ID']
                p_dic['IDCASE_ID'] = row['IDCASE_ID']
                patients_dic[combind_id] = p_dic
            code = row['CTMRIID_NM']
            key = cm_code.get(code)
            # NOTE(review): an unrecognized CTMRIID_NM leaves key=None and
            # raises on concatenation — identical to the original behavior.
            values = (row['CTRIGHT_FL'], row['CTLEFT_FL'],
                      row['MRIRIGHT_FL'], row['MRILEFT_FL'])
            for suffix, value in zip(suffixes(code), values):
                patients_dic[combind_id][key + suffix] = value
    gu.save_array_to_csv('CASEDCTMR(denormalized)', title, patients_dic,
                         under_raw=True)
# print(df_fahi.shape) # print(df_nihs.shape) # print(df_rfur.shape) # df_joined = reduce(lambda left, right: pd.merge(left, right, how='outer', on=['ICASE_ID', 'IDCASE_ID']), dfs) # print(df_joined.shape) # ===================== convert feature # df_withMissing = clnUtil.convert_features(df_joined) # gu.save_dataframe_to_csv(df_withMissing, 'TSR_2018_withMissing') # ######################################################## 3-month mRS############################################# # ===================== Remove high missing features # df_withMissing = pd.read_csv(gu.get_file_path('TSR_2018_withMissing.csv', under_raw=False), encoding='utf8') # df_remove_hing_missing_columns = nomissUtil.remove_missing_intensive_features(df_withMissing) # nomissUtil.plot_missing(df_remove_hing_missing_columns) # ===================== only 3-month followup # df_3m = df_remove_hing_missing_columns.drop(['VERS_3', 'VERS_6', 'VERS_12', 'VEIHD_3', 'VEIHD_6', 'VEIHD_12', 'MRS_6', 'MRS_12'], axis=1) # ===================== Remove NaN observations # df_3m.dropna(inplace=True) # ===================== Remove dead cases # df_3m.drop(df_3m[df_3m.OFF_ID == 2.].index, inplace=True) # ===================== Make dummy variables # df_3m = clnUtil.make_dummy(df_3m) # gu.save_dataframe_to_csv(df_3m, 'TSR_2018_3m_noMissing') # ===================== validated mRS df_3m = pd.read_csv(gu.get_file_path('TSR_2018_3m_noMissing.csv', under_raw=False), encoding='utf8') df_3m_validated = mv.mRS_validate(df_3m) gu.save_dataframe_to_csv(df_3m_validated, 'TSR_2018_3m_noMissing_validated') print("Done")