# Keep only the rows whose `duration_mean` is greater than one; the helper
# column is discarded as soon as the filter has been applied.
df = df[df['duration_mean'] > 1]
df.drop(columns=['duration_mean'], inplace=True)

# Snapshot taken before the diagnosis columns are touched: the total of
# `val` and the distinct raw values found in `dx_1`.
minus_day_cases = df['val'].sum()
start_icds = df['dx_1'].unique()

# Every column whose name starts with "dx_" is treated as a diagnosis column
# and passed through the shared sanitizer.
diagnosis_feats = df.columns[df.columns.str.startswith('dx_')]
for feat in diagnosis_feats:
    df[feat] = hosp_prep.sanitize_diagnoses(df[feat])

if len(diagnosis_feats) == 1:
    # A lone diagnosis column simply becomes the cause code, tagged with
    # diagnosis_id 1.
    df.rename(columns={'dx_1': 'cause_code'}, inplace=True)
    df['diagnosis_id'] = 1
elif len(diagnosis_feats) > 1:
    # Several diagnosis columns: hand off to hosp_prep.stack_merger
    # (presumably a wide-to-long reshape -- confirm in hosp_prep.py).
    df = hosp_prep.stack_merger(df)
else:
    # NOTE(review): this branch only prints; execution continues with the
    # frame unchanged.
    print("Something went wrong, there are no ICD code features")

# Run the raw `dx_1` snapshot through the same sanitizer and keep the
# result as a set.
start_icds = set(
    hosp_prep.sanitize_diagnoses(pd.Series(start_icds))
)
errors='coerce') df = df[df.date_adm.notnull()] df = df[df.date_dis.notnull()] df['days_diff'] = df.date_dis - df.date_adm df = df[df.days_diff >= pd.to_timedelta(0, unit="D")] df = df[(df.days_diff > pd.to_timedelta(0, unit="D")) | (df.outcome_id == "death")] diagnosis_feats = df.columns[df.columns.str.startswith('dx_')] for feat in diagnosis_feats: df[feat] = hosp_prep.sanitize_diagnoses(df[feat]) if len(diagnosis_feats) > 1: df = hosp_prep.stack_merger(df) elif len(diagnosis_feats) == 1: df.rename(columns={'dx_1': 'cause_code'}, inplace=True) df['diagnosis_id'] = 1 else: print("Something went wrong, there are no ICD code features") df['val'] = 1 print("Are there missing values in any row?\n")
if len(diagnosis_feats) > 1: # Reshape diagnoses from wide to long # - review `hosp_prep.py` for additional documentation df = hosp_prep.stack_merger(df) df.drop('patient_index', axis=1, inplace=True) elif len(diagnosis_feats) == 1: df.rename(columns={'dx_1': 'cause_code'}, inplace=True) df['diagnosis_id'] = 1 else: print("Something went wrong, there are no ICD code features") # 2014 data needs to have the diagnoses cleaned b/c they're good ICD codes. df.loc[df.source == "GEO_COL_14", 'cause_code'] =\ hosp_prep.sanitize_diagnoses(df.loc[df.source == "GEO_COL_14", 'cause_code']) ##################################################### # GROUPBY AND AGGREGATE ##################################################### # Check for missing values print("Are there missing values in any row?") null_condition = df.isnull().values.any() if null_condition: warnings.warn(">> Yes. ROWS WITH ANY NULL VALUES WILL BE LOST ENTIRELY") else: print(">> No.") group_vars = [ 'cause_code', 'diagnosis_id', 'sex_id', 'age_start', 'age_end',