# Shared imports assumed by the function excerpts below; module-level
# constants (e.g. MAXIMUM_TIME_TO_PRESSOR_DAYS, CHUNK_SIZE, N_WINDOWS)
# are defined elsewhere in the repo.
import math
import os
import pickle

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.neighbors import KernelDensity

import directories
import feature_definitions
from directories import engine, print_log


def load_APACHE(icustays):
    path = os.path.join(directories.mimic_dir, "apache.csv")
    dtypes = {
        "icustay_id": pd.Int32Dtype(),
        "APSIII": pd.Int16Dtype(),
    }
    apache = pd.read_csv(path, dtype=dtypes)
    apache.rename(inplace=True,
                  columns={
                      "icustay_id": "ICUSTAY_ID",
                      "APSIII": "APACHE"
                  })

    # print number of icustays dropped from missing APACHE
    length_before_drop = len(apache)
    apache.dropna(axis=0, inplace=True)  # 34 na icustays
    length_after_drop = len(apache)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/o APACHE")

    # interpret APACHE score as int
    apache.ICUSTAY_ID = apache.ICUSTAY_ID.astype(int)
    apache.APACHE = apache.APACHE.astype(int)

    icustays = icustays.join(apache.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays

def load_LODS(icustays):
    path = os.path.join(directories.mimic_dir, "lods.csv")
    dtypes = {
        "icustay_id": pd.Int32Dtype(),
        "LODS": pd.Int16Dtype(),
        "pulmonary": pd.Int16Dtype(),
        "cardiovascular": pd.Int16Dtype(),
    }
    lods = pd.read_csv(path, dtype=dtypes)
    lods.rename(inplace=True,
                columns={
                    "icustay_id": "ICUSTAY_ID",
                    "pulmonary": "PULMONARY_LODS",
                    "cardiovascular": "CARDIOVASCULAR_LODS"
                })

    # print number of icustays dropped from missing LODS
    length_before_drop = len(lods)
    lods.dropna(axis=0, inplace=True)  # 2482 na icustays
    length_after_drop = len(lods)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/o LODS")

    # interpret LODS scores as ints
    lods.ICUSTAY_ID = lods.ICUSTAY_ID.astype(int)
    lods.LODS = lods.LODS.astype(int)
    lods.PULMONARY_LODS = lods.PULMONARY_LODS.astype(int)
    lods.CARDIOVASCULAR_LODS = lods.CARDIOVASCULAR_LODS.astype(int)

    icustays = icustays.join(lods.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays

def extract_test_inputs(cleaned_data):
    with open(os.path.join(directories.model_dir, "test_subjects.p"),
              "rb") as file:
        test_subjects = pickle.load(file).values
    test_subjects = test_subjects.reshape(-1)

    test_inputs = cleaned_data[cleaned_data.SUBJECT_ID.isin(test_subjects)]
    test_inputs = test_inputs[
        ["LABEL", "SUBJECT_ID", "EPISODE", "EPOCH", "HADM_ID"] +
        feature_definitions.features_to_keep]

    ratio = 100. * sum(test_inputs.LABEL.values) / len(test_inputs)
    directories.print_log("\t\t", len(test_inputs),
                          "test samples before dropping na,",
                          "(%2.0f%%" % ratio, "are positive)")
    test_inputs = test_inputs.dropna()
    ratio = 100. * sum(test_inputs.LABEL.values) / len(test_inputs)
    directories.print_log("\t\t", len(test_inputs),
                          "test samples after dropping na,",
                          "(%2.0f%%" % ratio, "are positive)")

    print("\t%2.0f%% of test subjects will need pressors" %
          (100 * sum(test_inputs.EPISODE == 1) / len(test_inputs)))
    print("\t%2.0f%% of test subjects won't need pressors at all" %
          (100 * sum(test_inputs.EPISODE == 0) / len(test_inputs)))

    with open(os.path.join(directories.model_dir, "test_features.p"),
              "wb") as file:
        pickle.dump(test_inputs, file)

def main():
    directories.print_log("preprocessing data")
    directories.print_log("\tpreparing training set")
    preprocess_training_inputs()
    directories.print_log("\tpreparing test set")
    preprocess_test_inputs()
    directories.print_log("done preprocessing data!")
    directories.print_log()

def compute_hours_to_pressor(vaso_episodes):
    vaso_episodes["EPISODE_START_POST_TRANSFER"] = (
        vaso_episodes.STARTTIME - vaso_episodes.ADMITTIME)
    rows_to_remove = vaso_episodes.EPISODE_START_POST_TRANSFER > pd.Timedelta(
        days=MAXIMUM_TIME_TO_PRESSOR_DAYS)
    directories.print_log(
        "\tdropping", sum(rows_to_remove),
        "icustays with first pressor episode occurring more than",
        "%2i" % MAXIMUM_TIME_TO_PRESSOR_DAYS, "days after admission")
    vaso_episodes = vaso_episodes[~rows_to_remove]
    return vaso_episodes

def build_features(labs, pressors_by_icustay):
    directories.print_log("\tbuilding features")
    data = []
    for i in range(len(labs)):
        directories.print_log("\t\tfor epoch", i)
        temp = feature_definitions.attach_lab_features(pressors_by_icustay,
                                                       labs[i])
        # label positive when the stay has a pressor episode and this
        # epoch's window starts no more than 24 hrs before it
        temp["LABEL"] = (temp.EPISODE == 1) & (RECORD_LENGTH_SHIFT * i <= 24)
        temp["EPOCH"] = i
        data.append(temp)
    return pd.concat(data)

def main():
    # load in pressor episode times
    with directories.engine.connect() as connection:
        vaso_episodes = pd.read_sql("pressors_by_icustay",
                                    con=connection,
                                    index_col="ICUSTAY_ID")
    with directories.engine.connect() as connection:
        interval_splits = pd.read_sql("interval_splits",
                                      con=connection,
                                      index_col="ICUSTAY_ID")
    interval_splits.set_index("HADM_ID", inplace=True)

    extract_lab_events(vaso_episodes, interval_splits)
    directories.print_log("Done processing lab events!")
    directories.print_log()

def extract_lab_records():
    directories.print_log("\textracting lab records for", RECORD_LENGTH_HOURS,
                          "hr windows, with a shift of", RECORD_LENGTH_SHIFT,
                          "hrs")
    labs = []
    with directories.engine.connect() as connection:
        for k in range(int(48 / RECORD_LENGTH_SHIFT)):
            directories.print_log("\t\tloading epoch", k)
            temp = []
            for i in range(RECORD_LENGTH_HOURS):
                QUERY = f"""
                select * from lab_events
                where "HOURS_BEFORE_PRESSOR"={RECORD_LENGTH_SHIFT*k + 1 + i}
                order by "HOURS_BEFORE_PRESSOR"
                """
                temp.append(pd.read_sql_query(QUERY, con=connection))
            labs.append(pd.concat(temp))
    return labs

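# Illustrative sketch only (not part of the pipeline), clarifying the window
# arithmetic in extract_lab_records() above: with hypothetical values
# RECORD_LENGTH_SHIFT = 2 and RECORD_LENGTH_HOURS = 12, epoch k queries lab
# events with HOURS_BEFORE_PRESSOR in [2*k + 1, 2*k + 12], i.e. a 12 hr
# window sliding back 2 hrs per epoch.
def _demo_epoch_windows(record_length_shift=2, record_length_hours=12):
    for k in range(3):
        first = record_length_shift * k + 1
        last = record_length_shift * k + record_length_hours
        print(f"epoch {k}: HOURS_BEFORE_PRESSOR in [{first}, {last}]")
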
def load_weights(icustays):
    path = os.path.join(directories.mimic_dir, "heightweight.csv")
    heightweight = pd.read_csv(path,
                               dtype={"icustay_id": pd.Int32Dtype()},
                               usecols=["icustay_id", "weight_min"])
    heightweight.rename(inplace=True,
                        columns={
                            "icustay_id": "ICUSTAY_ID",
                            "weight_min": "WEIGHT_KG"
                        })

    # print num rows dropped b/c of weight
    length_before_drop = len(heightweight)
    heightweight = heightweight[
        (MINIMUM_WEIGHT_KG < heightweight.WEIGHT_KG)
        & (heightweight.WEIGHT_KG < MAXIMUM_WEIGHT_KG)]
    length_after_drop = len(heightweight)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/ weight outside", MINIMUM_WEIGHT_KG,
                          "-", MAXIMUM_WEIGHT_KG, "kg (likely mislabeled)")

    icustays = icustays.join(heightweight.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays

def load_OASIS(icustays):
    path = os.path.join(directories.mimic_dir, "oasis.csv")
    dtypes = {
        "icustay_id": pd.Int32Dtype(),
        "OASIS": pd.Int16Dtype(),
    }
    oasis = pd.read_csv(path, dtype=dtypes)
    oasis.rename(inplace=True, columns={
        "icustay_id": "ICUSTAY_ID",
    })

    # print number of icustays dropped from missing OASIS
    length_before_drop = len(oasis)
    oasis.dropna(axis=0, inplace=True)  # 0 na icustays
    length_after_drop = len(oasis)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/o OASIS")

    # interpret OASIS scores as int
    oasis.ICUSTAY_ID = oasis.ICUSTAY_ID.astype(int)
    oasis.OASIS = oasis.OASIS.astype(int)

    icustays = icustays.join(oasis.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays

def main():
    with engine.connect() as connection:
        vaso_episodes = pd.read_sql("pressors_by_icustay",
                                    con=connection,
                                    index_col="ICUSTAY_ID")

    print_log("building hour-long intervals for each icustay")
    interval_splits = [
        pd.Series(vaso_episodes.PRESSOR_START_SEC - i * 60 * 60,
                  name=i,
                  dtype=pd.Int32Dtype()) for i in range(N_WINDOWS + 1)
    ]
    interval_splits = pd.concat(interval_splits, axis=1)
    interval_splits = interval_splits.join(
        vaso_episodes[["SUBJECT_ID", "HADM_ID"]])

    print_log("\tsaving intervals to database `PressorGauge` in table"
              " `interval_splits`")
    with engine.connect() as connection:
        interval_splits.to_sql("interval_splits",
                               con=connection,
                               if_exists="replace",
                               index_label="ICUSTAY_ID")
    print_log("Done computing intervals!")
    print_log()

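# A minimal sketch (assumed values, not from the source) of how the
# interval_splits columns are meant to be read: column i holds
# PRESSOR_START_SEC - i*3600, so a lab drawn t seconds after admission
# falls in hour-bucket i exactly when splits[i + 1] < t <= splits[i].
def _demo_interval_lookup(pressor_start_sec=10 * 60 * 60,
                          t=7 * 60 * 60 + 30):
    n_windows = 12  # hypothetical; the real N_WINDOWS is defined elsewhere
    splits = [pressor_start_sec - i * 60 * 60 for i in range(n_windows + 1)]
    for i in range(n_windows):
        if splits[i + 1] < t <= splits[i]:
            print(f"t={t}s lands {i} full hours before the pressor start")
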
def main():
    directories.print_log("building features")
    labs = extract_lab_records()
    with directories.engine.connect() as connection:
        pressors_by_icustay = pd.read_sql("pressors_by_icustay",
                                          con=connection)
    data = build_features(labs, pressors_by_icustay)

    directories.print_log("\tcleaning features")
    cleaned_data = feature_definitions.clean_data(data)
    # set nan flag counts to zero, since there were no lab results at all
    # in those windows
    cleaned_data.TOTAL_FLAG_COUNT.fillna(0, inplace=True)
    for feature in feature_definitions.features:
        cleaned_data[feature_definitions.features[feature]["ab"] +
                     "_FLAG_COUNT"].fillna(0, inplace=True)

    extract_training_inputs(cleaned_data)
    extract_test_inputs(cleaned_data)
    directories.print_log("done building features!")
    directories.print_log()

def main():
    directories.print_log("building pressor database", mode="w")

    icustays = load_icustay()
    icustays = load_admissions(icustays)
    icustays = load_patients(icustays)
    icustays = load_weights(icustays)
    icustays = load_LODS(icustays)
    icustays = load_OASIS(icustays)
    icustays = load_APACHE(icustays)

    vaso_episodes = load_vasopressor_durations()
    vaso_episodes = clean_vaso_episodes_1(vaso_episodes)
    vaso_episodes = pair_episodes_and_stays(vaso_episodes, icustays)
    vaso_episodes = compute_hours_to_pressor(vaso_episodes)
    vaso_episodes = clean_vaso_episodes_2(vaso_episodes)

    directories.print_log(
        "\tsaving to SQL database `PressorGauge`, table `pressors_by_icustay`")
    with directories.engine.connect() as connection:
        vaso_episodes.to_sql("pressors_by_icustay",
                             con=connection,
                             if_exists="replace",
                             index_label="ICUSTAY_ID")

    directories.print_log("\ttotal of", len(vaso_episodes),
                          "icustays, of which",
                          sum(vaso_episodes.EPISODE > 0),
                          "have a pressor episode")
    check = 100 * float(sum(vaso_episodes.EPISODE == 1)) / len(vaso_episodes)
    directories.print_log("\tsanity check: %2.0f%%" % check,
                          "have pressors, ideally in range 1/4 to 1/3")
    directories.print_log("Done building pressor database!")
    directories.print_log()

def clean_vaso_episodes_2(vaso_episodes):
    vaso_episodes.reset_index(inplace=True)
    vaso_episodes.ICUSTAY_ID = vaso_episodes.ICUSTAY_ID.astype(int)
    vaso_episodes.EPISODE = vaso_episodes.EPISODE.astype(int)
    vaso_episodes.NUMBER_OF_EPISODES = (
        vaso_episodes.NUMBER_OF_EPISODES.astype(int))

    # for stays without a pressor episode, treat the ICU discharge time as
    # the episode start
    vaso_episodes.loc[vaso_episodes.EPISODE == 0,
                      "EPISODE_START_POST_TRANSFER"] = (
                          vaso_episodes.OUTTIME -
                          vaso_episodes.ADMITTIME)[vaso_episodes.EPISODE == 0]
    vaso_episodes.loc[vaso_episodes.EPISODE == 0,
                      "STARTTIME"] = vaso_episodes.OUTTIME[
                          vaso_episodes.EPISODE == 0]

    # remove episodes that start too soon after hospital admission
    rows_to_remove = vaso_episodes.EPISODE_START_POST_TRANSFER < pd.Timedelta(
        hours=MINIMUM_TIME_TO_PRESSOR_HOURS)
    directories.print_log(
        "\tdropping", sum(rows_to_remove),
        "icustays with first pressor episode occurring less than",
        "%2i" % MINIMUM_TIME_TO_PRESSOR_HOURS,
        "hours after hospital admission")
    vaso_episodes = vaso_episodes[~rows_to_remove].copy()

    ## clean up ICU stays
    # MIMIC de-identification shifts the DOB of patients older than 89,
    # which produces implausibly large ages; reset those to 91
    rows_to_replace = vaso_episodes.AGE > 150
    directories.print_log("\treplacing age of", sum(rows_to_replace),
                          "patients over 89 with age 91")
    vaso_episodes.loc[rows_to_replace, "AGE"] = 91

    rows_to_remove = vaso_episodes.AGE < MINIMUM_AGE
    directories.print_log("\tdropping", sum(rows_to_remove),
                          "icustays with age less than", MINIMUM_AGE)
    vaso_episodes = vaso_episodes[~rows_to_remove]

    rows_to_remove = (vaso_episodes.DURATION_HOURS <
                      MINIMUM_PRESSOR_DURATION_MINUTES / 60) & (
                          vaso_episodes.EPISODE == 1)
    directories.print_log("\tdropping", sum(rows_to_remove),
                          "pressor episodes with vaso durations less than",
                          MINIMUM_PRESSOR_DURATION_MINUTES, "minutes")
    vaso_episodes = vaso_episodes[~rows_to_remove]
    vaso_episodes.set_index("ICUSTAY_ID", inplace=True)

    # compute pressor time distribution (hours post-admission)
    kde_true = KernelDensity(kernel="tophat")
    kde_true.fit(
        np.reshape(
            vaso_episodes[vaso_episodes.EPISODE == 1]
            ["EPISODE_START_POST_TRANSFER"].values.astype(int) / 10**9 / 60 /
            60, (-1, 1)))  # in hours
    kde_false = KernelDensity(kernel="tophat")
    kde_false.fit(
        np.reshape(
            vaso_episodes[vaso_episodes.EPISODE == 0]
            ["EPISODE_START_POST_TRANSFER"].values.astype(int) / 10**9 / 60 /
            60, (-1, 1)))  # in hours
    with open(os.path.join(directories.model_dir, "time_distributions.p"),
              "wb") as file:
        pickle.dump({"kde_true": kde_true, "kde_false": kde_false}, file)

    # episode start as integer seconds post-admission
    vaso_episodes["PRESSOR_START_SEC"] = (vaso_episodes.STARTTIME -
                                          vaso_episodes.ADMITTIME)
    vaso_episodes = vaso_episodes[~vaso_episodes.PRESSOR_START_SEC.isna()]
    vaso_episodes.PRESSOR_START_SEC = (
        vaso_episodes.PRESSOR_START_SEC.astype(int) / 10**9).apply(np.int32)

    labels = [
        "index", "ENDTIME", "DURATION_HOURS", "NUMBER_OF_EPISODES", "INTIME",
        "OUTTIME", "EPISODE_START_POST_TRANSFER", "STARTTIME", "DOB"
    ]
    vaso_episodes.drop(axis=1, labels=labels, inplace=True)

    len1 = len(vaso_episodes)
    vaso_episodes.dropna(inplace=True)
    len2 = len(vaso_episodes)
    directories.print_log("\tdropping", len1 - len2,
                          "icustays with missing values")

    directories.print_log(
        "\tdropping",
        len(vaso_episodes) - len(vaso_episodes.HADM_ID.unique()),
        "multiple ICU stays in same hospital visit")
    vaso_episodes.drop_duplicates("HADM_ID", inplace=True)
    return vaso_episodes

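# A minimal sketch (not part of the pipeline) of how the pickled time
# distributions saved by clean_vaso_episodes_2() could be inspected later;
# it relies only on sklearn's KernelDensity API, where score_samples()
# returns the log-density.
def _demo_time_distributions():
    with open(os.path.join(directories.model_dir, "time_distributions.p"),
              "rb") as file:
        kdes = pickle.load(file)
    hours = np.arange(0, 73, dtype=float).reshape(-1, 1)
    log_dens_true = kdes["kde_true"].score_samples(hours)    # pressor stays
    log_dens_false = kdes["kde_false"].score_samples(hours)  # non-pressor stays
    return np.exp(log_dens_true), np.exp(log_dens_false)
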
def main():
    with directories.engine.connect() as connection:
        pressors_by_icustay = pd.read_sql("pressors_by_icustay",
                                          con=connection,
                                          index_col="ICUSTAY_ID")

    # one row per subject, labeled by whether any stay had a pressor episode
    X = pressors_by_icustay.groupby("SUBJECT_ID").agg({
        "EPISODE": "max"
    }).reset_index()
    y = X.pop("EPISODE")

    directories.print_log("splitting train, validation, and test sets",
                          mode="w")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=TEST_SIZE_FRACTION, random_state=RND_SEED, stratify=y)
    directories.print_log("\t", len(X_train), "training samples")
    directories.print_log("\t", len(X_test), "testing samples")

    directories.print_log("\tsaving train and test set identifiers")
    with open(os.path.join(directories.model_dir, "training_subjects.p"),
              "wb") as file:
        pickle.dump(X_train, file)
    with open(os.path.join(directories.model_dir, "test_subjects.p"),
              "wb") as file:
        pickle.dump(X_test, file)
    directories.print_log("done splitting data!")
    directories.print_log()

def extract_lab_events(vaso_episodes, interval_splits):
    # initialize labevents dataframe
    columns = [
        "SUBJECT_ID", "ITEMID", "CHARTTIME", "VALUENUM", "FLAG", "HADM_ID"
    ]
    dtypes = {
        "VALUENUM": float,
        "FLAG": str,
        "SUBJECT_ID": pd.Int32Dtype(),
        "HADM_ID": pd.Int32Dtype()
    }
    date_cols = ["CHARTTIME"]

    directories.print_log("extracting relevant lab events")
    path = os.path.join(directories.mimic_dir, "LABEVENTS.csv")
    count = 0
    replace_flag = True
    for chunk in pd.read_csv(path,
                             chunksize=CHUNK_SIZE,
                             usecols=columns,
                             dtype=dtypes,
                             parse_dates=date_cols):
        # throwing away about 25% of labs from outpatient settings b/c it is
        # hard to attach them to an admission number
        chunk = chunk[~pd.isna(chunk.HADM_ID)]
        # add a column tracking which hour before the pressor event each
        # lab falls in (HOURS_BEFORE_PRESSOR)
        chunk["HOURS_BEFORE_PRESSOR"] = 0
        # convert chart times to integer seconds post-admission
        temp = chunk.join(vaso_episodes[["HADM_ID",
                                         "ADMITTIME"]].set_index("HADM_ID"),
                          on="HADM_ID")
        chunk.CHARTTIME = np.int32(
            (chunk.CHARTTIME - temp.ADMITTIME).values / 10**9)

        count += 1
        if count > NUM_CHUNKS:
            break
        directories.print_log("\tprocessing LABEVENTS chunk:", count, "of",
                              math.ceil(27_854_055 / CHUNK_SIZE))

        # loop through all the times in a given hospital admission (HADM_ID);
        # recall we keep only hospital admissions with 1 icustay
        for (time, hadm), time_group in chunk[chunk.HADM_ID.isin(
                vaso_episodes.HADM_ID)].groupby(["CHARTTIME", "HADM_ID"]):
            # only keep lab data that falls inside one of the hour-long
            # windows before the pressor episode
            splits = interval_splits.loc[hadm]
            i = find_first((splits[1:-1].values < time)
                           & (time <= splits[:-2].values))
            if i:  # falsy both when no window matched and for window 0
                time_group = time_group.copy()
                time_group.HOURS_BEFORE_PRESSOR = i
                # if this is the first batch to be written, replace the
                # table, otherwise append to it
                if replace_flag:
                    mode = "replace"
                    replace_flag = False
                else:
                    mode = "append"
                with directories.engine.connect() as connection:
                    time_group.to_sql("lab_events",
                                      con=connection,
                                      if_exists=mode,
                                      index=False)

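# extract_lab_events() calls find_first(), which is not defined in this
# excerpt; a plausible implementation (an assumption, not necessarily the
# project's actual helper) returns the index of the first True entry of a
# boolean mask, or None when nothing matches:
def find_first(mask):
    indices = np.flatnonzero(mask)
    return int(indices[0]) if indices.size else None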