Example #1
def load_APACHE(icustays):
    path = os.path.join(directories.mimic_dir, "apache.csv")
    dtypes = {
        "icustay_id": pd.Int32Dtype(),
        "APSIII": pd.Int16Dtype(),
    }
    apache = pd.read_csv(path, dtype=dtypes)
    apache.rename(inplace=True,
                  columns={
                      "icustay_id": "ICUSTAY_ID",
                      "APSIII": "APACHE"
                  })
    # print number of icustays dropped from missing APACHE
    length_before_drop = len(apache)
    apache.dropna(axis=0, inplace=True)  # 34 na icustays
    length_after_drop = len(apache)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/o APACHE")
    # interpret APACHE score as int
    apache.ICUSTAY_ID = apache.ICUSTAY_ID.astype(int)
    apache.APACHE = apache.APACHE.astype(int)
    icustays = icustays.join(apache.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays
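
These loaders lean on a shared `directories` module that is not shown in this listing. As a minimal sketch (an assumption, not the project's actual module): `mimic_dir` and `model_dir` point at the data and model folders, `engine` is a SQLAlchemy engine, and `print_log` tees messages to stdout and a log file.

import os

mimic_dir = os.environ.get("MIMIC_DIR", "data/mimic")  # placeholder path
model_dir = os.environ.get("MODEL_DIR", "models")      # placeholder path

def print_log(*args, mode="a"):
    # print to stdout and mirror to a log file; mode="w" starts a fresh log
    message = " ".join(str(arg) for arg in args)
    print(message)
    with open(os.path.join(model_dir, "log.txt"), mode) as file:
        file.write(message + "\n")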
Example #2
def load_LODS(icustays):
    path = os.path.join(directories.mimic_dir, "lods.csv")
    dtypes = {
        "icustay_id": pd.Int32Dtype(),
        "LODS": pd.Int16Dtype(),
        "pulmonary": pd.Int16Dtype(),
        "cardiovascular": pd.Int16Dtype(),
    }
    lods = pd.read_csv(path, dtype=dtypes)
    lods.rename(inplace=True,
                columns={
                    "icustay_id": "ICUSTAY_ID",
                    "pulmonary": "PULMONARY_LODS",
                    "cardiovascular": "CARDIOVASCULAR_LODS"
                })
    # print number of icustays dropped from missing LODS
    length_before_drop = len(lods)
    lods.dropna(axis=0, inplace=True)  # 2482 na icustays
    length_after_drop = len(lods)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/o LODS")
    # interpret LODS scores as ints
    lods.ICUSTAY_ID = lods.ICUSTAY_ID.astype(int)
    lods.LODS = lods.LODS.astype(int)
    lods.PULMONARY_LODS = lods.PULMONARY_LODS.astype(int)
    lods.CARDIOVASCULAR_LODS = lods.CARDIOVASCULAR_LODS.astype(int)
    icustays = icustays.join(lods.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays
Example #3
def extract_test_inputs(cleaned_data):
    with open(os.path.join(directories.model_dir, "test_subjects.p"),
              "rb") as file:
        test_subjects = pickle.load(file).values
    test_subjects = test_subjects.reshape(-1)

    test_inputs = cleaned_data[cleaned_data.SUBJECT_ID.isin(test_subjects)]
    test_inputs = test_inputs[
        ["LABEL", "SUBJECT_ID", "EPISODE", "EPOCH", "HADM_ID"] +
        feature_definitions.features_to_keep]

    ratio = 100. * sum(test_inputs.LABEL.values) / len(test_inputs)
    directories.print_log("\t\t", len(test_inputs),
                          "test samples before dropping na,",
                          "%2.0f%%" % ratio, "are positive")
    test_inputs = test_inputs.dropna()

    ratio = 100. * sum(test_inputs.LABEL.values) / len(test_inputs)
    directories.print_log("\t\t", len(test_inputs),
                          "test samples after dropping na,", "%2.0f%%" % ratio,
                          "are positive")

    print("\t%2.0f%% of test subjects will need pressors" %
          (100 * sum(test_inputs.EPISODE == 1) / len(test_inputs)))
    print("\t%2.0f%% of test subjects won't need pressors at all" %
          (100 * sum(test_inputs.EPISODE == 0) / len(test_inputs)))

    with open(os.path.join(directories.model_dir, "test_features.p"),
              "wb") as file:
        pickle.dump(test_inputs, file)
Example #4
def main():
    directories.print_log("preprocessing data")
    directories.print_log("\tpreparing training set")
    preprocess_training_inputs()
    directories.print_log("\tpreparing test set")
    preprocess_test_inputs()
    directories.print_log("done preprocessing data!")
    directories.print_log()
Example #5
def compute_hours_to_pressor(vaso_episodes):
    vaso_episodes["EPISODE_START_POST_TRANSFER"] = (
        vaso_episodes.STARTTIME - vaso_episodes.ADMITTIME)
    rows_to_remove = vaso_episodes.EPISODE_START_POST_TRANSFER > pd.Timedelta(
        days=MAXIMUM_TIME_TO_PRESSOR_DAYS)
    directories.print_log(
        "\tdropping", sum(rows_to_remove),
        "icustays with first pressor episode occurring more than",
        "%2i" % MAXIMUM_TIME_TO_PRESSOR_DAYS, "days after admission")
    vaso_episodes = vaso_episodes[~rows_to_remove]
    return vaso_episodes
Example #6
def build_features(labs, pressors_by_icustay):
    directories.print_log("\tbuilding features")
    data = []
    for i in range(len(labs)):
        directories.print_log("\t\tfor epoch", i)
        temp = feature_definitions.attach_lab_features(pressors_by_icustay,
                                                       labs[i])
        temp["LABEL"] = (temp.EPISODE
                         == 1) & (RECORD_LENGTH_SHIFT * i <= 24
                                  )  # positive detection is 12 hours or less
        temp["EPOCH"] = i
        data.append(temp)
    return pd.concat(data)
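
The cutoff in `build_features` means an epoch can only be labeled positive when its window starts at most 24 hours before the pressor. A toy check of the rule, assuming a hypothetical `RECORD_LENGTH_SHIFT` of 6 (the real constant is defined elsewhere in the project):

RECORD_LENGTH_SHIFT = 6  # hypothetical value, for illustration only

for i in range(8):
    eligible = RECORD_LENGTH_SHIFT * i <= 24
    print(f"epoch {i}: starts {RECORD_LENGTH_SHIFT * i} hr before pressor,"
          f" label eligible: {eligible}")
# with a 6 hr shift, epochs 0 through 4 are label eligible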
Example #7
def main():
    # load in pressor episode times
    with directories.engine.connect() as connection:
        vaso_episodes = pd.read_sql("pressors_by_icustay",
                                    con=connection,
                                    index_col="ICUSTAY_ID")

    with directories.engine.connect() as connection:
        interval_splits = pd.read_sql("interval_splits",
                                      con=connection,
                                      index_col="ICUSTAY_ID")

    interval_splits.set_index("HADM_ID", inplace=True)

    extract_lab_events(vaso_episodes, interval_splits)

    directories.print_log("Done processing lab events!")
    directories.print_log()
Example #8
def extract_lab_records():
    directories.print_log("\textracting lab records for", RECORD_LENGTH_HOURS,\
        "hr windows, with a shift of", RECORD_LENGTH_SHIFT, "hrs")
    labs = []
    with directories.engine.connect() as connection:
        for k in range(int(48 / RECORD_LENGTH_SHIFT)):
            directories.print_log("\t\tloading epoch", k)
            temp = []
            for i in range(RECORD_LENGTH_HOURS):
                QUERY = f"""
                select *
                from lab_events
                where "HOURS_BEFORE_PRESSOR"={RECORD_LENGTH_SHIFT*k + 1 +i}
                order by "HOURS_BEFORE_PRESSOR"
                """
                temp.append(pd.read_sql_query(QUERY, con=connection))
            labs.append(pd.concat(temp))
    return labs
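
The inner loop issues `RECORD_LENGTH_HOURS` separate queries per epoch. A single range query per epoch would return the same rows in one round trip; a sketch of that variant, under the same table layout and constants (an alternative, not the project's code):

def extract_lab_records_ranged():
    # fetch each epoch's whole window with one BETWEEN query instead of
    # one query per hour; BETWEEN is inclusive on both ends
    labs = []
    with directories.engine.connect() as connection:
        for k in range(int(48 / RECORD_LENGTH_SHIFT)):
            low = RECORD_LENGTH_SHIFT * k + 1
            high = RECORD_LENGTH_SHIFT * k + RECORD_LENGTH_HOURS
            QUERY = f"""
            select *
            from lab_events
            where "HOURS_BEFORE_PRESSOR" between {low} and {high}
            order by "HOURS_BEFORE_PRESSOR"
            """
            labs.append(pd.read_sql_query(QUERY, con=connection))
    return labs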
Example #9
def load_weights(icustays):
    path = os.path.join(directories.mimic_dir, "heightweight.csv")
    heightweight = pd.read_csv(path,
                               dtype={"icustay_id": pd.Int32Dtype()},
                               usecols=["icustay_id", "weight_min"])
    heightweight.rename(inplace=True,
                        columns={
                            "icustay_id": "ICUSTAY_ID",
                            "weight_min": "WEIGHT_KG"
                        })
    # print number of rows dropped because of implausible weights
    length_before_drop = len(heightweight)
    heightweight = heightweight[
        (MINIMUM_WEIGHT_KG < heightweight.WEIGHT_KG)
        & (heightweight.WEIGHT_KG < MAXIMUM_WEIGHT_KG)]
    length_after_drop = len(heightweight)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/ weight outside", MINIMUM_WEIGHT_KG,
                          "to", MAXIMUM_WEIGHT_KG, "kg (likely mislabeled)")
    icustays = icustays.join(heightweight.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays
Example #10
def load_OASIS(icustays):
    path = os.path.join(directories.mimic_dir, "oasis.csv")
    dtypes = {
        "icustay_id": pd.Int32Dtype(),
        "OASIS": pd.Int16Dtype(),
    }
    oasis = pd.read_csv(path, dtype=dtypes)
    oasis.rename(inplace=True, columns={
        "icustay_id": "ICUSTAY_ID",
    })
    # print number of icustays dropped from missing OASIS
    length_before_drop = len(oasis)
    oasis.dropna(axis=0, inplace=True)  # 0 na icustays
    length_after_drop = len(oasis)
    directories.print_log("\t", length_before_drop - length_after_drop,
                          "icustays w/o OASIS")
    # interpret OASIS scores as int
    oasis.ICUSTAY_ID = oasis.ICUSTAY_ID.astype(int)
    oasis.OASIS = oasis.OASIS.astype(int)
    icustays = icustays.join(oasis.set_index("ICUSTAY_ID"),
                             on="ICUSTAY_ID",
                             how="left")
    return icustays
Example #11
def main():

    with engine.connect() as connection:
        vaso_episodes = pd.read_sql("pressors_by_icustay", con=connection, index_col="ICUSTAY_ID")

    print_log("building hour-long intervals for each icustay")

    interval_splits = [pd.Series(vaso_episodes.PRESSOR_START_SEC - i*60*60, name=i, dtype=pd.Int32Dtype()) for i in range(N_WINDOWS+1)]
    interval_splits = pd.concat(interval_splits, axis=1)
    interval_splits = interval_splits.join(vaso_episodes[["SUBJECT_ID","HADM_ID"]])

    print_log("\tsaving intervals to database `PressorGauge` in table `intervals`")
    with engine.connect() as connection:
        interval_splits.to_sql("interval_splits", con=connection, if_exists="replace", index_label="ICUSTAY_ID")

    print_log("Done computing intervals!")
    print_log()
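
Column `i` of `interval_splits` holds `PRESSOR_START_SEC - i * 3600`, i.e. the boundary exactly `i` hours before the pressor start, in seconds post-admission. A toy illustration with made-up numbers and an assumed `N_WINDOWS = 3`:

import pandas as pd

N_WINDOWS = 3  # hypothetical; the real constant is defined elsewhere
vaso = pd.DataFrame({"PRESSOR_START_SEC": [36_000]},  # pressor at hour 10
                    index=pd.Index([200001], name="ICUSTAY_ID"))

splits = pd.concat([pd.Series(vaso.PRESSOR_START_SEC - i * 60 * 60, name=i)
                    for i in range(N_WINDOWS + 1)], axis=1)
print(splits)  # columns 0..3 hold 36000, 32400, 28800, 25200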
Example #12
def main():
    directories.print_log("building features")
    labs = extract_lab_records()

    with directories.engine.connect() as connection:
        pressors_by_icustay = pd.read_sql("pressors_by_icustay",
                                          con=connection)

    data = build_features(labs, pressors_by_icustay)

    directories.print_log("\tcleaning features")
    cleaned_data = feature_definitions.clean_data(data)
    # set nan flag counts to zero, since there were no lab results at all in those windows
    cleaned_data.TOTAL_FLAG_COUNT = cleaned_data.TOTAL_FLAG_COUNT.fillna(0)
    for feature in feature_definitions.features:
        column = feature_definitions.features[feature]["ab"] + "_FLAG_COUNT"
        cleaned_data[column] = cleaned_data[column].fillna(0)

    extract_training_inputs(cleaned_data)

    extract_test_inputs(cleaned_data)

    directories.print_log("done building features!")
    directories.print_log()
Example #13
def main():
    directories.print_log("building pressor database", mode="w")
    icustays = load_icustay()
    icustays = load_admissions(icustays)
    icustays = load_patients(icustays)
    icustays = load_weights(icustays)
    icustays = load_LODS(icustays)
    icustays = load_OASIS(icustays)
    icustays = load_APACHE(icustays)

    vaso_episodes = load_vasopressor_durations()
    vaso_episodes = clean_vaso_episodes_1(vaso_episodes)
    vaso_episodes = pair_episodes_and_stays(vaso_episodes, icustays)
    vaso_episodes = compute_hours_to_pressor(vaso_episodes)
    vaso_episodes = clean_vaso_episodes_2(vaso_episodes)

    directories.print_log(
        "\tsaving to SQL database `PressorGauge`, table `pressors_by_icustay`")
    with directories.engine.connect() as connection:
        vaso_episodes.to_sql("pressors_by_icustay",
                             con=connection,
                             if_exists="replace",
                             index_label="ICUSTAY_ID")

    directories.print_log("\ttotal of", len(vaso_episodes),
                          "icustays, of which", sum(vaso_episodes.EPISODE > 0),
                          "have a pressor episode")
    check = 100 * float(sum(vaso_episodes.EPISODE == 1)) / len(vaso_episodes)
    directories.print_log("\tsanity check: %2.0f%%" % check,
                          "have pressors, ideally in range 1/4 to 1/3")

    directories.print_log("Done building pressor database!")
    directories.print_log()
Example #14
def clean_vaso_episodes_2(vaso_episodes):
    vaso_episodes.reset_index(inplace=True)
    vaso_episodes.ICUSTAY_ID = vaso_episodes.ICUSTAY_ID.astype(int)
    vaso_episodes.EPISODE = vaso_episodes.EPISODE.astype(int)
    vaso_episodes.NUMBER_OF_EPISODES = vaso_episodes.NUMBER_OF_EPISODES.astype(
        int)

    vaso_episodes.loc[vaso_episodes.EPISODE == 0,
                      "EPISODE_START_POST_TRANSFER"] = (
                          vaso_episodes.OUTTIME -
                          vaso_episodes.ADMITTIME)[vaso_episodes.EPISODE == 0]
    vaso_episodes.loc[vaso_episodes.EPISODE == 0,
                      "STARTTIME"] = vaso_episodes.OUTTIME[
                          vaso_episodes.EPISODE == 0]

    # remove episodes that start within the minimum time after admission
    rows_to_remove = vaso_episodes.EPISODE_START_POST_TRANSFER < pd.Timedelta(
        hours=MINIMUM_TIME_TO_PRESSOR_HOURS)
    directories.print_log(
        "\tdropping", sum(rows_to_remove),
        "icustays with first pressor episode occurring less than",
        "%2i" % MINIMUM_TIME_TO_PRESSOR_HOURS,
        "hours after hospital admission")
    vaso_episodes = vaso_episodes[~rows_to_remove].copy()

    ## clean up ICU stays
    # MIMIC shifts the DOB of patients older than 89 during de-identification,
    # which makes their computed ages implausibly large
    rows_to_replace = vaso_episodes.AGE > 150
    directories.print_log("\treplacing age of", sum(rows_to_replace),
                          "patients over 89 with age 91")
    vaso_episodes.loc[rows_to_replace, "AGE"] = 91
    rows_to_remove = vaso_episodes.AGE < MINIMUM_AGE
    directories.print_log("\tdropping", sum(rows_to_remove),
                          "icustays with age less than", MINIMUM_AGE)
    vaso_episodes = vaso_episodes[~rows_to_remove]

    rows_to_remove = (vaso_episodes.DURATION_HOURS <
                      MINIMUM_PRESSOR_DURATION_MINUTES / 60) & (
                          vaso_episodes.EPISODE == 1)
    directories.print_log("\tdropping", sum(rows_to_remove),
                          "pressor episodes with vaso durations less than",
                          MINIMUM_PRESSOR_DURATION_MINUTES, "minutes")
    vaso_episodes = vaso_episodes[~rows_to_remove]

    vaso_episodes.set_index("ICUSTAY_ID", inplace=True)

    # compute pressor time distribution
    kde_true = KernelDensity(kernel="tophat")
    kde_true.fit(
        np.reshape(
            vaso_episodes[vaso_episodes.EPISODE == 1]
            ["EPISODE_START_POST_TRANSFER"].values.astype(int) / 10**9 / 60 /
            60, (-1, 1)))  # in hours
    kde_false = KernelDensity(kernel="tophat")
    kde_false.fit(
        np.reshape(
            vaso_episodes[vaso_episodes.EPISODE == 0]
            ["EPISODE_START_POST_TRANSFER"].values.astype(int) / 10**9 / 60 /
            60, (-1, 1)))  # in hours
    with open(os.path.join(directories.model_dir, "time_distributions.p"),
              "wb") as file:
        pickle.dump({"kde_true": kde_true, "kde_false": kde_false}, file)

    vaso_episodes["PRESSOR_START_SEC"] = (
        vaso_episodes.STARTTIME - vaso_episodes.ADMITTIME)
    vaso_episodes = vaso_episodes[~vaso_episodes.PRESSOR_START_SEC.isna()]
    # convert the timedelta64[ns] offsets to whole seconds
    vaso_episodes.PRESSOR_START_SEC = (
        vaso_episodes.PRESSOR_START_SEC.astype(int) / 10**9).apply(np.int32)
    labels = [
        "index", "ENDTIME", "DURATION_HOURS", "NUMBER_OF_EPISODES", "INTIME",
        "OUTTIME", "EPISODE_START_POST_TRANSFER", "STARTTIME", "DOB"
    ]
    vaso_episodes.drop(axis=1, labels=labels, inplace=True)

    len1 = len(vaso_episodes)
    vaso_episodes.dropna(inplace=True)
    len2 = len(vaso_episodes)
    directories.print_log("\tdropping", len1 - len2,
                          "icustays with missing values")

    directories.print_log(
        "\tdropping",
        len(vaso_episodes) - len(vaso_episodes.HADM_ID.unique()),
        "multiple ICU stays in same hospital visit")
    vaso_episodes.drop_duplicates("HADM_ID", inplace=True)

    return vaso_episodes
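
The pickled `KernelDensity` models can later be evaluated with scikit-learn's standard API. A minimal consumer sketch, assuming the `{"kde_true": ..., "kde_false": ...}` layout written above:

import os
import pickle
import numpy as np

with open(os.path.join(directories.model_dir, "time_distributions.p"),
          "rb") as file:
    kdes = pickle.load(file)

hours = np.array([[6.0], [24.0], [72.0]])  # hours post-admission to score
log_p_true = kdes["kde_true"].score_samples(hours)    # log density | pressor
log_p_false = kdes["kde_false"].score_samples(hours)  # log density | no pressor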
Example #15
def main():
    with directories.engine.connect() as connection:
        pressors_by_icustay = pd.read_sql("pressors_by_icustay",
                                          con=connection,
                                          index_col="ICUSTAY_ID")

    X = pressors_by_icustay.groupby("SUBJECT_ID").agg({
        "EPISODE": "max"
    }).reset_index()
    y = X.pop("EPISODE")

    directories.print_log("splitting train, validation, and test sets",
                          mode="w")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=TEST_SIZE_FRACTION, random_state=RND_SEED, stratify=y)

    directories.print_log("\t", len(X_train), "training samples")
    directories.print_log("\t", len(X_test), "testing samples")

    directories.print_log("\t saving train and test set identifiers")
    with open(os.path.join(directories.model_dir, "training_subjects.p"),
              "wb") as file:
        pickle.dump(X_train, file)
    with open(os.path.join(directories.model_dir, "test_subjects.p"),
              "wb") as file:
        pickle.dump(X_test, file)

    directories.print_log("done splitting data!")
    directories.print_log()
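
Because the split is over `SUBJECT_ID` (stratified on each subject's worst `EPISODE`), no subject should appear in both sets. A quick sanity check one could run against the saved pickles, assuming the file names above:

import os
import pickle

with open(os.path.join(directories.model_dir, "training_subjects.p"),
          "rb") as file:
    train_subjects = set(pickle.load(file).SUBJECT_ID)
with open(os.path.join(directories.model_dir, "test_subjects.p"),
          "rb") as file:
    test_subjects = set(pickle.load(file).SUBJECT_ID)

assert train_subjects.isdisjoint(test_subjects), "subject leaked across split"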
Example #16
def extract_lab_events(vaso_episodes, interval_splits):
    # initialize chartevents dataframe
    columns = [
        "SUBJECT_ID", "ITEMID", "CHARTTIME", "VALUENUM", "FLAG", "HADM_ID"
    ]

    dtypes = {
        "VALUENUM": float,
        "FLAG": str,
        "SUBJECT_ID": pd.Int32Dtype(),
        "HADM_ID": pd.Int32Dtype()
    }

    date_cols = ["CHARTTIME"]

    directories.print_log("extracting relevent lab events")
    path = os.path.join(directories.mimic_dir, "LABEVENTS.csv")
    count = 0
    replace_flag = True
    for chunk in pd.read_csv(path,
                             chunksize=CHUNK_SIZE,
                             usecols=columns,
                             dtype=dtypes,
                             parse_dates=date_cols):
        # tag each lab with which hour before the pressor event it falls in
        # (HOURS_BEFORE_PRESSOR); 0 means it falls outside every window
        # drop labs without an admission number: about 25% of labs, from
        # outpatient settings, are hard to attach to an admission
        chunk = chunk[~pd.isna(chunk.HADM_ID)]
        chunk["HOURS_BEFORE_PRESSOR"] = 0
        temp = chunk.join(vaso_episodes[["HADM_ID",
                                         "ADMITTIME"]].set_index("HADM_ID"),
                          on="HADM_ID")
        # convert chart times to whole seconds post-admission
        chunk.CHARTTIME = np.int32(
            (chunk.CHARTTIME - temp.ADMITTIME).values / 10**9)
        count += 1
        if count > NUM_CHUNKS:
            break
        else:
            directories.print_log("\tprocessing LABEVENTS chunk:", count, "of",
                                  math.ceil(27_854_055 / CHUNK_SIZE))
            # loop through all the times in a given hospital admission (HADM_ID), recall we keep only hospital admissions with 1 icustay
            for (time, hadm), time_group in chunk[chunk.HADM_ID.isin(
                    vaso_episodes.HADM_ID)].groupby(["CHARTTIME", "HADM_ID"]):
                # only extract lab data if it falls in the interval i
                splits = interval_splits.loc[hadm]
                i = find_first((splits[1:-1].values < time)
                               & (time <= splits[:-2].values))
                if i:
                    time_group = time_group.copy()
                    time_group.HOURS_BEFORE_PRESSOR = i
                    # if this is the first batch written, replace the table;
                    # otherwise append to it
                    if replace_flag:
                        mode = "replace"
                        replace_flag = False
                    else:
                        mode = "append"
                    with directories.engine.connect() as connection:
                        time_group.to_sql("lab_events",
                                          con=connection,
                                          if_exists=mode,
                                          index=False)
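
`find_first` is not defined anywhere in this listing. A minimal sketch consistent with how it is called here returns the index of the first True entry in a boolean mask, or None when there is none. Note that the `if i:` guard above also skips a match at index 0 (the hour immediately before the pressor); whether that is intended depends on the indexing convention for HOURS_BEFORE_PRESSOR.

import numpy as np

def find_first(mask):
    # hypothetical helper: index of the first True element, or None if none
    indices = np.flatnonzero(mask)
    return int(indices[0]) if len(indices) else None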