Example #1
def input_fn(is_training, data_dir, batch_size, should_repeat=False):
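    """Build a one-shot iterator over the dataset and return the next (image, label) pair."""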

    dataset = data_generator.Dataset(is_training=is_training, data_dir=data_dir,
                                     batch_size=batch_size, should_repeat=should_repeat)

    iterator = dataset.get_one_shot_iterator()

    image, label = iterator.get_next()

    return image, label
def main(args=sys.argv[1:]):
    train_size = float(args[0])
    seed = int(args[1])
    icu_data_dir = args[2]

    # Read the y data
    outcomes = pd.read_csv(icu_data_dir + "Outcomes-a.txt")
    subject_outcomes = outcomes[["RecordID", "In-hospital_death"]]

    # Create a dictionary of features for each subject
    # Using a dictionary because some of the features don't appear in all subjects...
    value_range = {}  # this is just for printing out ranges of the values
    file_folder = icu_data_dir + "set-a/"
    all_subject_features = {}
    for idx, filename in enumerate(os.listdir(file_folder)[:MAX_PROCESS]):
        df = pd.read_csv("%s%s" % (file_folder, filename))
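        # Convert the "HH:MM" Time strings into a single minutes value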
        df["hour"] = np.array([time.split(":")[0] for time in df.Time.values],
                              dtype=int)
        df["minute"] = np.array(
            [time.split(":")[1] for time in df.Time.values], dtype=int)
        df.Time = df.hour * 60 + df.minute

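        # The first row of each subject file stores the RecordID in its Value column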
        record_id = int(df.loc[0].Value)
        subject_features = {"RecordID": record_id}
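        # Summarize each configured feature by applying its processing functions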
        for feat_name, process_func_list in FEATURES.items():
            if WEIGHTED_MEAN in process_func_list:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value > 0)]
            else:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value >= 0)]

            if sub_df.shape[0] == 0:
                continue
            if feat_name not in value_range:
                value_range[feat_name] = [
                    sub_df.Value.min(), sub_df.Value.max()
                ]
            else:
                value_range[feat_name][0] = min(value_range[feat_name][0],
                                                sub_df.Value.min())
                value_range[feat_name][1] = max(value_range[feat_name][1],
                                                sub_df.Value.max())

            for func in process_func_list:
                value = func(sub_df)
                if not np.isfinite(value):
                    print(value, feat_name, func.__name__)
                    print(sub_df)
                assert np.isfinite(value)
                full_feature_name = "%s:%s" % (feat_name, func.__name__)
                subject_features[full_feature_name] = value

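        # Derived oxygenation feature: ratio of mean PaO2 to mean FiO2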
        fio2_df = df.loc[df.Parameter == "FiO2"]
        pao2_df = df.loc[df.Parameter == "PaO2"]
        if fio2_df.shape[0] and pao2_df.shape[0]:
            fio2_mean = _get_mean(fio2_df)
            pao2_mean = _get_mean(pao2_df)
            if fio2_mean > 0:
                subject_features["O2:_get_ratio"] = pao2_mean / fio2_mean

        all_subject_features[idx] = subject_features

    for k, v in value_range.items():
        print(k, v)

    subjects_x = pd.DataFrame.from_dict(all_subject_features, orient="index")

    ## if a covariate has >= 30% missing data, remove it
    prop_nan = subjects_x.apply(lambda x: np.mean(np.isnan(x)))
    print('Features filtered for proportion of NA values >= 0.3')
    print(prop_nan >= 0.3)
    subjects_x = subjects_x.loc[:, prop_nan < 0.3]

    # Merge the X and Y data
    icu_subjects = subjects_x.merge(subject_outcomes, on="RecordID")
    death_resp = icu_subjects["In-hospital_death"]
    icu_subjects = icu_subjects.drop(columns=["RecordID"])

    # Grab column names
    column_names = list(icu_subjects.columns.values)
    print(column_names)
    icu_subjects = icu_subjects.loc[:, column_names].values

    # Center the x covariates
    centering_term = np.nanmean(icu_subjects, axis=0)
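    # Leave the outcome (last) column uncentered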
    centering_term[-1] = 0
    icu_subjects -= centering_term
    assert np.all(death_resp == icu_subjects[:, -1])

    # randomly split the data
    if train_size < 1:
        mats = train_test_split(icu_subjects,
                                train_size=train_size,
                                test_size=1.0 - train_size,
                                random_state=seed)
        x_train = mats[0][:, :-1]
        y_train = mats[0][:, -1:]
        x_test = mats[1][:, :-1]
        y_test = mats[1][:, -1:]
    else:
        x_train = icu_subjects[:, :-1]
        y_train = icu_subjects[:, -1:]
        x_test = x_train
        y_test = y_train

    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    # Save the data
    icu_data = data_generator.Dataset(x_train=x_train,
                                      y_train=y_train,
                                      x_test=x_test,
                                      y_test=y_test)

    ## save off as a pickle
    icu_processed_file = icu_data_dir + "icu_data_processed.pkl"
    pickle_to_file(icu_data, icu_processed_file)

    icu_column_file = icu_data_dir + "icu_data_column_names.txt"
    with open(icu_column_file, "w") as f:
        for i, col in enumerate(column_names[:-1]):
            f.write("%d, %s\n" % (i, col))

    feature_group_list, vi_group_names, nan_fill_config = _process_feature_groups(
        column_names[:-1])
    print(
        "Copy paste this for creating the variable importance groups argument!"
    )
    print("--var-import-idx %s" % ";".join(feature_group_list))
    icu_vi_name_file = icu_data_dir + "icu_data_var_import_names.csv"
    vi_group_name_df = pd.DataFrame.from_dict(vi_group_names, orient="index")
    vi_group_name_df.to_csv(icu_vi_name_file)

    nan_config_file = icu_data_dir + "nan_fill_config.json"
    with open(nan_config_file, 'w') as f:
        json.dump(nan_fill_config, f)
Example #3
def main(args=sys.argv[1:]):
    train_size = 0.5
    seed = 0

    # Read the y data
    outcomes = pd.read_csv("../data/Outcomes-a.txt")
    subject_outcomes = outcomes[["RecordID", "Length_of_stay", "Survival"]]

    # Create a dictionary of features for each subject
    # Using a dictionary because some of the features don't appear in all subjects...
    value_range = {}  # this is just for printing out ranges of the values
    file_folder = "../data/set-a/"
    all_subject_features = {}
    for idx, filename in enumerate(os.listdir(file_folder)[:MAX_PROCESS]):
        df = pd.read_csv("%s%s" % (file_folder, filename))
        df["hour"] = np.array([time.split(":")[0] for time in df.Time.values],
                              dtype=int)
        df["minute"] = np.array(
            [time.split(":")[1] for time in df.Time.values], dtype=int)
        df.Time = df.hour * 60 + df.minute

        record_id = int(df.loc[0].Value)
        subject_features = {"RecordID": record_id}
        for feat_name, process_func_list in FEATURES.items():
            if WEIGHTED_MEAN in process_func_list:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value > 0)]
            else:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value >= 0)]

            if sub_df.shape[0] == 0:
                continue
            if feat_name not in value_range:
                value_range[feat_name] = [
                    sub_df.Value.min(), sub_df.Value.max()
                ]
            else:
                value_range[feat_name][0] = min(value_range[feat_name][0],
                                                sub_df.Value.min())
                value_range[feat_name][1] = max(value_range[feat_name][1],
                                                sub_df.Value.max())

            for func in process_func_list:
                value = func(sub_df)
                if not np.isfinite(value):
                    print(value, feat_name, func.__name__)
                    print(sub_df)
                assert np.isfinite(value)
                full_feature_name = "%s:%s" % (feat_name, func.__name__)
                subject_features[full_feature_name] = value

        fio2_df = df.loc[df.Parameter == "FiO2"]
        pao2_df = df.loc[df.Parameter == "PaO2"]
        if fio2_df.shape[0] and pao2_df.shape[0]:
            fio2_mean = _get_mean(fio2_df)
            pao2_mean = _get_mean(pao2_df)
            if fio2_mean > 0:
                subject_features["O2:_get_ratio"] = pao2_mean / fio2_mean

        all_subject_features[idx] = subject_features

    for k, v in value_range.items():
        print(k, v)

    subjects_x = pd.DataFrame.from_dict(all_subject_features, orient="index")

    # Merge the X and Y data
    icu_subjects = subjects_x.merge(subject_outcomes, on="RecordID")
    print(icu_subjects["Survival"])
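    # Response: elementwise maximum of length of stay and survival time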
    icu_subjects["resp"] = np.maximum(icu_subjects["Length_of_stay"],
                                      icu_subjects["Survival"])
    icu_subjects = icu_subjects.drop(columns=["RecordID"])
    print(np.mean(icu_subjects["Survival"]))
    print(np.median(icu_subjects["Survival"]))
    print(np.max(icu_subjects["Survival"]))
    print(np.mean(icu_subjects["Length_of_stay"]))
    print(np.median(icu_subjects["Length_of_stay"]))
    print(np.max(icu_subjects["Length_of_stay"]))

    # Grab column names
    column_names = list(icu_subjects.columns.values)
    icu_subjects = icu_subjects.values

    # Center the x covariates
    centering_term = np.nanmean(icu_subjects, axis=0)
    centering_term[-1] = 0
    icu_subjects[:, :-3] -= centering_term[:-3]

    # randomly split the data
    print(column_names)
    mats = train_test_split(icu_subjects,
                            train_size=train_size,
                            test_size=1.0 - train_size,
                            random_state=seed)
    x_train = mats[0][:, :-3]
    y_train = mats[0][:, -1:]
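    # A negative value in the Survival column marks a censored observation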
    y_censored_train = mats[0][:, -2:-1] < 0
    x_test = mats[1][:, :-3]
    y_test = mats[1][:, -1:]
    y_censored_test = mats[1][:, -2:-1] < 0

    # Save the data
    icu_train_data = data_generator.Dataset(x=x_train,
                                            y=y_train,
                                            is_censored=y_censored_train)
    icu_test_data = data_generator.Dataset(x=x_test,
                                           y=y_test,
                                           is_censored=y_censored_test)

    ## save off as a pickle
    icu_processed_file = "../data/icu_data_processed.pkl"
    pickle_to_file({
        "train": icu_train_data,
        "test": icu_test_data
    }, icu_processed_file)

    icu_column_file = "../data/icu_data_column_names.txt"
    with open(icu_column_file, "w") as f:
        for i, col in enumerate(column_names[:-1]):
            f.write("%d, %s\n" % (i, col))
                    help="estimator to fit",
                    default="nn")
args = parser.parse_args()
print("Running " + args.estimator_type + " for VIM measure " + args.measure)
## --------------------------------------------------
## load the data, set up
## --------------------------------------------------
data = uts.pickle_from_file(args.dataset)
p = data.x_train.shape[1]
np.random.seed(args.seed)
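# Randomly assign observations to two outer folds with probabilities 0.25 / 0.75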
folds_outer = np.random.choice(a=np.arange(2),
                               size=data.y_train.shape[0],
                               replace=True,
                               p=np.array([0.25, 0.75]))
data_0 = dg.Dataset(x_train=data.x_train[folds_outer == 0, :],
                    y_train=data.y_train[folds_outer == 0],
                    x_test=None,
                    y_test=None)
data_1 = dg.Dataset(x_train=data.x_train[folds_outer == 1, :],
                    y_train=data.y_train[folds_outer == 1],
                    x_test=None,
                    y_test=None)
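# Complete-case masks: rows with no missing covariate values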
cc_all = (np.sum(np.isnan(data.x_train), axis=1) == 0)
cc_all_test = (np.sum(np.isnan(data.x_test), axis=1) == 0)

if args.measure == 'auc':
    measure_func = mp.auc
    objective_function = 'binary:logistic'
    sl_scorer = log_loss
    mlp_class = MLPClassifier
    pred_type = "classification"
    ensemble_method = StackingClassifier
Example #5
def do_one(n_train,
           n_test,
           p,
           m,
           measure_type,
           binary,
           gamma,
           cor,
           V,
           conditional_mean="nonlinear",
           estimator_type="tree"):
    """
    Run the simulation one time for a given set of parameters

    @param n: sample size
    @param p: dimension
    @param m: number of subsets to sample for SGD
    @param tail: number of SGD samples to use for tail averaging
    @param measure_type: variable importance measure
    @param binary: is the outcome binary?
    @param gamma: the constant multiplied by n for sampling
    @param cor: the correlation (only used if p > 10)
    @param V: folds for cross-fitting
    @param conditional_mean: type of conditional mean (linear or nonlinear)
    @param estimator_type: the type of estimator to fit (tree or linear model)

    @return multiple values, including
        shapley_vals: the shapley values
        shapley_ics: the influence curves for the shapley values
        shap_values: the mean absolute SHAP values
        shapley_dict['num_subsets_sampled']: the number of subsets sampled
        all_mps: all measures of predictiveness
        p_values: p-values
        hyp_tests: hypothesis test decisions
        shapley_dict['beta']: the "beta" matrix, from SGD on ics
    """
    # import standard libraries
    import numpy as np
    from xgboost import XGBRegressor
    from sklearn.linear_model import LinearRegression
    import shap
    from sklearn.model_selection import GridSearchCV
    from warnings import warn

    # import user-defined functions
    import data_generator as dg
    import measures_of_predictiveness as mp
    import utils as uts
    import get_influence_functions as gif
    import compute_ic as ci
    import get_shapley_value as gsv
    import shapley_hyp_test as sht

    # generate data
    if conditional_mean == "nonlinear":
        if binary:
            func_name = "ten_variable_binary_conditional_mean"
        else:
            func_name = "ten_variable_continuous_conditional_mean"
    else:
        func_name = "lm_conditional_mean"

    beta = np.array([1, 0, 1.2, 0, 1.05, 0] + [0] * (p - 6))

    if measure_type == "r_squared":
        measure_func = mp.r_squared
        objective_function = 'reg:linear'
    else:
        measure_func = mp.auc
        objective_function = 'binary:logistic'

    data_gen = dg.DataGenerator(func_name, n_train, n_test, p, binary, beta,
                                cor)
    draw = data_gen.create_data()
    folds_outer = np.random.choice(a=np.arange(2),
                                   size=draw.y_train.shape[0],
                                   replace=True,
                                   p=np.array([0.25, 0.75]))
    draw_0 = dg.Dataset(x_train=draw.x_train[folds_outer == 0, :],
                        y_train=draw.y_train[folds_outer == 0],
                        x_test=None,
                        y_test=None)
    draw_1 = dg.Dataset(x_train=draw.x_train[folds_outer == 1, :],
                        y_train=draw.y_train[folds_outer == 1],
                        x_test=None,
                        y_test=None)
    # set up args for xgboost

    # use the cross-validated selector to get the number of trees
    ntrees_tree = np.array([50, 100, 250, 500, 1000, 1500, 2000, 2500, 3000])
    lambdas_tree = np.array([1e-3, 1e-2, 1e-1, 1, 5, 10])
    param_grid_tree = [{
        'n_estimators': ntrees_tree,
        'reg_alpha': lambdas_tree
    }]
    # estimate full regression
    if estimator_type == "tree":
        cv_tree = GridSearchCV(XGBRegressor(objective=objective_function,
                                            max_depth=1,
                                            verbosity=0,
                                            learning_rate=1e-2,
                                            reg_lambda=0),
                               param_grid=param_grid_tree,
                               cv=5)
        cv_tree.fit(draw.x_train, np.ravel(draw.y_train))
        ensemble_tree = XGBRegressor(
            objective=objective_function,
            max_depth=1,
            verbosity=0,
            reg_lambda=0,
            learning_rate=1e-2,
            n_estimators=cv_tree.best_params_['n_estimators'],
            reg_alpha=cv_tree.best_params_['reg_alpha'])
        ensemble = ensemble_tree
        print("Num. est. in boosted tree: " +
              str(cv_tree.best_params_['n_estimators']))
    else:
        ensemble = LinearRegression(fit_intercept=False)
    # get a list of n subset sizes, Ss, Zs
    max_subset = np.array(list(range(p)))
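    # Weight subset sizes by inverse binomial coefficients, with the empty and
    # full subsets each given weight 1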
    sampling_weights = np.append(
        np.append(1, [uts.choose(p - 2, s - 1)**(-1) for s in range(1, p)]), 1)
    subset_sizes = np.random.choice(np.arange(0, p + 1),
                                    p=sampling_weights / sum(sampling_weights),
                                    size=int(draw.x_train.shape[0] * gamma),
                                    replace=True)
    S_lst_all = [
        np.sort(np.random.choice(np.arange(0, p), subset_size, replace=False))
        for subset_size in list(np.sort(subset_sizes))
    ]
    # only need to continue with the unique subsets S
    Z_lst_all = [np.in1d(max_subset, S).astype(np.float64) for S in S_lst_all]
    Z, z_counts = np.unique(np.array(Z_lst_all), axis=0, return_counts=True)
    Z_lst = list(Z)
    Z_aug_lst = [np.append(1, z) for z in Z_lst]
    S_lst = [max_subset[z.astype(bool).tolist()] for z in Z_lst]
    if estimator_type == "tree":
        cv_tree_small = GridSearchCV(XGBRegressor(objective=objective_function,
                                                  max_depth=1,
                                                  verbosity=0,
                                                  learning_rate=1e-2,
                                                  reg_lambda=0),
                                     param_grid=param_grid_tree,
                                     cv=5)
        all_s_sizes = [len(s) for s in S_lst[1:]]
        s_sizes = np.unique(all_s_sizes)
        all_best_tree_lst = [None] * len(S_lst[1:])
        all_best_lambda_lst = [None] * len(S_lst[1:])
        for i in range(s_sizes.shape[0]):
            indx = all_s_sizes.index(s_sizes[i])
            this_s = S_lst[1:][indx]
            cc_i = (np.sum(np.isnan(draw_1.x_train[:, this_s]), axis=1) == 0)
            these_best_params = cv_tree_small.fit(
                draw_1.x_train[:, this_s][cc_i, :],
                np.ravel(draw_1.y_train[cc_i])).best_params_
            all_indices = [
                index for index, value in enumerate(all_s_sizes)
                if value == s_sizes[i]
            ]
            all_best_tree_lst = [
                these_best_params['n_estimators']
                if x in all_indices else all_best_tree_lst[x]
                for x in range(len(all_best_tree_lst))
            ]
            all_best_lambda_lst = [
                these_best_params['reg_alpha']
                if x in all_indices else all_best_lambda_lst[x]
                for x in range(len(all_best_lambda_lst))
            ]
        ensemble_funcs = [
            XGBRegressor(objective=objective_function,
                         max_depth=1,
                         verbosity=0,
                         reg_lambda=0,
                         reg_alpha=all_best_lambda_lst[i],
                         learning_rate=1e-2,
                         n_estimators=all_best_tree_lst[i])
            for i in range(len(all_best_tree_lst))
        ]
    else:
        ensemble_funcs = [ensemble for i in range(len(S_lst[1:]))]
    # get v, preds, ic for each unique S
    preds_none = np.repeat(np.mean(draw_1.y_train), draw_1.x_train.shape[0])
    v_none = measure_func(draw_1.y_train, preds_none)
    ic_none = ci.compute_ic(draw_1.y_train, preds_none, measure_type)
    # get v, preds, ic for the remaining non-null groups
    v_lst, preds_lst, ic_lst, folds = zip(
        *(mp.cv_predictiveness(draw_1,
                               S_lst[1:][i],
                               measure_func,
                               ensemble_funcs[i],
                               V=V,
                               stratified=binary,
                               na_rm=False) for i in range(len(S_lst[1:]))))
    v_lst_all = [v_none] + list(v_lst)
    ic_lst_all = [ic_none] + list(ic_lst)
    # set up Z, v, W, G, c_n matrices
    Z = np.array(Z_aug_lst)
    # constrain v >= 0
    v = np.maximum(np.array(v_lst_all), 0)
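    # Weight each unique subset by how often it was sampled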
    W = np.diag(z_counts / np.sum(z_counts))
    G = np.vstack(
        (np.append(1,
                   np.zeros(p)), np.ones(p + 1) - np.append(1, np.zeros(p))))
    c_n = np.array([v_none, v_lst_all[len(v_lst)] - v_none])
    # do constrained least squares
    A_W = np.sqrt(W).dot(Z)
    v_W = np.sqrt(W).dot(v)
    kkt_matrix = uts.create_kkt_matrix(A_W, G)
    ls_matrix = np.vstack((2 * A_W.transpose().dot(v_W.reshape(
        (len(v_W), 1))), c_n.reshape((c_n.shape[0], 1))))
    ls_solution = np.linalg.pinv(kkt_matrix).dot(ls_matrix)
    shapley_vals = ls_solution[0:(p + 1), :]

    # get relevant objects
    shapley_ics = gif.shapley_influence_function(Z, z_counts, W, v,
                                                 shapley_vals, G, c_n,
                                                 np.array(ic_lst_all),
                                                 measure_func.__name__)
    # if any shapley values are < 0, make zero and print a warning
    if any(shapley_vals < 0):
        if any(shapley_vals[1:] < 0):
            warn("At least one estimated shapley value is < 0. Setting to 0.")
        shapley_vals = np.maximum(shapley_vals, 0)
    if any(shapley_vals > 1):
        if any(shapley_vals[1:] > 1):
            warn("At least one estimated shapley value is > 1. Setting to 1.")
        shapley_vals = np.minimum(shapley_vals, 1)

    # do hypothesis test
    # get the null predictiveness on a separate split
    preds_none_0 = np.repeat(np.mean(draw_0.y_train), draw_0.x_train.shape[0])
    v_none_0 = measure_func(draw_0.y_train, preds_none_0)
    ic_none_0 = ci.compute_ic(draw_0.y_train, preds_none_0, measure_type)
    sigma_none_0 = np.sqrt(np.mean(
        (ic_none_0)**2)) / np.sqrt(np.sum(draw_0.y_train.shape[0]))
    # get the shapley values + null predictiveness on the first split
    shapley_vals_plus = shapley_vals + shapley_vals[0]
    sigmas_one = [
        np.sqrt(gsv.shapley_se(shapley_ics, i, gamma)**2 + sigma_none_0**2)
        for i in range(1, p + 1)
    ]
    test_statistics, p_values, hyp_tests = sht.shapley_hyp_test(
        shapley_vals_plus[1:],
        v_none_0,
        sigmas_one,
        sigma_none_0,
        level=0.05,
        p=p)

    # get variable importance using SHAP values
    if estimator_type == "tree":
        mod = XGBRegressor(objective=objective_function,
                           learning_rate=1e-2,
                           reg_lambda=0,
                           max_depth=1,
                           n_estimators=cv_tree.best_params_['n_estimators'],
                           reg_alpha=cv_tree.best_params_['reg_alpha'],
                           verbosity=0)
        mod.fit(draw.x_train, draw.y_train)
        explainer = shap.TreeExplainer(mod)
    else:
        mod = LinearRegression(fit_intercept=False)
        mod.fit(draw.x_train, draw.y_train)
        explainer = shap.LinearExplainer((np.ravel(mod.coef_), 0),
                                         draw.x_train,
                                         feature_dependence='correlation',
                                         nsamples=500)

    shap_values = explainer.shap_values(draw.x_test)

    # return the population shapley values and the prediction-level SHAP values
    return (shapley_vals, shapley_ics, shap_values, Z.shape[0], v, p_values,
            hyp_tests)