Example #1
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)

    train_aug_X, train_y, train_patients = read_and_extract_features(args, "train")
    test_aug_X, test_y, test_patients = read_and_extract_features(args, "test")

    print("Imputing missing values ...")
    # Impute missing values using the median of each training feature
    imputer = SimpleImputer(strategy="median")
    imputer.fit(train_aug_X)
    imputed_train_X = imputer.transform(train_aug_X)
    print("train data shape", imputed_train_X.shape)
    imputed_test_X = imputer.transform(test_aug_X)

    # Package the imputed datasets and save them along with the imputer and support settings
    train_data = Dataset(x=imputed_train_X, y=train_y, group_id=train_patients)
    support_sim_settings = SupportSimSettingsComplex.create_from_dataset(train_data.x, args.inflation_factor)
    train_data_dict = {
            "train": train_data,
            "support_sim_settings": support_sim_settings,
            "imputer": imputer}
    pickle_to_file(train_data_dict, args.out_train_data)

    test_data = Dataset(x=imputed_test_X, y=test_y, group_id=test_patients)
    pickle_to_file(test_data, args.out_test_data)
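Note: every example in this collection calls pickle_to_file / pickle_from_file without defining them. Below is a minimal sketch of what such helpers could look like, assuming they are thin wrappers around Python's pickle module (the names come from the calls above; the bodies are an assumption, not the project's actual implementation):

import pickle

def pickle_to_file(obj, file_name):
    # Serialize obj to file_name with the highest available pickle protocol.
    with open(file_name, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def pickle_from_file(file_name):
    # Load and return whatever object was previously pickled to file_name.
    with open(file_name, "rb") as handle:
        return pickle.load(handle)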
Example #2
def main(args=sys.argv[1:]):
    MIMIC_TEST = "experiment_mimic/_output/data/valid_data_%d_%d.csv"

    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    trial_data = TrialData()
    for time_key in range(args.start_year, args.start_year + args.num_years):
        for quarter in range(4):
            path_time = MIMIC_TEST % (time_key, quarter)
            raw_dataset = np.genfromtxt(path_time)
            if len(raw_dataset.shape) == 1:
                raw_dataset = raw_dataset.reshape((1, -1))
                print("VALIDATION DATA ONLY SIZE 1")
            print(raw_dataset.shape)
            if raw_dataset.shape[0] < args.min_batch_size:
                print("SKIPPING THIS BATCH. TOO SMALL", raw_dataset.shape)
                continue

            print("year q", time_key, quarter)
            dataset = Dataset(raw_dataset[:, 1:],
                              raw_dataset[:, 0],
                              num_classes=2)
            trial_data.add_batch(dataset)
    nature = FixedNature(trial_data=trial_data)

    pickle_to_file(nature, args.out_file)
Example #3
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    # Read all data
    data_dict = pickle_from_file(args.data_file)
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = data_dict["train"].subset(split_dict["recalibrate_idxs"])

    # Load model
    fitted_model = load_model(args.fitted_file)
    family = fitted_model.density_parametric_form

    if family == "gaussian":
        coverage_dict = recalibrate_intervals_gaussian(fitted_model,
                                                       recalib_data, args)
    elif family == "bernoulli":
        coverage_dict = recalibrate_intervals_bernoulli(
            fitted_model, recalib_data, args)
    elif "multinomial" in family:
        coverage_dict = recalibrate_intervals_multinomial(
            fitted_model, recalib_data, args)
    else:
        raise ValueError("dunno what is going on")
    print(coverage_dict)

    pickle_to_file(coverage_dict, args.out_file)
Example #4
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    # Read all data
    data_dict = pickle_from_file(args.data_file)
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = data_dict["train"].subset(split_dict["recalibrate_idxs"])

    # Load model
    fitted_model = load_model(args.fitted_file)

    coverage_dict = {}
    for alpha in args.alphas:
        recalibrator = DecisionIntervalRecalibrator(fitted_model, alpha)
        inference_dict = recalibrator.recalibrate(recalib_data)
        print("RECALIB INF DICT", inference_dict["cov_given_accept"])
        est_cov_given_accept = inference_dict["cov_given_accept"]["mean"]
        logging.info("Alpha %f, ideal cov %f, est cov|accept %f", alpha, 1 - alpha, est_cov_given_accept)
        logging.info(get_normal_ci(inference_dict["cov_given_accept"]))
        coverage_dict[alpha] = inference_dict
    pickle_to_file(coverage_dict, args.out_file)
Example #5
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    np.random.seed(args.seed)
    if args.support_setting == "constant":
        support_sim_settings = SupportSimSettingsUniform(
            args.num_p,
            min_func_name="min_x_func_constant",
            max_func_name="max_x_func_constant")
    elif args.support_setting == "changing":
        # This branch is currently disabled; the settings below are unreachable.
        raise ValueError("support_setting 'changing' is not currently supported")
        support_sim_settings = SupportSimSettingsNormal(
            args.num_p,
            std_func_name="std_func_changing",
            mu_func_name="mu_func_changing")
    else:
        raise ValueError("Unrecognized support_setting: %s" % args.support_setting)

    data_gen = DataGenerator(args.density_parametric_form,
                             args.sim_func_name,
                             support_sim_settings,
                             max_y=args.max_y,
                             min_y=args.min_y)
    trial_data = TrialData(data_gen, args.batch_sizes)
    for batch_index in range(args.num_batches):
        trial_data.make_new_batch()

    out_dict = {"meta": trial_data.make_meta_data(), "data": trial_data}
    print(out_dict["meta"])
    pickle_to_file(out_dict, args.out_file)
Example #6
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    scratch_dir = make_scratch_dir(args)

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.data_file)
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])
    # TODO: encode the number of classes more cleanly than appending it to the form name
    if args.density_parametric_form == "multinomial":
        print("num classes", train_split_dataset.num_classes)
        density_parametric_form = "multinomial%d" % train_split_dataset.num_classes
    else:
        density_parametric_form = args.density_parametric_form

    # Setup the parameters we will tune over
    param_grid = [{
        'density_layer_sizes': args.density_layer_sizes,
        'density_parametric_form': [density_parametric_form],
        'density_weight_param': args.density_weight_params,
        'dropout_rate': args.dropout_rate,
        'weight_penalty_type': [args.weight_penalty_type],
        'max_iters': [args.max_iters],
        'num_ensemble': [args.num_ensemble],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'do_distributed': [args.do_distributed],
        'scratch_dir': [scratch_dir],
    }]

    # Fit model
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=EnsembleDensityNN,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Save model
    pickle_to_file({
        "nn_class": EnsembleDensityNN,
        "fitted_params": [nn.model_params for nn in fitted_model.nns],
        "hyperparams": best_hyperparams,
        "cv_results": cv_results,
    }, args.fitted_file)
Example #7
def main(args=sys.argv[1:]):
    args = parse_args(args)

    agg_model_preds_and_targets = AggModelPredsAndTargets()
    for year in range(args.start_year, args.start_year + args.num_years):
        for split_idx in range(args.start_num_year_splits, args.end_num_year_splits):
            prefetch_file = args.path_template % (year, split_idx)
            model_preds_and_targets = pickle_from_file(prefetch_file)
            agg_model_preds_and_targets.append(model_preds_and_targets)

    pickle_to_file(agg_model_preds_and_targets, args.out_file)
Example #8
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.data_file)
    assert data_dict["support_sim_settings"].check_dataset(data_dict["train"])
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])

    # Setup the parameters we will tune over
    param_grid = [{
        'interval_alpha': [args.interval_alpha],
        'decision_layer_sizes': args.decision_layer_sizes,
        'interval_layer_sizes': args.interval_layer_sizes,
        'decision_weight_param': args.decision_weight_params,
        'interval_weight_param': args.interval_weight_params,
        'weight_penalty_type': [args.weight_penalty_type],
        'cost_decline': [args.cost_decline],
        'do_no_harm_param': args.do_no_harm_params,
        'log_barrier_param': args.log_barrier_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'support_sim_settings': [data_dict["support_sim_settings"]],
        'support_sim_num': [args.support_sim_num],
    }]

    # Fit model
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=SimultaneousIntervalDecisionNNs,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Save model
    pickle_to_file(
        {
            "nn_class": SimultaneousIntervalDecisionNNs,
            "fitted_params": fitted_model.model_params,
            "hyperparams": best_hyperparams,
            "cv_results": cv_results,
        }, args.fitted_file)
Example #9
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)

    all_approval_list = [
        pickle_from_file(history_file) for history_file in args.history_files
    ]
    for hist in all_approval_list:
        print(hist)
    all_approval_dict = {x.policy_name: x for x in all_approval_list}
    pickle_to_file(all_approval_dict, args.out_file)
Example #10
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)

    np.random.seed(args.seed)
    # Read data
    data_dict = pickle_from_file(args.in_data_file)
    full_data = data_dict["train"]
    unique_groups = np.unique(full_data.group_id)
    shuffled_order = np.random.permutation(unique_groups)

    if args.recalibrate_num is not None:
        num_recalibrate = args.recalibrate_num
        train_groups = shuffled_order[:-num_recalibrate]
        recalibrate_groups = shuffled_order[-num_recalibrate:]
    else:
        fold_size = int(unique_groups.size / args.k_folds) + 1
        start_idx = args.fold_idx * fold_size
        end_idx = min((args.fold_idx + 1) * fold_size, unique_groups.size)
        print("number in recalibrated groups", end_idx - start_idx)
        train_groups = np.concatenate(
            [shuffled_order[:start_idx], shuffled_order[end_idx:]])
        recalibrate_groups = shuffled_order[start_idx:end_idx]

    train_idxs = np.isin(full_data.group_id, train_groups).flatten()
    assert np.sum(train_idxs) > 1

    # For recalibration, we grab only one random observation per group
    recalibrate_idxs = []
    for recalib_group_id in recalibrate_groups:
        matching_obs_idxs = np.where(full_data.group_id == recalib_group_id)[0]
        random_matching_obs_idx = np.random.choice(matching_obs_idxs)
        recalibrate_idxs.append(random_matching_obs_idx)
    recalibrate_idxs = np.array(recalibrate_idxs)

    assert recalibrate_idxs.size > 1
    # Double check we grabbed a single random obs per group
    assert np.unique(
        full_data.group_id[recalibrate_idxs]).size == recalibrate_idxs.size

    # Write data to file
    print("num train", train_idxs.size)
    pickle_to_file(
        {
            "train_idxs": train_idxs,
            "recalibrate_idxs": recalibrate_idxs,
            "support_sim_settings": data_dict["support_sim_settings"],
        }, args.out_file)
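Note that the split saved above mixes two index types: train_idxs is a boolean mask over all rows, while recalibrate_idxs holds integer row positions (one randomly chosen row per held-out group). Below is a small self-contained illustration of that distinction using made-up group ids; it is an illustration only, not part of the pipeline:

import numpy as np

group_id = np.array([0, 0, 1, 2, 2, 2])   # hypothetical per-row group labels
train_groups = np.array([0, 2])           # groups kept for training
recalibrate_groups = np.array([1])        # groups held out for recalibration

# Boolean mask: True for every row whose group is a training group.
train_idxs = np.isin(group_id, train_groups)
print(train_idxs)        # [ True  True False  True  True  True]

# Integer positions: one randomly chosen row per recalibration group.
recalibrate_idxs = np.array([
    np.random.choice(np.where(group_id == g)[0])
    for g in recalibrate_groups
])
print(recalibrate_idxs)  # [2] (the only row in group 1)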
Example #11
def main(args=sys.argv[1:]):
    args = parse_args(args)

    models = []
    for year in range(args.start_year, args.start_year + args.num_years):
        for quarter in range(4):
            model_file = args.path_template % (year, quarter)
            print("model", model_file)
            assert os.path.exists(model_file)
            # Only the first model is unpickled up front; the rest are stored
            # as file paths (presumably loaded on demand by FixedProposer).
            if len(models) == 0:
                models.append(pickle_from_file(model_file))
            else:
                models.append(model_file)

    proposer = FixedProposer(models)

    print("pickling...")
    pickle_to_file(proposer, args.out_file)
Example #12
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    np.random.seed(args.seed)

    proposer = LassoProposer(
        args.density_parametric_form,
        eps=args.proposer_eps,
        n_alphas=args.proposer_alphas,
        cv=args.proposer_cv,
        num_back_batches=args.proposer_batches,
    )

    pickle_to_file(proposer, args.out_file)
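parse_args itself is never shown in these examples. For a script like the one above it is presumably a small argparse wrapper exposing exactly the attributes the body reads (seed, log_file, density_parametric_form, proposer_eps, proposer_alphas, proposer_cv, proposer_batches, out_file); the option names, defaults, and description below are illustrative assumptions only:

import argparse

def parse_args(args):
    # Hypothetical parser; attribute names mirror the ones used in main() above.
    parser = argparse.ArgumentParser(description="Construct and pickle a LassoProposer")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--log-file", type=str, default="_output/log.txt")
    parser.add_argument("--density-parametric-form", type=str, default="gaussian")
    parser.add_argument("--proposer-eps", type=float, default=1e-3)
    parser.add_argument("--proposer-alphas", type=int, default=100)
    parser.add_argument("--proposer-cv", type=int, default=5)
    parser.add_argument("--proposer-batches", type=int, default=1)
    parser.add_argument("--out-file", type=str, default="_output/proposer.pkl")
    return parser.parse_args(args)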
Example #13
def main(args=sys.argv[1:]):
    args = parse_args(args)

    YEARS = range(args.start_year, args.start_year + args.num_years)
    MONTHS = range(1, 1 + args.num_months)
    model_paths = []
    for year in YEARS:
        for month in MONTHS:
            model_file = args.path_template % (year, month)
            print("model", model_file, os.path.exists(model_file))
            if not os.path.exists(model_file):
                # Fall back to the previous month's path
                # (undefined if the very first model file is missing).
                model_paths.append(prev_model_file)
            else:
                model_paths.append(model_file)
            prev_model_file = model_file
    proposer = FixedProposerFromFile(model_paths,
                                     criterion_str="l1",
                                     max_loss=args.max_loss)

    pickle_to_file(proposer, args.out_file)
Example #14
def main(args=sys.argv[1:]):
    args = parse_args(args)

    nature = pickle_from_file(args.nature_file)

    approval_hist = ApprovalHistory(human_max_loss=1, policy_name="Placeholder")
    model = pickle_from_file(args.model_file)
    proposer = FixedProposer([model])

    # begin simulation
    # introduce the singleton model
    proposer.propose_model(None, None)
    model_pred_targets = ModelPredsAndTargets()
    for t in range(nature.total_time - 1):
        print("prefetcthing time", t)
        sub_trial_data = nature.get_trial_data(t + 1)
        obs_batch_data = sub_trial_data.batch_data[-1]
        batch_preds, batch_target = proposer.get_model_preds_and_target(obs_batch_data)
        model_pred_targets.append(batch_preds, batch_target)
        nature.next(approval_hist)

    pickle_to_file(model_pred_targets, args.out_file)
Example #15
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    times = []
    YEARS = range(args.start_year, args.start_year + args.num_years)
    MONTHS = range(1, 1 + args.num_months)
    for year in YEARS:
        for month in MONTHS:
            times.append((year, month))

    trial_data = TrialDataFromDisk()
    for time_key in times:
        path_time = args.valid_data_template % time_key
        trial_data.add_batch(path_time, args.batch_size)

    nature = FixedNature(trial_data=trial_data)
    pickle_to_file(nature, args.out_file)
Example #16
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)

    np.random.seed(args.seed)

    data_gen = DataGenerator(
        sim_func_form=args.sim_func_form,
        sim_func_name=args.sim_func,
        num_p=args.num_p,
        num_classes=args.num_classes,
        noise_sd=args.sim_noise_sd,
        std_dev_x=args.std_dev_x,
        max_x=args.max_x,
    )
    train_data, support_sim_settings = data_gen.create_data(args.num_train)

    # Write data to file
    pickle_to_file(
        {
            "train": train_data,
            "support_sim_settings": support_sim_settings,
            "data_gen": data_gen
        }, args.out_data_file)
Example #17
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    print(args)
    logging.info(args)

    np.random.seed(args.seed)

    if args.support_setting == "constant":
        support_sim_settings = SupportSimSettingsUniform(
            args.num_p,
            min_func_name="min_x_func_constant",
            max_func_name="max_x_func_constant",
        )
    elif args.support_setting == "changing":
        # This branch is currently disabled; the settings below are unreachable.
        raise ValueError("support_setting 'changing' is not currently supported")
        support_sim_settings = SupportSimSettingsNormal(
            args.num_p,
            std_func_name="std_func_changing",
            mu_func_name="mu_func_changing",
        )
    else:
        raise ValueError("Unrecognized support_setting: %s" % args.support_setting)
    data_gen = DataGenerator(
        args.density_parametric_form,
        args.sim_func_name,
        support_sim_settings,
        max_y=args.max_y,
        min_y=args.min_y,
    )

    proposer = get_proposer(args, data_gen)

    pickle_to_file(proposer, args.out_file)
Example #18
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG
    )
    print(args)
    logging.info(args)
    np.random.seed(args.seed)

    nature = pickle_from_file(args.nature_file)
    logging.info("BATCH SIZES %s", nature.batch_sizes)
    proposer = pickle_from_file(args.proposer_file)

    nature.next(None)
    model = proposer.propose_model(nature.get_trial_data(0), None)
    if args.human_max_loss is None:
        args.human_max_loss = np.mean(
            proposer.score_models(
                nature.create_test_data(time_t=0, num_obs=args.num_test_obs)
            )[0]
        )
        logging.info("HUMAN MAX %f", args.human_max_loss)
    nature.next(None)

    print("POLICY")
    policy = create_policy(
        args.policy_name,
        args,
        human_max_loss=args.human_max_loss,
        drift=args.human_max_loss * args.drift_scale,
        total_time=nature.total_time,
        num_experts=nature.total_time,
        batch_size=np.mean(nature.batch_sizes[1:]),
    )

    st_time = time.time()
    if args.prefetched_file is None:
        sim = Simulation(
            nature,
            proposer,
            policy,
            args.human_max_loss,
            num_test_obs=args.num_test_obs,
            holdout_last_batch=args.holdout_last_batch,
        )
    else:
        prefetched = pickle_from_file(args.prefetched_file)
        sim = SimulationPrefetched(
            nature,
            proposer,
            prefetched,
            policy,
            args.human_max_loss,
            num_test_obs=args.num_test_obs,
            holdout_last_batch=args.holdout_last_batch,
        )
    sim.run(lambda approval_hist: pickle_to_file(approval_hist, args.out_file))
    logging.info(sim.approval_hist)
    print(sim.approval_hist)
    logging.info("run time %d", time.time() - st_time)

    pickle_to_file(sim.approval_hist, args.out_file)
    if args.out_nature_file is not None:
        pickle_to_file(nature.to_fixed(), args.out_nature_file)
Example #19
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    nn_class = EnsembleSimultaneousDensityDecisionNNs

    scratch_dir = make_scratch_dir(args)

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.data_file)
    #assert data_dict["support_sim_settings"].check_dataset(data_dict["train"])
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    print(split_dict["train_idxs"])
    print(data_dict["train"].x.shape)
    print(data_dict["train"].y.shape)
    print(split_dict["train_idxs"].shape)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])

    print(train_split_dataset.y.shape)
    if args.density_parametric_form == "multinomial":
        print("num classes", train_split_dataset.num_classes)
        density_parametric_form = "multinomial%d" % train_split_dataset.num_classes
    else:
        density_parametric_form = args.density_parametric_form

    if args.use_train_data_support:
        support_data = train_split_dataset
        old_support_settings = data_dict["support_sim_settings"]
        data_dict["support_sim_settings"] = SupportSimSettingsEmpirical(
                support_data.x,
                scale=args.support_empirical_scale,
                min_x=old_support_settings.min_x,
                max_x=old_support_settings.max_x)
    elif args.empirical_support_file:
        empirical_support = pickle_from_file(args.empirical_support_file)
        old_support_settings = data_dict["support_sim_settings"]
        data_dict["support_sim_settings"] = SupportSimSettingsEmpirical(
                empirical_support,
                scale=args.support_empirical_scale,
                min_x=old_support_settings.min_x,
                max_x=old_support_settings.max_x)

    # Setup the parameters we will tune over
    param_grid = [{
        'density_layer_sizes': args.density_layer_sizes,
        'decision_layer_sizes': args.decision_layer_sizes,
        'dropout_rate': args.dropout_rate,
        'density_parametric_form': [density_parametric_form],
        'density_weight_param': args.density_weight_params,
        'decision_weight_param': args.decision_weight_params,
        'weight_penalty_type': [args.weight_penalty_type],
        'cost_decline': [args.cost_decline],
        'do_no_harm_param': args.do_no_harm_params,
        'log_barrier_param': args.log_barrier_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'num_ensemble': [args.num_ensemble],
        'do_distributed': [args.do_distributed],
        'scratch_dir': [scratch_dir],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'support_sim_settings': [data_dict["support_sim_settings"]],
        'support_sim_num': [args.support_sim_num],
    }]

    # Fit model
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=nn_class,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Save model
    pickle_to_file({
        "nn_class": nn_class,
        "fitted_params": [m.model_params for m in fitted_model.nns],
        "hyperparams": best_hyperparams,
        "cv_results": cv_results,
    }, args.fitted_file)

    # Sanity check: make sure the fitted model can produce accept probabilities
    # and prediction intervals on a few training rows.
    #pickle_from_file(args.fitted_file)
    fitted_model.get_accept_prob(train_split_dataset.x[:10, :])
    fitted_model.get_prediction_interval(train_split_dataset.x[:10, :])
Example #20
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)
    np.random.seed(0)

    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    orig_image_shape = x_train.shape[1:]
    x_train = x_train.reshape((x_train.shape[0], -1))
    x_test = x_test.reshape((x_test.shape[0], -1))

    num_classes = 10
    num_train_classes = 9

    data_mask = y_train < num_train_classes
    x_train = x_train[data_mask]
    y_train = y_train[data_mask]
    y_train_categorical = np.zeros((y_train.size, num_train_classes))
    y_train_categorical[np.arange(y_train.size), y_train] = 1

    y_test_categorical = np.zeros((y_test.size, num_classes))
    y_test_categorical[np.arange(y_test.size), y_test] = 1

    (_, _), (weird_x, _) = tf.keras.datasets.fashion_mnist.load_data()
    weird_x = weird_x / 255.0
    weird_x = weird_x.reshape((weird_x.shape[0], -1))

    if args.do_pca:
        pca = PCA(n_components=300, whiten=True)
        x_train = pca.fit_transform(x_train)
        print(pca.explained_variance_ratio_)
        x_test = pca.transform(x_test)
        weird_x = pca.transform(weird_x)

    num_p = x_train.shape[1]
    min_x = np.min(np.concatenate([x_train, x_test]), axis=0).reshape((1, -1))
    max_x = np.max(np.concatenate([x_train, x_test]), axis=0).reshape((1, -1))
    support_sim_settings = SupportSimSettingsContinuousMulti(num_p,
                                                             min_x=min_x,
                                                             max_x=max_x)

    train_data = Dataset(x=x_train,
                         y=y_train_categorical,
                         num_classes=num_train_classes)
    train_data_dict = {
        "train": train_data,
        "support_sim_settings": support_sim_settings
    }
    pickle_to_file(train_data_dict, args.out_train_data)

    random_idx = np.random.choice(x_test.shape[0], size=4000, replace=False)
    test_data = Dataset(x=x_test[random_idx, :],
                        y=y_test_categorical[random_idx, :],
                        num_classes=num_classes)
    pickle_to_file(test_data, args.out_test_data)

    check_supp = support_sim_settings.check_obs_x(weird_x)
    print("NUM WEIRD", weird_x.shape)
    print("NUM WEiRD IN SUPPORT", np.sum(check_supp))
    weird_x = weird_x[check_supp, :]
    pickle_to_file(weird_x, args.out_weird_data)
Example #21
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG
    )
    print(args)
    logging.info(args)

    np.random.seed(args.seed)
    if args.support_setting == "constant":
        support_sim_settings = SupportSimSettingsUniform(
            args.num_p,
            min_func_name="min_x_func_constant",
            max_func_name="max_x_func_constant",
        )
    elif args.support_setting == "changing":
        # This branch is currently disabled; the settings below are unreachable.
        raise ValueError("support_setting 'changing' is not currently supported")
        support_sim_settings = SupportSimSettingsNormal(
            args.num_p,
            std_func_name="std_func_changing",
            mu_func_name="mu_func_changing",
        )
    else:
        raise ValueError("Unrecognized support_setting: %s" % args.support_setting)

    data_gen = DataGenerator(
        args.density_parametric_form,
        args.sim_func_name,
        support_sim_settings,
        noise_sd=args.y_sigma,
        max_y=args.max_y,
        min_y=args.min_y,
    )
    trial_data = TrialData(args.batch_sizes)
    init_coef = np.zeros(args.num_p)
    init_coef[: args.num_coefs] = args.coef_scale
    new_coef = init_coef
    all_coefs = []
    last_coef_change = 0
    coef_norm = np.sqrt(np.sum(np.power(init_coef, 2)))
    did_drift = False
    for batch_index in range(args.num_batches):
        do_drift = (
            batch_index % args.drift_cycle == args.drift_cycle - 1
            if args.drift_cycle > 0
            else False
        )
        if do_drift:
            print("DRIFT", do_drift)
            new_coef = np.copy(new_coef)
            to0_rand_idx = np.random.choice(
                np.where(np.abs(new_coef) > 0)[0], size=args.num_coef_drift
            )
            to1_rand_idx = np.random.choice(
                np.where(np.abs(new_coef) <= 1e-10)[0], size=args.num_coef_drift
            )
            new_coef[to0_rand_idx] = 0
            new_coef[to1_rand_idx] = np.max(init_coef)
            last_coef_change = batch_index - 1
            did_drift = True

        elif did_drift and np.random.rand() < args.prob_revert_drift:
            print("REVERT", batch_index)
            # Try reverting to old coefs
            new_coef = all_coefs[last_coef_change]
            last_coef_change = batch_index - 1
            did_drift = False
        else:
            did_drift = False

        new_data = data_gen.create_data(
            args.batch_sizes[batch_index], batch_index, coef=new_coef
        )
        all_coefs.append(new_coef)
        trial_data.add_batch(new_data)
    nature = FixedNature(data_gen, trial_data, coefs=all_coefs)

    pickle_to_file(nature, args.out_file)
Example #22
def main(args=sys.argv[1:]):
    train_size = 0.5
    seed = 0

    # Read the y data
    outcomes = pd.read_csv("../data/Outcomes-a.txt")
    subject_outcomes = outcomes[["RecordID", "Length_of_stay", "Survival"]]

    # Create a dictionary of features for each subject
    # Using a dictionary because some of the features don't appear in all subjects...
    value_range = {}  # this is just for printing out ranges of the values
    file_folder = "../data/set-a/"
    all_subject_features = {}
    for idx, filename in enumerate(os.listdir(file_folder)[:MAX_PROCESS]):
        df = pd.read_csv("%s%s" % (file_folder, filename))
        df["hour"] = np.array([time.split(":")[0] for time in df.Time.values],
                              dtype=int)
        df["minute"] = np.array(
            [time.split(":")[1] for time in df.Time.values], dtype=int)
        df.Time = df.hour * 60 + df.minute

        record_id = int(df.loc[0].Value)
        subject_features = {"RecordID": record_id}
        for feat_name, process_func_list in FEATURES.items():
            if WEIGHTED_MEAN in process_func_list:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value > 0)]
            else:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value >= 0)]

            if sub_df.shape[0] == 0:
                continue
            if feat_name not in value_range:
                value_range[feat_name] = [
                    sub_df.Value.min(), sub_df.Value.max()
                ]
            else:
                value_range[feat_name][0] = min(value_range[feat_name][0],
                                                sub_df.Value.min())
                value_range[feat_name][1] = max(value_range[feat_name][1],
                                                sub_df.Value.max())

            for func in process_func_list:
                value = func(sub_df)
                if not np.isfinite(value):
                    print(value, feat_name, func.__name__)
                    print(sub_df)
                assert np.isfinite(value)
                full_feature_name = "%s:%s" % (feat_name, func.__name__)
                subject_features[full_feature_name] = value

        fio2_df = df.loc[df.Parameter == "FiO2"]
        pao2_df = df.loc[df.Parameter == "PaO2"]
        if fio2_df.shape[0] and pao2_df.shape[0]:
            fio2_mean = _get_mean(fio2_df)
            pao2_mean = _get_mean(pao2_df)
            if fio2_mean > 0:
                subject_features["O2:_get_ratio"] = pao2_mean / fio2_mean

        all_subject_features[idx] = subject_features

    for k, v in value_range.items():
        print(k, v)

    subjects_x = pd.DataFrame.from_dict(all_subject_features, orient="index")

    # Merge the X and Y data
    icu_subjects = subjects_x.merge(subject_outcomes, on="RecordID")
    print(icu_subjects["Survival"])
    icu_subjects["resp"] = np.maximum(icu_subjects["Length_of_stay"],
                                      icu_subjects["Survival"])
    icu_subjects = icu_subjects.drop(columns=["RecordID"])
    print(np.mean(icu_subjects["Survival"]))
    print(np.median(icu_subjects["Survival"]))
    print(np.max(icu_subjects["Survival"]))
    print(np.mean(icu_subjects["Length_of_stay"]))
    print(np.median(icu_subjects["Length_of_stay"]))
    print(np.max(icu_subjects["Length_of_stay"]))

    # Grab column names
    column_names = list(icu_subjects.columns.values)
    icu_subjects = icu_subjects.to_numpy()  # as_matrix() has been removed from pandas

    # Center the x covariates
    centering_term = np.nanmean(icu_subjects, axis=0)
    centering_term[-1] = 0
    icu_subjects[:, :-3] -= centering_term[:-3]

    # randomly split the data
    print(column_names)
    mats = train_test_split(icu_subjects,
                            train_size=train_size,
                            test_size=1.0 - train_size,
                            random_state=seed)
    x_train = mats[0][:, :-3]
    y_train = mats[0][:, -1:]
    y_censored_train = mats[0][:, -2:-1] < 0
    x_test = mats[1][:, :-3]
    y_test = mats[1][:, -1:]
    y_censored_test = mats[1][:, -2:-1] < 0

    # Save the data
    icu_train_data = data_generator.Dataset(x=x_train,
                                            y=y_train,
                                            is_censored=y_censored_train)
    icu_test_data = data_generator.Dataset(x=x_test,
                                           y=y_test,
                                           is_censored=y_censored_test)

    ## save off as a pickle
    icu_processed_file = "../data/icu_data_processed.pkl"
    pickle_to_file({
        "train": icu_train_data,
        "test": icu_test_data
    }, icu_processed_file)

    icu_column_file = "../data/icu_data_column_names.txt"
    with open(icu_column_file, "w") as f:
        for i, col in enumerate(column_names[:-1]):
            f.write("%d, %s\n" % (i, col))
Example #23
def main(args=sys.argv[1:]):
    args = parse_args(args)
    np.random.seed(args.seed)
    logging.basicConfig(format="%(message)s",
                        filename=args.log_file,
                        level=logging.DEBUG)
    logging.info(args)

    data_dict = pickle_from_file(args.data_file)
    test_data, _ = data_dict["data_gen"].create_data(args.num_test)
    fitted_models = []
    agg_dict = {}
    for fitted_file, coverage_file in zip(args.fitted_files,
                                          args.coverage_files):
        fitted_model = load_model(fitted_file)
        fitted_models.append(fitted_model)
        coverage_dict = pickle_from_file(coverage_file)
        for pi_alpha, inference_dict in coverage_dict.items():
            if pi_alpha not in agg_dict:
                agg_dict[pi_alpha] = []
            agg_dict[pi_alpha].append(inference_dict)

    unif_x = data_dict["support_sim_settings"].support_unif_rvs(args.num_test)
    unif_test_data = data_dict["data_gen"].create_data_given_x(unif_x)

    coverage_agg_results = {}
    for pi_alpha, inference_dicts in agg_dict.items():
        aggregator = DecisionIntervalAggregator(fitted_models, pi_alpha,
                                                inference_dicts)
        indiv_test_datas = [
            data_dict["data_gen"].create_data(args.num_test)[0]
            for _ in fitted_models
        ]
        indiv_test_inf_dicts = [
            DecisionIntervalRecalibrator(fitted_model,
                                         pi_alpha).recalibrate(indiv_test_data)
            for fitted_model, indiv_test_data in zip(fitted_models,
                                                     indiv_test_datas)
        ]
        individual_is_covereds = []
        for test_coverage_dict, inf_dict in zip(indiv_test_inf_dicts,
                                                inference_dicts):
            print(inf_dict)
            test_coverage = test_coverage_dict["cov_given_accept"]["mean"]
            test_coverage_ci = get_normal_ci(
                test_coverage_dict["cov_given_accept"], args.ci_alpha)
            individual_ci = get_normal_ci(inf_dict["cov_given_accept"],
                                          args.ci_alpha)
            indiv_covered = individual_ci[0] <= test_coverage <= individual_ci[1]
            logging.info("indiv est %f ci %s",
                         inf_dict["cov_given_accept"]["mean"], individual_ci)
            logging.info("true indiv %f ci %s", test_coverage,
                         test_coverage_ci)
            logging.info("indiv is covered? %s", indiv_covered)
            individual_is_covereds.append(indiv_covered)

        # Calculate the width of the individual CI diams for comparison
        individual_ci_diams = get_individual_ci_diams(inference_dicts,
                                                      args.ci_alpha)

        # Evaluate if the true coverage value is covered
        agg_cov_given_accept_dict = aggregator.calc_agg_cover_given_accept(
            args.ci_alpha)
        true_cov_given_accept_dict = aggregator.eval_cov_given_accept(
            test_data)["cov_given_accept"]
        true_cov_given_accept = true_cov_given_accept_dict["mean"]
        agg_ci = agg_cov_given_accept_dict["ci"]
        is_covered = agg_ci[0] < true_cov_given_accept < agg_ci[1]

        # Evaluate coverage if using independence assumption
        indpt_aggregator = DecisionIntervalIndptAggregator(
            fitted_models, pi_alpha, inference_dicts)
        indpt_agg_cov_given_accept_dict = indpt_aggregator.calc_agg_cover_given_accept(
            args.ci_alpha)
        indpt_ci = indpt_agg_cov_given_accept_dict["ci"]
        indpt_is_covered = indpt_ci[0] < true_cov_given_accept < indpt_ci[1]

        coverage_agg_results[pi_alpha] = {
            "is_covered": {
                "agg": [is_covered],
                "independent": [indpt_is_covered],
                "individual": individual_is_covereds
            },
            "ci_diams": {
                "agg": [agg_ci[1] - agg_ci[0]],
                "independent": [indpt_ci[1] - indpt_ci[0]],
                "individual": individual_ci_diams
            },
            "true_cov": {
                "agg": [true_cov_given_accept],
                "independent": [true_cov_given_accept],
                "individual": [
                    test_inf_dict["cov_given_accept"]["mean"]
                    for test_inf_dict in indiv_test_inf_dicts
                ]
            }
        }

        # Evaluate local coverage
        local_coverages = assess_local_agg_coverage_true(
            aggregator, test_data, data_dict["data_gen"])
        for key, val in local_coverages.items():
            coverage_agg_results[pi_alpha][key] = val

        logging.info("PI alpha %f", pi_alpha)
        logging.info("estimated agg cover given accept %f %s",
                     agg_cov_given_accept_dict["mean"], agg_ci)
        logging.info("indepttt estimated agg cover given accept %f %s",
                     indpt_agg_cov_given_accept_dict["mean"], indpt_ci)
        logging.info("true cov given accept %f, se %f", true_cov_given_accept,
                     true_cov_given_accept_dict["se"])
        logging.info("is  covered? %s", is_covered)
        logging.info("indept is  covered? %s", indpt_is_covered)

    logging.info(coverage_agg_results)
    pickle_to_file(coverage_agg_results, args.out_file)
Example #24
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)

    # Load the previously saved train and test data
    test_data = pickle_from_file(args.in_test_data)
    train_data_dict = pickle_from_file(args.in_train_data)
    train_data = train_data_dict["train"]
    num_meta_feats = 3
    assert train_data.x.shape[1] % 2 == 1
    num_non_meta_feats = int((train_data.x.shape[1] - num_meta_feats) / 2)
    start_missing_idx = num_non_meta_feats + num_meta_feats
    print(start_missing_idx)
    is_missingness_acceptable = np.mean(train_data.x[:, start_missing_idx:],
                                        axis=0) < 0.1

    keep_cols = np.concatenate([
        np.array([True, True, True]), is_missingness_acceptable,
        np.zeros(num_non_meta_feats, dtype=bool)
    ])
    print(keep_cols)
    print(keep_cols.shape)
    train_data.x = train_data.x[:, keep_cols]
    test_data.x = test_data.x[:, keep_cols]

    orig_support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        train_data.x, inflation_factor=0)
    orig_cts_feature_idxs = orig_support_sim_settings.cts_feature_idxs
    orig_discrete_feature_idxs = orig_support_sim_settings.discrete_feature_idxs
    print(orig_cts_feature_idxs[:10])
    print(orig_discrete_feature_idxs[:10])

    if args.holdout_age:
        age = train_data.x[:, 0]
        age_mask = ((age < args.holdout_min_age) +
                    (age > args.holdout_max_age)).astype(bool)
        heldin_train_data = train_data.subset(age_mask)
        # Remove the age column from the continuous feature indices
        orig_cts_feature_idxs = orig_cts_feature_idxs[1:]
    else:
        heldin_train_data = train_data
    print("max train age", np.max(heldin_train_data.x[:, 0]))

    pca = PCA(n_components=args.num_pca, whiten=True)
    print("ORIG SHAPE", heldin_train_data.x.shape)
    heldin_train_data_x_cts = pca.fit_transform(
        heldin_train_data.x[:, orig_cts_feature_idxs])
    print(pca.explained_variance_ratio_)
    print("NUM DIS", orig_discrete_feature_idxs.size)
    test_data_x_cts = pca.transform(test_data.x[:, orig_cts_feature_idxs])

    heldin_train_data.x = np.hstack([
        heldin_train_data.x[:, orig_discrete_feature_idxs],
        heldin_train_data_x_cts
    ])
    if args.holdout_age:
        test_data.x = np.hstack([
            test_data.x[:, 0:1],  # age feature
            test_data.x[:, orig_discrete_feature_idxs],
            test_data_x_cts
        ])
    else:
        test_data.x = np.hstack(
            [test_data.x[:, orig_discrete_feature_idxs], test_data_x_cts])
    print('NEW TEST SHAPE', test_data.x.shape)
    print('NEW TRAIN SHAPE', heldin_train_data.x.shape)

    support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        heldin_train_data.x, inflation_factor=0)
    support_sim_settings._process_feature_ranges()
    print("dataset check",
          support_sim_settings.check_dataset(heldin_train_data))

    train_data_dict["train"] = heldin_train_data
    train_data_dict["support_sim_settings"] = support_sim_settings
    heldin_train_data.num_p = heldin_train_data.x.shape[1]
    pickle_to_file(train_data_dict, args.out_train_data)

    test_data.num_p = test_data.x.shape[1]
    pickle_to_file(test_data, args.out_test_data)

    print("num obs", heldin_train_data.num_obs)
    print("num obs", train_data.num_obs)
    print("FINAL NUM FEATS", heldin_train_data.num_p)
    print("FINAL NUM FEATS", test_data.num_p)