import numpy as np
import pandas as pd
import progressbar
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import torch
from torch.utils.data import RandomSampler
from plotnine import ggplot, aes, geom_point, theme_minimal, labs

import smooth_rf


def average_depth(random_forest, data):
    # NOTE: the snippet begins mid-function; this def line is inferred from
    # the commented-out call below
    """
    average depth (root-to-leaf path length) of each observation in `data`,
    taken across all trees of `random_forest`
    """
    # d2 = average_depth(smooth_rf_opt, data)
    # np.all(d1 == d2)

    n_trees = len(random_forest.estimators_)
    n_obs = data.shape[0]
    depth = np.zeros(n_obs)
    for t in random_forest.estimators_:
        # decision_path returns a sparse node-indicator matrix; its row sums
        # count the nodes on each sample's root-to-leaf path
        d_path = t.decision_path(data)
        depth = depth + np.array(d_path.sum(axis=1)).ravel()

    return depth / n_trees


# start of analysis

data, y = smooth_rf.generate_data(large_n=650)

data_vis = pd.DataFrame(
    data={"x1": data[:, 0],
          "x2": data[:, 1],
          "y": y},
    columns=["x1", "x2", "y"])

ggout = ggplot(data_vis) +\
    geom_point(aes(x="x1", y="x2", color="factor(y)")) +\
    theme_minimal() +\
    labs(x="X1", y="X2", color="value (minus 100)")
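
# plotnine builds figures lazily; in a script the plot only draws when printed:
print(ggout)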

rf = sklearn.ensemble.RandomForestClassifier(n_estimators=300)
rf_fit = rf.fit(data, y)
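
# A small usage check for average_depth (defined above): one value per
# observation, the mean root-to-leaf path length across the forest's trees.
avg_depth = average_depth(rf_fit, data)  # `avg_depth` is an illustrative name
print("mean forest path length:", avg_depth.mean())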

# Example #2

n_sim = 100
depth_range = np.arange(2, 50, 2)
np.random.seed(100)
verbose = True

n_tree = 10
score_mat = np.zeros((n_sim, depth_range.shape[0]))

if verbose:
    bar = progressbar.ProgressBar()
    sim_iter = bar(np.arange(n_sim))
else:
    sim_iter = np.arange(n_sim)

for s_idx in sim_iter:
    data_all, y_all = smooth_rf.generate_data(large_n=1000)
    # data_all = data_all + 100
    y_all = y_all.ravel()

    data_train, data_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(data_all,
                                                 y_all,
                                                 test_size=.5)

    score_vec = regression_prep(data_train,
                                data_test,
                                y_train,
                                y_test,
                                depth_range=depth_range)
    # assumed continuation (the call above was truncated in the source):
    # store this simulation's per-depth scores
    score_mat[s_idx, :] = score_vec
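
# Assumed follow-up, sketched from score_mat's layout (rows = simulations,
# columns = depth settings): average over simulations to compare depths.
mean_scores = score_mat.mean(axis=0)
best_depth = depth_range[np.argmin(mean_scores)]  # assumes lower score is better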

# Example #3

def test_update_rf():
    """
    test update_rf
    """

    data, y = smooth_rf.generate_data(650)
    X_trained = data
    y_trained = y
    data_test, y_test = smooth_rf.generate_data(10000)

    # classification
    model_type = sklearn.ensemble.RandomForestClassifier

    model = model_type(n_estimators=2)
    model_fit = model.fit(data, y)
    random_forest = model_fit

    max_iter = 10000

    y_all, Gamma, eta, weights_all, t_idx_vec, \
        one_d_dict, two_d_dict, lamb_dim, num_classes \
         = smooth_rf.pytorch_numpy_prep(random_forest,
                              X_trained=X_trained, y_trained=y_trained,
                              distance_style="standard",
                              parents_all=True,
                              verbose=False,
                              train_only=True)

    forest_dataset = smooth_rf.ForestDataset(y_all, Gamma, eta, weights_all,
                                             t_idx_vec, one_d_dict, two_d_dict,
                                             lamb_dim)
    dataloader = smooth_rf.DataLoader(
        dataset=forest_dataset,
        sampler=RandomSampler(forest_dataset, replacement=True)
    )  # doesn't have to go through all trees in 1 iteration

    init = 100
    num_vars = len(one_d_dict) + len(two_d_dict)

    torch_model = smooth_rf.SoftmaxTreeFit(num_vars=num_vars,
                                           lamb_dim=lamb_dim,
                                           init=init)
    criterion = smooth_rf.weighted_l2
    optimizer = torch.optim.Adam(torch_model.parameters())

    smooth_rf_pytorch = smooth_rf.update_rf(random_forest,
                                            pytorch_model=torch_model,
                                            X_trained=X_trained,
                                            y_trained=y_trained,
                                            parents_all=True,
                                            distance_style="standard",
                                            verbose=False)
    y_pred_test_base = random_forest.predict(data_test)
    y_pred_test_base_prob = random_forest.predict_proba(data_test)

    y_pred_test_smooth = smooth_rf_pytorch.predict(data_test)
    y_pred_test_smooth_prob = smooth_rf_pytorch.predict_proba(data_test)

    assert np.all(y_pred_test_base == y_pred_test_smooth), \
        "update of random forest with really weak weights should produce " +\
        "the same predictions as the base rf"
    assert np.any(y_pred_test_base_prob != y_pred_test_smooth_prob), \
        "update of random forest with really weak weights should produce " +\
        "slightly different probabilities"

    # regression
    model_type = sklearn.ensemble.RandomForestRegressor

    model = model_type(n_estimators=2)
    model_fit = model.fit(data, y)
    random_forest = model_fit

    max_iter = 10000

    y_all, Gamma, eta, weights_all, t_idx_vec, \
        one_d_dict, two_d_dict, lamb_dim, num_classes \
         = smooth_rf.pytorch_numpy_prep(random_forest,
                              X_trained=X_trained, y_trained=y_trained,
                              distance_style="standard",
                              parents_all=True,
                              verbose=False,
                              train_only=True)

    forest_dataset = smooth_rf.ForestDataset(y_all, Gamma, eta, weights_all,
                                             t_idx_vec, one_d_dict, two_d_dict,
                                             lamb_dim)
    dataloader = smooth_rf.DataLoader(
        dataset=forest_dataset,
        sampler=RandomSampler(forest_dataset, replacement=True)
    )  # doesn't have to go through all trees in 1 iteration

    init = 100
    num_vars = len(one_d_dict) + len(two_d_dict)

    torch_model = smooth_rf.SoftmaxTreeFit(num_vars=num_vars,
                                           lamb_dim=lamb_dim,
                                           init=init)
    criterion = smooth_rf.weighted_l2
    optimizer = torch.optim.Adam(torch_model.parameters())

    smooth_rf_pytorch = smooth_rf.update_rf(random_forest,
                                            pytorch_model=torch_model,
                                            X_trained=X_trained,
                                            y_trained=y_trained,
                                            parents_all=True,
                                            distance_style="standard",
                                            verbose=False)
    y_pred_test_base = random_forest.predict(data_test)

    y_pred_test_smooth = smooth_rf_pytorch.predict(data_test)

    assert np.allclose(y_pred_test_base, y_pred_test_smooth), \
        "update of random forest with really weak weights should produce " +\
        "predicted values very close to those of the base rf"
    assert np.any(y_pred_test_base != y_pred_test_smooth), \
        "update of random forest with really weak weights should produce " +\
        "slightly different predicted values"

# Example #4

def pull_data(data_set, path="", reg_or_class="reg"):
    """
    create / pull data depending upon data requested (and data type)

    Arguments:
    ----------
    data_set : string
        name of dataset ("microsoft", "moon", "prgeng", or "titanic")
    path : string
        location of data folder (only if needed - reads in data)
    reg_or_class : string
        either "reg" or "class" - determines which random forest model is
        being built

    Returns:
    --------
    X : numpy array
        data's X features
    y : numpy array
        data's y values
    """
    if data_set == "microsoft":
        n_data = 650 * 2
        X, y = smooth_rf.generate_data(large_n=n_data)

        if reg_or_class == "reg":
            y = y + 100

        return X, y
    elif data_set == "moon":
        n_data = 350 * 2
        X, y = sklearn.datasets.make_moons(n_samples=n_data, noise=.3)
        if reg_or_class == "reg":
            y = y + 100

        return X, y

    elif data_set == "prgeng":
        data_all = pd.read_csv(path + "data/prgeng/prgeng.txt", sep=" ")
        y_all = data_all["wageinc"]
        data_all.pop("wageinc")

        X = np.array(data_all)
        y = y_all.ravel()

        if reg_or_class != "reg":
            raise ValueError("must use 'reg' with 'prgeng' dataset")

        return X, y

    elif data_set == "titantic":
        data_train = pd.read_csv(path + "data/titanic/titanic3.csv")

        data_train.pop("cabin")
        data_train.pop("name")
        data_train.pop("ticket")
        data_train.pop("body")
        data_train.pop("boat")
        data_train.pop("home.dest")
        data_train["pclass"] = data_train["pclass"].apply(str)

        # tally of missing values per column (computed but not used below)
        NAs = pd.concat([data_train.isnull().sum()], axis=1)

        # Filling missing Age values with mean
        data_train["age"] = data_train["age"].fillna(data_train["age"].mean())
        # Filling missing Embarked values with most common value
        data_train["embarked"] = data_train["embarked"].fillna(
            data_train["embarked"].mode()[0])

        for col in data_train.dtypes[data_train.dtypes == "object"].index:
            for_dummy = data_train.pop(col)
            data_train = pd.concat(
                [data_train, pd.get_dummies(for_dummy, prefix=col)], axis=1)

        data_train = data_train.dropna()

        y_all = data_train.survived
        data_train.pop("survived")

        data_all = data_train

        X = np.array(data_all)
        y = y_all.ravel()

        if reg_or_class != "class":
            raise ValueError("must use 'class' with 'titanic' dataset")

        return X, y
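
# Usage sketch (illustrative names): the synthetic sets need no path; the
# file-backed ones ("prgeng", "titanic") read from `path + "data/..."`.
X_moon, y_moon = pull_data("moon", reg_or_class="class")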

# Example #5

import sklearn.datasets
import sklearn.metrics

base_error = []
smooth_error = []
smooth_error2 = []

for sim in np.arange(2):  # np.arange(20):
    print("sim", sim)
    # data, y = sklearn.datasets.make_moons(n_samples=350, noise=.3)

    # data_test, y_test = sklearn.datasets.make_moons(10000, noise=.3)

    # model_type = sklearn.ensemble.RandomForestClassifier

    data, y = smooth_rf.generate_data(650)
    y = y + 100

    data_test, y_test = smooth_rf.generate_data(10000)
    y_test = y_test + 100

    model_type = sklearn.ensemble.RandomForestRegressor

    model = model_type(n_estimators=10)
    model_fit = model.fit(data, y)
    random_forest = model_fit

    max_iter = 10000


    # smooth_rf_standard, _, _, loss_all_standard = \
    #     ...  # the smoothing call itself is missing from the source snippet
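
    # sketch of the comparison the error lists above set up (base forest only,
    # since the smoothing call is lost; `y_pred_base` is an illustrative name)
    y_pred_base = random_forest.predict(data_test)
    base_error.append(sklearn.metrics.mean_squared_error(y_test, y_pred_base))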
def pull_data(data_set, path="", reg_or_class="reg"):
    """
    create / pull data depending upon data requested (and data type)

    Arguments:
    ----------
    data_set : string
        name of dataset
    path : string
        location to data folder (only if needed - reads in data)
    reg_or_class : string
        either "reg" or "class" - determines which random forest model is being
        built

    Returns:
    --------
    X : numpy array
        data's X features
    y : numpy array
        data's y values
    """
    if data_set == "microsoft":
        n_data = 650 * 2
        X, y = smooth_rf.generate_data(large_n=n_data)

        if reg_or_class == "reg":
            y = y + 100

        return X, y
    elif data_set == "moon":
        n_data = 350 * 2
        X, y = sklearn.datasets.make_moons(n_samples=n_data, noise=.3)
        if reg_or_class == "reg":
            y = y + 100

        return X, y

    elif data_set == "prgeng":
        data_all = pd.read_csv(path + "data/prgeng/prgeng.txt", sep=" ")
        y_all = data_all["wageinc"]
        data_all.pop("wageinc")

        X = np.array(data_all)
        y = y_all.ravel()

        if reg_or_class != "reg":
            ValueError("must use 'reg' with 'prgeng' dataset")

        return X, y

    elif data_set == "titantic":
        data_train = pd.read_csv(path + "data/titanic/titanic3.csv")

        data_train.pop("cabin")
        data_train.pop("name")
        data_train.pop("ticket")
        data_train.pop("body")
        data_train.pop("boat")
        data_train.pop("home.dest")
        data_train["pclass"] = data_train["pclass"].apply(str)

        NAs = pd.concat([data_train.isnull().sum()], axis=1)

        # Filling missing Age values with mean
        data_train["age"] = data_train["age"].fillna(data_train["age"].mean())
        # Filling missing Embarked values with most common value
        data_train["embarked"] = data_train["embarked"].fillna(
            data_train["embarked"].mode()[0])

        for col in data_train.dtypes[data_train.dtypes == "object"].index:
            for_dummy = data_train.pop(col)
            data_train = pd.concat(
                [data_train, pd.get_dummies(for_dummy, prefix=col)], axis=1)

        data_train = data_train.dropna()

        y_all = data_train.survived
        data_train.pop("survived")

        data_all = data_train

        X = np.array(data_all)
        y = y_all.ravel()

        if reg_or_class != "class":
            ValueError("must use 'class' with 'titanic' dataset")

        return X, y