Example No. 1
import time

import numpy as np

from woody import WoodClassifier
from woody.data import *

seed = 0
Xtrain, ytrain, Xtest, ytest = covtype(train_size=4000000, seed=seed)
covtype_size = [
    100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000,
    1000000
]

new_arr = np.repeat(Xtrain, 3, axis=0)  # triple the dataset size by repeating each training pattern

print(len(new_arr))
total_time = []
nr_tree = 32  # change this to run the script for a different number of trees

file_name = './models/model_' + str(nr_tree) + 'tree_4jobs.data'
model = WoodClassifier.load(file_name)
nr_classes = len(np.unique(ytrain)) + 1
model.compile_store_v2(new_arr, nr_classes, 10)

print("Number of estimators: \t\t%i" % model.n_estimators)
forest_time = []

for i in range(len(covtype_size)):
    times = np.zeros(8, np.float32)
    X_temp = new_arr[:covtype_size[i]]
    print("Number of training patterns:\t%i" % X_temp.shape[0])

    start_time = time.time()
    # run prediction three times on the same subset (repeated runs for timing)
    cpu_test = model.predict(X_temp)
    cpu_test = model.predict(X_temp)
    cpu_test = model.predict(X_temp)
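    # assumed continuation (the excerpt ends here): average the three runs and
    # record the prediction time for this subset size
    elapsed = (time.time() - start_time) / 3.0
    forest_time.append(elapsed)
    print("Prediction time (%i patterns):\t%f" % (X_temp.shape[0], elapsed))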
Example No. 2
def single_run(dkey, train_size, n_bottom, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem",
                                               seed=seed)
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem",
                                             seed=seed)
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem",
                                            seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=int(24 / n_bottom),
        n_estimators_bottom=int(n_bottom),
        n_top="auto",
        n_patterns_leaf=75000,
        balanced_top_tree=True,
        top_tree_lambda=1.0,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        store=MemoryStore(),
    )

    # training
    if profile == True:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         str(n_bottom), "hugewood_75K", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()

    time.sleep(1)
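A hypothetical call to this variant of single_run might look as follows; the dictionary keys mirror what the function reads, while the concrete values are illustrative only and not taken from the original script:

param = {
    'param_wood': {
        'n_estimators': 24,   # only used when building the output file name
        'max_features': None,
        'n_jobs': 4,
        'bootstrap': True,
        'tree_type': 'standard',
    }
}
# illustrative values, not from the original experiments
single_run("covtype", train_size=500000, n_bottom=4, param=param, seed=0)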
Example No. 3
import numpy as np

from sklearn.metrics import accuracy_score

from woody import WoodClassifier
from woody.data import *

seed = 0

#Xtrain, ytrain, Xtest, ytest = covtype(train_size=400000, seed=seed)
Xtrain, ytrain, Xtest, ytest = susy(train_size=4000000, seed=seed)
if Xtrain.dtype != np.float32:
    Xtrain = Xtrain.astype(np.float32)
    ytrain = ytrain.astype(np.float32)
    Xtest = Xtest.astype(np.float32)
    ytest = ytest.astype(np.float32)

model = WoodClassifier.load('./model_susy8tree.data')
nr_classes = len(np.unique(ytrain)) + 1  #not sure if accurate
model.compile_and_Store(Xtrain, nr_classes)

cpu_train = model.predict(Xtrain)
cpu_test = model.predict(Xtest)
#print(cpu_train)

assert np.allclose(
    cpu_train,
    model.cuda_predict(Xtrain)), "cuda_predict failed for train set"
assert np.allclose(cpu_train, model.cuda_pred_tree_mult(
    Xtrain, 10)), "cuda_pred_tree_mult failed for train set"
assert np.allclose(cpu_train, model.cuda_pred_forest(
    Xtrain)), "cuda_pred_forest failed for train set"
assert np.allclose(cpu_train, model.cuda_pred_forest_mult(
Example No. 4
def single_run(dkey, train_size, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size,
                                               seed=seed)
    elif dkey == "higgs":
        Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed)
    elif dkey == "susy":
        Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    model = WoodClassifier(n_estimators=param['n_estimators'],
                           criterion="gini",
                           max_features=param['max_features'],
                           min_samples_split=2,
                           n_jobs=param['n_jobs'],
                           seed=seed,
                           bootstrap=param['bootstrap'],
                           tree_traversal_mode="dfs",
                           tree_type=param['tree_type'],
                           min_samples_leaf=1,
                           float_type="double",
                           max_depth=None,
                           verbose=0)

    if profile == True:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "wood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
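Similarly, this flat (single-level) forest variant could be invoked as sketched below; the parameter values are placeholders, not taken from the original experiments:

param = {
    'n_estimators': 24,
    'max_features': None,
    'n_jobs': 4,
    'bootstrap': True,
    'tree_type': 'standard',
}
# illustrative values only
single_run("susy", train_size=1000000, param=param, seed=0)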
Example No. 5
    Xtrain = Xtrain.astype(np.float32)
    ytrain = ytrain.astype(np.float32)
    Xtest = Xtest.astype(np.float32)
    ytest = ytest.astype(np.float32)

print("")
print("Number of training patterns:\t%i" % Xtrain.shape[0])
print("Number of test patterns:\t%i" % Xtest.shape[0])
print("Dimensionality of the data:\t%i" % Xtrain.shape[1])
model = WoodClassifier(n_estimators=nr_tree,
                       criterion="gini",
                       max_features=None,
                       min_samples_split=2,
                       n_jobs=4,
                       seed=seed,
                       bootstrap=True,
                       tree_traversal_mode="dfs",
                       tree_type="standard",
                       min_samples_leaf=1,
                       float_type="double",
                       max_depth=None,
                       verbose=1)

fit_start_time = time.time()
model.fit(Xtrain, ytrain)
fit_end_time = time.time()

# pick the model file name depending on the data set flag f_type
if f_type == 0:
    file_name = "./models/model_susy" + str(nr_tree) + "tree_4jobs.data"
elif f_type == 1:
    file_name = "./models/model_" + str(nr_tree) + "tree_4jobs.data"
Example No. 6
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":

        # TODO: Download file manually if needed (255GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=10000000)

    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=params.seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training
    if profile == True:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']

    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
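A hypothetical invocation for the landsat experiment; the parameter dictionary follows the keys read above, with illustrative values that are not from the original setup:

param = {
    'n_estimators': 24,
    'n_estimators_bottom': 4,
    'param_wood': {
        'max_features': None,
        'n_jobs': 4,
        'bootstrap': True,
        'tree_type': 'standard',
    },
}
# illustrative values only
single_run("landsat", train_size=10000000, param=param, seed=0)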
Example No. 7
        Xtrain = Xtrain.astype(np.float32) 
        ytrain = ytrain.astype(np.float32) 
        Xtest = Xtest.astype(np.float32) 
        ytest = ytest.astype(np.float32) 

print("")
print("Number of training patterns:\t%i" % Xtrain.shape[0])
print("Number of test patterns:\t%i" % Xtest.shape[0])
print("Dimensionality of the data:\t%i" % Xtrain.shape[1])
model = WoodClassifier(
            n_estimators=2,
            criterion="gini",
            max_features=None,
            min_samples_split=2,
            n_jobs=1,
            seed=seed,
            bootstrap=True,
            tree_traversal_mode="dfs",
            tree_type="standard",
            min_samples_leaf=1,
            float_type="double",
            max_depth=None,
            verbose=1)

fit_start_time = time.time()
#model.fit(Xtrain, ytrain)
fit_end_time = time.time()
#model.save("./model_susy2tree.data")
model = WoodClassifier.load('./model_8tree.data')
print("Number of estimators: \t\t%i\n" % model.n_estimators)

nr_classes = len(np.unique(ytrain)) + 1  # not sure if accurate
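The excerpt ends here; following the pattern of Example No. 3, the loaded forest would presumably be compiled and its CPU predictions compared against the CUDA path (assumed continuation):

model.compile_and_Store(Xtrain, nr_classes)  # assumed, as in Example No. 3
cpu_train = model.predict(Xtrain)
assert np.allclose(cpu_train, model.cuda_predict(Xtrain)), \
    "cuda_predict failed for train set"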