def get_higgs_generator(data_path, train_size=1000000, store="h5", seed=0,
                        part="train", patterns=True, target=True):
    if store == "h5":
        if part == "train":
            fname = os.path.join(data_path, "higgs/HIGGS.train_%s.h5pd" % str(train_size))
        elif part == "test":
            fname = os.path.join(data_path, "higgs/HIGGS.test.h5pd")

        if not os.path.exists(fname):
            print("Store for higgs data does not exist. Generating all stores ...")
            _convert_higgs_data(data_path, train_size)

        if part == "test":
            chunksize = 250000
        else:
            if train_size <= 2000000:
                chunksize = 500000
            else:
                chunksize = 2000000

        return DataGenerator(fname=fname, seed=seed, patterns=patterns,
                             target=target, chunksize=chunksize)

    elif store == "mem":
        X_train, y_train, X_test, y_test = get_higgs_data(
            data_path, train_size=train_size,
            shuffle_train=False, shuffle_test=False)

        data = {}
        if part == "train":
            data['X'] = X_train
            data['y'] = y_train
        else:
            data['X'] = X_test
            data['y'] = y_test

        return DataGenerator(data=data, seed=seed, patterns=patterns,
                             target=target, chunksize=10000000)
def get_covtype_generator(data_path, train_size=100000, store="h5", seed=0,
                          part="train", patterns=True, target=True):
    if store == "h5":
        if part == "train":
            fname = os.path.join(
                data_path, "covtype/covtype-train-1_%s.csv.h5pd" % str(train_size))
        elif part == "test":
            fname = os.path.join(data_path, "covtype/covtype-test-1.csv.h5pd")

        if not os.path.exists(fname):
            print("Store for covtype data does not exist. Generating all stores ...")
            _convert_datasets(data_path, train_size)

        return DataGenerator(fname=fname, seed=seed, patterns=patterns,
                             target=target, chunksize=200000)

    elif store == "mem":
        X_train, y_train, X_test, y_test = get_covtype_data(
            data_path, train_size=train_size,
            shuffle_train=False, shuffle_test=False)

        data = {}
        if part == "train":
            data['X'] = X_train
            data['y'] = y_train
        else:
            data['X'] = X_test
            data['y'] = y_test

        return DataGenerator(data=data, seed=seed, patterns=patterns,
                             target=target, chunksize=200000)
def get_artificial_generator(data_path, size=1000, seed=0, part="train",
                             store="h5", patterns=True, target=True):
    if part == "train":
        fname = os.path.join(data_path, "artificial/train_" + str(size) + ".h5pd")
    elif part == "test":
        fname = os.path.join(data_path, "artificial/test_" + str(size) + ".h5pd")

    # remove any existing store so that the artificial data is regenerated
    try:
        shutil.rmtree(fname)
    except OSError:
        pass

    if not os.path.exists(fname):
        print("Store for artificial data does not exist. Generating all stores ...")
        _convert_datasets(data_path, size=size, seed=seed)

    return DataGenerator(fname=fname, seed=seed, patterns=patterns,
                         target=target, chunksize=200000)
def get_landsat_generator(data_path, train_size=10000000,
                          data_set="LC81950212016133LGN00", version="1_1",
                          seed=0, part="train", store=None, patterns=True,
                          target=True, chunksize=5000000):
    assert version in ["1_1", "3_3", "pan_1_1", "pan_3_3"]

    if part == "train":
        fname = os.path.join(data_path, "landsat",
                             str(data_set) + "_" + version + ".train.h5pd")
    elif part == "test":
        fname = os.path.join(data_path, "landsat",
                             str(data_set) + "_" + version + ".test.h5pd")

    # download the store if it is not available locally
    check_and_download(fname)

    return DataGenerator(fname=fname, seed=seed, patterns=patterns,
                         target=target, chunksize=chunksize)
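# Usage sketch (illustrative, not part of the original module): each factory
# above returns a DataGenerator with the same interface, which the benchmark
# runs below consume either chunk-wise or via get_all(). The "data/" root is
# a placeholder.
#
#   traingen = get_landsat_generator("data/", train_size=1000000, part="train")
#   testgen = get_landsat_generator("data/", part="test")
#   Xtrain, ytrain = traingen.get_all()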
def single_run(dkey, train_size, param, seed, profile=False):
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":
        # TODO: Download files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    else:
        raise Exception("Unknown data set!")

    Xtrain, ytrain = traingen.get_all()
    Xtest, ytest = testgen.get_all()

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    # "randomized" uses extremely randomized trees, "standard" classical RFs
    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
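# Example invocation (a hypothetical driver; the keys are inferred from the
# parameter accesses above, the values are illustrative only):
#
#   param = {
#       'n_estimators': 24,
#       'max_features': "sqrt",
#       'n_jobs': 1,
#       'bootstrap': True,
#       'tree_type': "standard",   # or "randomized" for ExtraTrees
#   }
#   single_run("landsat", train_size=1000000, param=param, seed=0)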
def single_run(dkey, train_size, param, seed, profile=False):
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/subsetwood"

    if dkey == "landsat":
        # TODO: Download files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    # subset size used for building the top trees
    n_subset = 500000

    model = SubsetWoodClassifier(
        n_estimators=param['n_estimators'],
        criterion="gini",
        max_features=param['max_features'],
        min_samples_split=2,
        n_jobs=param['n_jobs'],
        seed=seed,
        bootstrap=param['bootstrap'],
        tree_traversal_mode="dfs",
        tree_type=param['tree_type'],
        min_samples_leaf=1,
        float_type="double",
        max_depth=None,
        verbose=1,
        odir=tmp_dir,
        store=DiskStore())

    # training
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['subset'] = model.get_training_times()['subset']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    ytrain = traingen.get_all_target()
    ytrain = ytrain.astype(numpy.int64)
    ytest = ytest.astype(numpy.int64)
    ypred_test = ypred_test.astype(numpy.int64)
    evaluate(ypred_test, ytest, results, "testing")

    print("Training distribution")
    print(numpy.bincount(ytrain))
    print("Test distribution")
    print(numpy.bincount(ytest))
    print("Predict distribution")
    print(numpy.bincount(ypred_test))

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         "subsetwood_" + str(n_subset), fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":
        # TODO: Download files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)

        # H2O reads CSV, so the HDF5 stores are exported first.
        # TODO: Adapt paths accordingly
        fname_train_csv = "tmp/landsat_train_small_%lu.csv" % train_size
        fname_test_csv = "tmp/landsat_test.csv"
        traingen.to_csv(fname_train_csv, cache=False, remove=True)
        testgen.to_csv(fname_test_csv, cache=False, remove=True)

    import h2o
    from skutil.h2o import h2o_col_to_numpy

    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    if dkey == "landsat_small" or dkey == "landsat":
        train_df = h2o.import_file(fname_train_csv)
        test_df = h2o.import_file(fname_test_csv)
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    # map the scikit-learn style parameters to their H2O counterparts
    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))

    if not param['bootstrap']:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        # nbins=1000,  # crashes
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values,
             results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
def single_run(dkey, train_size, param, seed, profile=False):
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":
        # TODO: Download files manually if needed (255 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=10000000)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    # wrapped base instance used for the bottom trees
    wood = WoodClassifier(
        n_estimators=1,
        criterion="gini",
        max_features=param_wood['max_features'],
        min_samples_split=2,
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        bootstrap=param_wood['bootstrap'],
        tree_traversal_mode="dfs",
        tree_type=param_wood['tree_type'],
        min_samples_leaf=1,
        float_type="double",
        max_depth=None,
        verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         "hugewood", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
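# Example invocation (hypothetical; unlike the runs above, this variant
# expects the wrapped wood parameters in a nested 'param_wood' dict, plus the
# HugeWood-specific 'n_estimators' and 'n_estimators_bottom' at the top
# level; all values are illustrative only):
#
#   param_wood = {'n_estimators': 1, 'max_features': "sqrt", 'n_jobs': 1,
#                 'bootstrap': True, 'tree_type': "standard"}
#   param = {'n_estimators': 4, 'n_estimators_bottom': 6,
#            'param_wood': param_wood}
#   single_run("landsat", train_size=100000000, param=param, seed=0)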