def javapredict_gbm_hexdev_692():
    """POJO regression test (HEXDEV-692): GBM on prostate data whose CSV has a null column name."""
    data_path = pyunit_utils.locate("smalldata/logreg/prostate_train_null_column_name.csv")
    train = h2o.upload_file(data_path)
    test = h2o.upload_file(data_path)
    # 651MB pojo
    params = {'ntrees': 100, 'max_depth': 5, 'seed': 42, 'training_frame': train,
              'learn_rate': 0.1, 'min_rows': 10, 'distribution': "bernoulli"}
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    x = list(range(0, train.ncol))
    y = "CAPSULE"
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, **params)
def javapredict_cars():
    """GBM POJO test on the cars dataset; balance_classes is chosen at random."""
    # optional parameters
    params = {'ntrees': 5000, 'max_depth': 10, 'min_rows': 1, 'learn_rate': 0.1,
              'balance_classes': random.sample([True, False], 1)[0]}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    cars_csv = pyunit_utils.locate("smalldata/junit/cars_nice_header.csv")
    train = h2o.import_file(cars_csv)
    test = h2o.import_file(cars_csv)
    x = ["name", "economy", "displacement", "power", "weight", "acceleration", "year"]
    y = "cylinders"
    pyunit_utils.javapredict("gbm", "numeric", train, test, x, y, **params)
def javapredict_dynamic_data():
    """PCA POJO compile-only test on a small randomly generated frame."""
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = 1 - dataset_params['categorical_fraction']
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    # Keep a nonzero real fraction: if integer+categorical fill everything, shave 0.1 off the larger.
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    realNcol = train.ncol - 1  # the response column is not a predictor
    params['k'] = random.sample(list(range(1, min(realNcol, train.nrow))), 1)[0]
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict_dynamic_data():
    """GBM POJO compile-only test on a randomly generated frame with a random distribution.

    Fixes: Python-2 ``print`` statements (syntax errors under Python 3) and the
    response-frame construction — ``H2OFrame`` must receive one-element rows to
    build a single column, matching the sibling GLM/DL tests in this suite.
    """
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))
    append_response = False
    distribution = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        # poisson/tweedie/gamma need a strictly positive response; generate it separately.
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))
    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(list(range(1, 21)), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(list(range(2, 21)), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(list(range(2, 1025)), 1)[0]
    if random.randint(0, 1):
        params['learn_rate'] = random.random()
    params['distribution'] = distribution
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="gbm", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict_dynamic_data():
    """GLM POJO compile-only test on a randomly generated frame with a random family.

    Bug fix: an extra, unconditional ``categorical_fraction -= 0.1`` ran after
    the balancing if/else, which could push the fraction negative and make the
    column-type fractions inconsistent; the sibling dynamic-data tests do not
    have that line, so it is removed here.
    """
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))
    append_response = False
    family = random.sample(['binomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if family == 'binomial':
        dataset_params['response_factors'] = 2
    elif family == 'gaussian':
        dataset_params['response_factors'] = 1
    else:
        # poisson/tweedie/gamma need a strictly positive response; generate it separately.
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))
    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if family == 'binomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "glm_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0, 1):
            params['tweedie_variance_power'] = round(random.random() + 1, 6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))
    x = list(range(1, train.ncol))
    y = "response"
    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict_dynamic_data():
    """GLM POJO compile-only test on a randomly generated frame with a random family.

    Bug fixes: (1) an extra, unconditional ``categorical_fraction -= 0.1`` after
    the balancing if/else could push the fraction negative — removed; (2) the
    response ``H2OFrame`` was built from a flat list, which makes a single row
    rather than a single column — build it from one-element rows, as the
    sibling GLM test does.
    """
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))
    append_response = False
    family = random.sample(['binomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if family == 'binomial':
        dataset_params['response_factors'] = 2
    elif family == 'gaussian':
        dataset_params['response_factors'] = 1
    else:
        # poisson/tweedie/gamma need a strictly positive response; generate it separately.
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))
    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if family == 'binomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "glm_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0, 1):
            params['tweedie_variance_power'] = round(random.random() + 1, 6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))
    x = list(range(1, train.ncol))
    y = "response"
    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict_dynamic_data():
    """DRF POJO compile-only test reproducing one fixed multinomial configuration."""
    dataset_params = {
        'rows': 13183,
        'cols': 13,
        'categorical_fraction': 0.4,
        'integer_fraction': 0.3,
        'missing_fraction': 0.27539154084819495,
        'has_response': True,
        'randomize': True,
        'factors': 819,
    }
    print("Dataset parameters: {0}".format(dataset_params))
    problem = 2
    print(
        "Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}"
        .format(problem))
    # problem is pinned to the int 2, so these string comparisons always fall
    # through to the multinomial branch (kept to mirror the randomized sibling test).
    if problem == 'binomial':
        dataset_params['response_factors'] = 2
    elif problem == 'regression':
        dataset_params['response_factors'] = 1
    else:
        dataset_params['response_factors'] = 16
    train = h2o.create_frame(**dataset_params)
    if problem == 'binomial' or problem == 'multinomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],
                     os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))
    params = {
        'nbins': 5,
        'min_rows': 7,
        'mtries': 4,
        'sample_rate': 0.7867986759373544,
        'seed': 1304644573760597606,
    }
    print("Parameter list: {0}".format(params))
    x = list(range(1, train.ncol))
    y = "response"
    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():
    """Naive Bayes POJO compile-only test on a randomly generated multinomial frame."""
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = 1 - dataset_params['categorical_fraction']
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    # Keep a nonzero real fraction: if integer+categorical fill everything, shave 0.1 off the larger.
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    dataset_params['response_factors'] = random.randint(3, 100)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "nb_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0, 1):
        params['laplace'] = random.uniform(0, 11)
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():
    """DRF POJO compile-only test on a randomly generated frame.

    Bug fix: ``problem`` was the one-element *list* returned by
    ``random.sample`` and was compared against the strings
    'binomial'/'regression', so every comparison was False and the test always
    built a multinomial response. ``problem`` is now the sampled int, compared
    against the codes announced by the print (0:regression, 1:binomial,
    2:multinomial).
    """
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))
    problem = random.sample(list(range(0, 3)), 1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if problem == 1:
        dataset_params['response_factors'] = 2
    elif problem == 0:
        dataset_params['response_factors'] = 1
    else:
        dataset_params['response_factors'] = random.randint(3, 100)
    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2:
        # classification problems need a categorical response
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(list(range(1, 21)), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(list(range(2, 21)), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(list(range(2, 1025)), 1)[0]
    if random.randint(0, 1):
        params['mtries'] = random.sample(list(range(1, dataset_params['cols'] + 1)), 1)[0]
    if random.randint(0, 1):
        params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))
    x = list(range(1, train.ncol))
    y = "response"
    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_smallcat():
    """Deep-learning POJO test on the small iris split (train on two species, score the third)."""
    # optional parameters
    params = {'epochs': 100}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    x = [0, 1, 2, 4]
    y = 3
    pyunit_utils.javapredict("deeplearning", "numeric", train, test, x, y, **params)
def javapredict_2x100000():
    """GLM POJO test on the 2x100000 dataset.

    Fix: Python-2 ``print`` statements and a bare ``range`` (fine in Python 2,
    a syntax error / lazy object in Python 3) converted to Python-3 form,
    matching the sibling version of this test.
    """
    # optional parameters
    params = {"max_iterations": 1, "solver": "L_BFGS"}
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(pyunit_utils.locate("smalldata/jira/2x100000_real.csv.gz"))
    test = train
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("glm", "numeric", train, test, x, y, **params)
def javapredict_2x100000():
    """GLM POJO compile-only test on the 2x100000 dataset."""
    # optional parameters
    params = {"max_iterations": 1, "solver": "L_BFGS"}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.import_file(pyunit_utils.locate("smalldata/jira/2x100000_real.csv.gz"))
    test = train
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict(algo="glm", equality="numeric", train=train, test=test,
                             x=x, y=y, compile_only=True, **params)
def javapredict_smallcat():
    """DRF POJO test on the small iris split (train on two species, score the third)."""
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    x = [0, 1, 2, 4]
    y = 3
    pyunit_utils.javapredict("random_forest", "numeric", train, test, x, y, **params)
def javapredict_iris_drf():
    """DRF POJO classification test on iris.

    Fix: Python-2 ``print`` statements (syntax errors under Python 3)
    converted to the Python-3 ``print()`` function, matching the rest of the
    suite.
    """
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    x = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]
    y = "species"
    pyunit_utils.javapredict("random_forest", "class", train, test, x, y, **params)
def javapredict_cars():
    """GBM POJO regression test on the cars dataset; balance_classes is randomized."""
    # optional parameters
    params = {'ntrees': 5000, 'max_depth': 10, 'min_rows': 1, 'learn_rate': 0.1,
              'balance_classes': random.sample([True, False], 1)[0]}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    data_path = pyunit_utils.locate("smalldata/junit/cars_nice_header.csv")
    train = h2o.import_file(data_path)
    test = h2o.import_file(data_path)
    x = ["name", "economy", "displacement", "power", "weight", "acceleration", "year"]
    y = "cylinders"
    pyunit_utils.javapredict("gbm", "numeric", train, test, x, y, **params)
def javapredict_smallcat():
    """DRF POJO test on the small iris split.

    Fix: Python-2 ``print`` statements (syntax errors under Python 3)
    converted to the Python-3 ``print()`` function, matching the sibling
    version of this test.
    """
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    x = [0, 1, 2, 4]
    y = 3
    pyunit_utils.javapredict("random_forest", "numeric", train, test, x, y, **params)
def javapredict_2x100000():
    """GLM POJO test on the 2x100000 dataset.

    Fix: Python-2 ``print`` statements and a bare ``range`` converted to
    Python-3 form, matching the sibling version of this test.
    """
    # optional parameters
    params = {"max_iterations": 1, "solver": "L_BFGS"}
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/jira/2x100000_real.csv.gz"))
    test = train
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("glm", "numeric", train, test, x, y, **params)
def javapredict_dynamic_data():
    """DRF POJO compile-only test reproducing one fixed multinomial configuration."""
    dataset_params = {
        "rows": 13183,
        "cols": 13,
        "categorical_fraction": 0.4,
        "integer_fraction": 0.3,
        "missing_fraction": 0.27539154084819495,
        "has_response": True,
        "randomize": True,
        "factors": 819,
    }
    print("Dataset parameters: {0}".format(dataset_params))
    problem = 2
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    # problem is pinned to the int 2, so these string comparisons always fall
    # through to the multinomial branch (kept to mirror the randomized sibling test).
    if problem == "binomial":
        dataset_params["response_factors"] = 2
    elif problem == "regression":
        dataset_params["response_factors"] = 1
    else:
        dataset_params["response_factors"] = 16
    train = h2o.create_frame(**dataset_params)
    if problem == "binomial" or problem == "multinomial":
        train["response"] = train["response"].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))
    params = {
        "nbins": 5,
        "min_rows": 7,
        "mtries": 4,
        "sample_rate": 0.7867986759373544,
        "seed": 1304644573760597606,
    }
    print("Parameter list: {0}".format(params))
    x = list(range(1, train.ncol))
    y = "response"
    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_drf_xlarge():
    """DRF POJO test on a large HDFS dataset (~739MB POJO).

    Fix: Python-2 ``print`` statements and bare ``range`` indexing converted
    to Python-3 form, matching the sibling xlarge tests.
    """
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    params = {'ntrees': 20, 'max_depth': 35, 'min_rows': 1}  # 739MB pojo
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("random_forest", "numeric", train, test, x, y, **params)
def javapredict_gbm_xlarge():
    """GBM POJO test on a large HDFS dataset (~651MB POJO)."""
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    # 651MB pojo
    params = {'ntrees': 22, 'max_depth': 37, 'min_rows': 1, 'sample_rate': 0.1}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("gbm", "numeric", train, test, x, y, **params)
def javapredict_dl_xlarge():
    """Deep-learning POJO test on a large HDFS dataset (~436MB POJO)."""
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    # 436MB pojo
    params = {'hidden': [3500, 3500], 'epochs': 0.0001}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("deeplearning", "numeric", train, test, x, y, **params)
def javapredict_smallcat():
    """Deep-learning POJO test on the small iris split (train on two species, score the third)."""
    # optional parameters
    params = {'epochs': 100}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    x = [0, 1, 2, 4]
    y = 3
    pyunit_utils.javapredict("deeplearning", "numeric", train, test, x, y, **params)
def javapredict_dynamic_data():
    """K-means POJO compile-only test on a randomly generated frame.

    Fix: Python-2 ``print`` statements (syntax errors under Python 3)
    converted to the Python-3 ``print()`` function; ``range`` results wrapped
    in ``list`` for ``random.sample``, matching the sibling dynamic-data tests.
    """
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "kmeans_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    params['k'] = random.sample(list(range(1, 10)), 1)[0]
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['standardize'] = random.sample([True, False], 1)[0]
    if random.randint(0, 1):
        params['seed'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['init'] = random.sample(['Random', 'PlusPlus', 'Furthest'], 1)[0]
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="kmeans", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():
    """PCA POJO compile-only test on a small randomly generated frame.

    Bug fix: ``k`` was bounded by ``train.ncol`` even though the response
    column is removed from the predictors, so ``k`` could exceed the number of
    features actually supplied to PCA; bound it by ``train.ncol - 1`` as the
    sibling PCA test does.
    """
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    realNcol = train.ncol - 1  # the response column is not a predictor
    params['k'] = random.sample(list(range(1, min(realNcol, train.nrow))), 1)[0]
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():
    """Naive Bayes POJO compile-only test on a randomly generated multinomial frame."""
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = 1 - dataset_params['categorical_fraction']
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    # Keep a nonzero real fraction: if integer+categorical fill everything, shave 0.1 off the larger.
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    dataset_params['response_factors'] = random.randint(3, 100)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "nb_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0, 1):
        params['laplace'] = random.uniform(0, 11)
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None,
                             x=x, y=y, compile_only=True, **params)
def javapredict_pubdev_4531():
    """PUBDEV-4531: GBM POJO predictions with various CSV separator characters."""
    data_path = pyunit_utils.locate("smalldata/logreg/prostate_train_null_column_name.csv")
    train = h2o.upload_file(data_path)
    test = h2o.upload_file(data_path)
    # 651MB pojo
    params = {'ntrees': 20, 'max_depth': 2, 'seed': 42, 'training_frame': train,
              'learn_rate': 0.1, 'min_rows': 10, 'distribution': "bernoulli"}
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    x = list(range(0, train.ncol))
    y = "CAPSULE"
    # make sure original call run
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, **params)
    # check a separator that is a special character
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, separator="|",
                             setInvNumNA=True, **params)
    # test with escape string // already added
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, separator="\|",
                             setInvNumNA=True, **params)
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, separator="\\|",
                             setInvNumNA=True, **params)
    # check a separator that is not a special character
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, separator="@",
                             setInvNumNA=True, **params)
def javapredict_gbm_xlarge():
    """GBM POJO test on a large HDFS dataset (~651MB POJO)."""
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    # 651MB pojo
    params = {'ntrees': 22, 'max_depth': 37, 'min_rows': 1, 'sample_rate': 0.1}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("gbm", "numeric", train, test, x, y, **params)
def javapredict_dynamic_data():
    """Train a deeplearning model on a randomly generated frame and compile its POJO.

    A distribution is drawn at random; for poisson/gamma the response column is
    generated separately as positive integers and cbind-ed onto the frame. The
    response is mode-imputed before training. compile_only=True: the POJO is
    compiled but no predictions are compared.
    """
    # --- Random dataset specification ---
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(
        left_over - round(random.uniform(0, left_over), 1), 1)
    # Keep categorical + integer fractions strictly below 1 so that some
    # real-valued columns remain.
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    distribution = random.sample(
        ['bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma'], 1)[0]
    if distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        # poisson/gamma need a strictly positive response, generated here.
        dataset_params['has_response'] = False
        response = h2o.H2OFrame(
            [[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()

    # Save the pre-imputation response for post-mortem debugging.
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train["response"],
        os.path.join(results_dir, "dl_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save the full training dataset as well.
    h2o.download_csv(
        train, os.path.join(results_dir, "dl_dynamic_training_dataset.log"))

    # --- Random model parameters ---
    params = {}
    if random.randint(0, 1):
        params['activation'] = random.sample(
            ["Rectifier", "Tanh", "TanhWithDropout", "RectifierWithDropout",
             "MaxoutWithDropout"], 1)[0]
    if random.randint(0, 1):
        params['epochs'] = random.sample(list(range(1, 10)), 1)[0]
    if random.randint(0, 1):
        h = random.randint(10, 21)
        params['hidden'] = [h for x in range(random.randint(2, 3))]
    params['distribution'] = distribution
    params['l1'] = random.random()
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="deeplearning", equality=None, train=train,
                             test=None, x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():
    """Train a GBM on a randomly generated frame and compile its POJO.

    A distribution is drawn at random; for poisson/tweedie/gamma the response
    column is generated separately as positive integers and cbind-ed onto the
    frame. The response is mode-imputed before training. compile_only=True:
    the POJO is compiled but no predictions are compared.

    Fixes relative to the original:
    - Python 2 ``print`` statements converted to the ``print()`` function;
      the old form is a SyntaxError under Python 3, which every other test in
      this file targets.
    - The generated response is built as a rows x 1 frame (list of
      single-element rows) to match the sibling dynamic-data tests, instead of
      a flat list of scalars.
    """
    # --- Random dataset specification ---
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000, 15001), 1)[0]
    dataset_params['cols'] = random.sample(range(10, 21), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(
        left_over - round(random.uniform(0, left_over), 1), 1)
    # Keep categorical + integer fractions strictly below 1 so that some
    # real-valued columns remain.
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    distribution = random.sample(
        ['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        # poisson/tweedie/gamma need a strictly positive response; generate a
        # rows x 1 frame of random positive integers and cbind it below.
        dataset_params['has_response'] = False
        response = h2o.H2OFrame(
            [[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()

    # Save the pre-imputation response for post-mortem debugging.
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train["response"],
        os.path.join(results_dir, "gbm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save the full training dataset as well.
    h2o.download_csv(
        train, os.path.join(results_dir, "gbm_dynamic_training_dataset.log"))

    # --- Random model parameters (each independently included half the time) ---
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(range(1, 21), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(range(1, 11), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(range(1, 11), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(range(2, 21), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(range(2, 1025), 1)[0]
    if random.randint(0, 1):
        params['learn_rate'] = random.random()
    params['distribution'] = distribution
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="gbm", equality=None, train=train,
                             test=None, x=x, y=y, compile_only=True, **params)
def javapredict_pubdev_4531():
    """PUBDEV-4531: run the POJO prediction check with assorted CSV separators.

    A small bernoulli GBM is fit on the prostate data; the java predict check
    is then repeated for a special separator ("|", plain and escaped) and a
    non-special one ("@"), all with setInvNumNA=True.
    """
    csv = pyunit_utils.locate(
        "smalldata/logreg/prostate_train_null_column_name.csv")
    train = h2o.upload_file(csv)
    test = h2o.upload_file(
        pyunit_utils.locate(
            "smalldata/logreg/prostate_train_null_column_name.csv"))
    params = {
        'ntrees': 20,
        'max_depth': 2,
        'seed': 42,
        'training_frame': train,
        'learn_rate': 0.1,
        'min_rows': 10,
        'distribution': "bernoulli",
    }  # 651MB pojo
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()
    print("Parameter list:")
    for k, v in params.items():
        print("{0}, {1}".format(k, v))
    x = list(range(0, train.ncol))
    y = "CAPSULE"

    # Baseline: default separator must keep working.
    pyunit_utils.javapredict("gbm", "class", train, test, x, y, **params)
    # Special-character separator.
    pyunit_utils.javapredict("gbm", "class", train, test, x, y,
                             separator="|", setInvNumNA=True, **params)
    # NOTE(review): "\|" and "\\|" are the same runtime string
    # (backslash + pipe); both spellings are kept from the original.
    pyunit_utils.javapredict("gbm", "class", train, test, x, y,
                             separator="\|", setInvNumNA=True, **params)
    pyunit_utils.javapredict("gbm", "class", train, test, x, y,
                             separator="\\|", setInvNumNA=True, **params)
    # Non-special separator.
    pyunit_utils.javapredict("gbm", "class", train, test, x, y,
                             separator="@", setInvNumNA=True, **params)
def javapredict_dynamic_data():
    """Fit a deeplearning model on a random frame and compile the resulting POJO.

    Draws a distribution at random; poisson/gamma get a separately generated
    positive-integer response cbind-ed onto the frame. The response column is
    mode-imputed in place before training. With compile_only=True only POJO
    compilation is exercised, no prediction comparison.
    """
    # --- Random dataset specification ---
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(
        left_over - round(random.uniform(0, left_over), 1), 1)
    # Ensure categorical + integer fractions stay below 1 so some real-valued
    # columns remain.
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    distribution = random.sample(
        ['bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma'], 1)[0]
    if distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        # poisson/gamma: build a strictly positive response separately.
        dataset_params['has_response'] = False
        response = h2o.H2OFrame(
            [[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()

    # Keep a copy of the response before imputation, for debugging.
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train["response"],
        os.path.join(results_dir, "dl_dynamic_preimputed_response.log"))
    # In-place mode imputation of the response (return value intentionally
    # discarded, as in the original).
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    h2o.download_csv(
        train, os.path.join(results_dir, "dl_dynamic_training_dataset.log"))

    # --- Random model parameters (each optional) ---
    params = {}
    if random.randint(0, 1):
        params['activation'] = random.sample(
            ["Rectifier", "Tanh", "TanhWithDropout", "RectifierWithDropout",
             "MaxoutWithDropout"], 1)[0]
    if random.randint(0, 1):
        params['epochs'] = random.sample(list(range(1, 10)), 1)[0]
    if random.randint(0, 1):
        h = random.randint(10, 21)
        params['hidden'] = [h for x in range(random.randint(2, 3))]
    params['distribution'] = distribution
    params['l1'] = random.random()
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="deeplearning", equality=None, train=train,
                             test=None, x=x, y=y, compile_only=True, **params)