def glrm_iris():
    """Compare PCA run with pca_method="GLRM" against a standalone GLRM model.

    Both models are trained on iris.csv with k=5, the STANDARDIZE transform
    and seed=21. Their "importance" and "eigenvectors" output tables are
    asserted equal within 1e-6, and one cell of the PCA importance table is
    asserted to be at most 1.
    """
    print("Importing iris.csv data...")
    iris_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_frame.describe()

    # NOTE(review): the banner says GramSVD, but the model below uses pca_method="GLRM".
    print("@@@@@@ Building PCA with GramSVD...\n")
    pca_model = H2OPCA(k=5, transform="STANDARDIZE", pca_method="GLRM",
                       use_all_factor_levels=True, seed=21)
    pca_model.train(x=iris_frame.names, training_frame=iris_frame)

    glrm_model = H2OGeneralizedLowRankEstimator(k=5, loss="Quadratic", transform="STANDARDIZE",
                                                recover_svd=True, seed=21)
    glrm_model.train(x=iris_frame.names, training_frame=iris_frame)

    pca_out = pca_model._model_json["output"]
    glrm_out = glrm_model._model_json["output"]

    # Singular values / variance metrics.
    # NOTE(review): "Cumulative Proportion" is listed twice in the header list - confirm
    # whether "Proportion of Variance" was intended.
    print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_out["importance"],
        glrm_out["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6)

    # Singular vectors; check_sign=True is forwarded to the comparison helper.
    print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_out["eigenvectors"],
        glrm_out["eigenvectors"],
        glrm_out["names"],
        tolerance=1e-6,
        check_sign=True)

    # Sanity check on the importance table: the inspected cell must not exceed 1.
    checked_cell = pca_out["importance"].cell_values[1][1]
    assert checked_cell <= 1, \
        "Expected value <= 1.0 but received {0}".format(checked_cell)
def init_for_pipeline(self):
    """
    Create an H2OPCA transformer (which offers fit/transform for use in
    sklearn.Pipeline) configured from this estimator's parameters.

    Only entries of ``self._parms`` whose key appears in the variable names of
    ``H2OPCA.__init__`` are forwarded to the H2OPCA constructor.

    :returns: H2OPCA object

    :examples:

    >>> from sklearn.pipeline import Pipeline
    >>> from h2o.transforms.preprocessing import H2OScaler
    >>> from h2o.estimators import H2ORandomForestEstimator
    >>> iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
    >>> pipe = Pipeline([("standardize", H2OScaler()),
    ...                  ("pca", H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
    ...                  ("rf", H2ORandomForestEstimator(seed=42,ntrees=5))])
    >>> pipe.fit(iris[:4], iris[4])
    """
    from h2o.transforms.decomposition import H2OPCA

    # co_varnames holds the argument names (and any locals) of H2OPCA.__init__.
    accepted_names = list(H2OPCA.__init__.__code__.co_varnames)

    forwarded = {}
    for name, value in self._parms.items():
        if name in accepted_names:
            forwarded[name] = value
    return H2OPCA(**forwarded)
def glrm_arrests():
    """Check that PCA and GLRM report matching variance metrics on USArrests.

    A k=4 STANDARDIZE PCA and a k=4 quadratic-loss GLRM (no regularisation
    weight, SVD init, recover_svd=True) are trained, and the raw cell values
    of their "importance" tables are compared within 1e-4.
    """
    print("Importing USArrests.csv data...")
    arrests_frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    pca_model.train(x=list(range(4)), training_frame=arrests_frame)
    pca_model.summary()
    pca_model.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_model = H2OGeneralizedLowRankEstimator(
        k=4,
        transform="STANDARDIZE",
        loss="Quadratic",
        gamma_x=0,
        gamma_y=0,
        init="SVD",
        recover_svd=True)
    glrm_model.train(x=arrests_frame.names, training_frame=arrests_frame)
    glrm_model.show()

    # The importance tables of both models must hold the same values.
    pca_cells = pca_model._model_json["output"]["importance"]._cell_values
    glrm_cells = glrm_model._model_json["output"]["importance"]._cell_values
    tables_match = pyunit_utils.equal_2D_tables(pca_cells, glrm_cells, tolerance=1e-4)
    assert tables_match, "PCA and GLRM variance metrics do not agree.  Fix it please."

    sys.stdout.flush()
def pca_wideDataset_rotterdam():
    """Compare default PCA with pca_method="Power" on the Rotterdam dataset.

    A transform type is drawn at random, both models are trained with k=8,
    impute_missing=True and seed=12345 on every column except "relapse", and
    their importance and eigenvector tables are compared within 1e-6.
    """
    h2o.remove_all()

    print("Importing Rotterdam.csv data...")
    rotterdam_frame = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))

    # Predictors are all columns but the response; set difference gives no fixed order.
    predictors = list(set(rotterdam_frame.names) - {"relapse"})

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    gram_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform, seed=12345)
    gram_model.train(x=predictors, training_frame=rotterdam_frame)

    power_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform,
                         pca_method="Power", seed=12345)
    power_model.train(x=predictors, training_frame=rotterdam_frame)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # Singular values and variance metrics.
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["importance"],
        power_out["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6,
        check_all=False)

    # Singular vectors.
    print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["eigenvectors"],
        power_out["eigenvectors"],
        power_out["names"],
        tolerance=1e-6,
        check_sign=True,
        check_all=False)
def screeplot_test():
    """Train a k=4 STANDARDIZE PCA on AustraliaCoast and draw both scree plot types.

    ``server=True`` is passed to each ``screeplot`` call.
    """
    plot_options = {'server': True}

    coast_frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv"))
    coast_pca = H2OPCA(k=4, transform="STANDARDIZE")
    coast_pca.train(x=list(range(8)), training_frame=coast_frame)

    for plot_type in ("barplot", "lines"):
        coast_pca.screeplot(type=plot_type, **plot_options)
def pca_pubdev_4167_OOM():
    """
    Verify that PCA works on a customer dataset (PUBDEV-4167).

    Intended for the customer to verify PCA operations rather than as a regular
    test, because the customer data is not to be exposed. A default-method PCA
    and a Power-method PCA are trained on all columns with a randomly chosen
    transform, and their output tables are compared.
    """
    h2o.remove_all()

    # Draw one transform at random so that all of them get exercised over time.
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    # Nidhi: import may not work
    data_path = pyunit_utils.locate("/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")
    training_data = h2o.import_file(path=data_path)

    gram_model = H2OPCA(k=training_data.ncols, transform=chosen_transform)
    gram_model.train(x=list(range(0, training_data.ncols)), training_frame=training_data)

    power_model = H2OPCA(k=training_data.ncols, transform=chosen_transform, pca_method="Power")
    power_model.train(x=list(range(0, training_data.ncols)), training_frame=training_data)

    gram_out = gram_model._model_json["output"]
    power_out = power_model._model_json["output"]

    # Singular values and variance metrics of the two methods.
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["importance"],
        power_out["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-5,
        check_all=False)

    # Singular vectors; note the much looser tolerance used here.
    print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_out["eigenvectors"],
        power_out["eigenvectors"],
        power_out["names"],
        tolerance=1e-1,
        check_sign=True)
def pca_arrests():
    """Train PCA models with k = 1..4 on the first four columns of USArrests.

    The test passes when every model trains without raising; no output values
    are asserted.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    for k in range(1, 5):
        # The banner reports k, the number of components actually requested.
        print("H2O PCA with " + str(k) + " dimensions:\n")
        print("Using these columns: {0}".format(arrestsH2O.names))
        pca_h2o = H2OPCA(k=k)
        pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
def _get_pca_model(predictor_col, response_col, train_f, val_f):
    """Train a 10-component Power-method PCA and log one variance figure.

    :param predictor_col: columns passed as ``x`` to ``H2OPCA.train``.
    :param response_col: accepted for interface compatibility; not used here.
    :param train_f: training frame.
    :param val_f: accepted for interface compatibility; not used here.
    :returns: the trained H2OPCA model.
    """
    from h2o.transforms.decomposition import H2OPCA

    k = 10
    pca_decomp = H2OPCA(k=k, transform="NONE", pca_method="Power", impute_missing=True)
    # Train on the columns handed in by the caller (the parameter, not a global).
    pca_decomp.train(x=predictor_col, training_frame=train_f)
    pca_decomp.summary()

    # Explained Variance: entry k-1 of row 2 of the varimp() result.
    # NOTE(review): the event is labelled 'Training Accuracy' although the value
    # is a variance metric - confirm the label with the log consumers.
    logr.log_event('Training Accuracy', f'{pca_decomp.varimp()[2][k-1]}')
    return pca_decomp
def init_for_pipeline(self):
    """
    Create an H2OPCA transformer (which offers fit/transform for use in
    sklearn.Pipeline) configured from this estimator's parameters.

    Only entries of ``self._parms`` whose key appears in the variable names of
    ``H2OPCA.__init__`` are forwarded to the H2OPCA constructor.

    :returns: H2OPCA object
    """
    from h2o.transforms.decomposition import H2OPCA

    # Variable names (arguments and locals) known to H2OPCA.__init__.
    init_names = list(H2OPCA.__init__.__code__.co_varnames)

    usable = {}
    for key, val in self._parms.items():
        if key not in init_names:
            continue
        usable[key] = val
    return H2OPCA(**usable)
def scale_pca_rf_pipe():
    """Fit an sklearn Pipeline of H2O scaler -> H2O PCA (k=2) -> H2O random forest on iris.

    Columns 0-3 are used as features and column 4 as the target.
    """
    from sklearn.pipeline import Pipeline
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from h2o.transforms.decomposition import H2OPCA
    from h2o.transforms.preprocessing import H2OScaler

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Transformation pipeline built from sklearn's Pipeline and H2O transforms.
    steps = [
        ("standardize", H2OScaler()),
        ("pca", H2OPCA(k=2)),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=50)),
    ]
    pipe = Pipeline(steps)
    pipe.fit(iris[:4], iris[4])
def pca_car():
    """Train a full-rank STANDARDIZE PCA on car.arff.txt ten times and bound its run time.

    The run time of each model is taken from the model output's ``end_time``
    and ``start_time``; the largest of the ten must stay below 1000.
    """
    num_runs = 10
    run_time_c = []

    # Nidhi: import may not work
    car = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/car.arff.txt"))

    # Several runs give a better idea of the run time spread.
    for _ in range(num_runs):
        carPCA = H2OPCA(k=car.ncols, transform="STANDARDIZE")
        carPCA.train(x=list(range(0, car.ncols)), training_frame=car)

        model_output = carPCA._model_json['output']
        elapsed = model_output['end_time'] - model_output['start_time']
        run_time_c.append(elapsed)
        print("PCA model training time with car.arff.txt data in ms is {0}".format(elapsed))

        h2o.remove(carPCA)

    slowest = max(run_time_c)
    assert slowest < 1000, "PCA runs for car.arff.txt take too much time!"
def pca_pubdev_4314():
    """Check the return types of ``varimp`` on a GramSVD PCA model (PUBDEV-4314).

    ``varimp(use_pandas=True)`` must yield a DataFrame and ``varimp()`` a list.
    """
    print("Importing prostate_cat.csv data...\n")
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    prostate.describe()

    print("PCA with k = 3, retx = FALSE, transform = 'STANDARDIZE'")
    # NOTE(review): the mixed-case "StANDARDIZE" is kept as-is - presumably it exercises
    # case-insensitive parameter handling; confirm before normalising it.
    fitPCA = H2OPCA(k=3, transform="StANDARDIZE", pca_method="GramSVD")
    fitPCA.train(x=list(range(0, 8)), training_frame=prostate)
    print(fitPCA.summary())

    as_pandas = fitPCA.varimp(use_pandas=True)
    assert_is_type(as_pandas, DataFrame)

    as_list = fitPCA.varimp()
    print(as_list)
    assert_is_type(as_list, list)

    sys.stdout.flush()
def pca_arrests():
    """Train PCA models with k = 1..4 on the first four columns of USArrests.

    Uses ``H2OPCA`` from ``h2o.transforms.decomposition``. The test passes when
    every model trains without raising; no output values are asserted.
    """
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    from h2o.transforms.decomposition import H2OPCA

    for k in range(1, 5):
        # The banner reports k, the number of components actually requested.
        print("H2O PCA with " + str(k) + " dimensions:\n")
        print("Using these columns: {0}".format(arrestsH2O.names))
        pca_h2o = H2OPCA(k=k)
        # Pass a real list: range() is a lazy object on Python 3.
        pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
def pca_scoring():
    """Train a k=4 DEMEAN PCA on USArrests and score the training frame with it.

    The projection returned by ``predict`` is only displayed; nothing is asserted.
    """
    print("Importing arrests.csv data...")
    arrests_frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Run PCA with transform = 'DEMEAN'")
    demean_pca = H2OPCA(k=4, transform="DEMEAN")
    demean_pca.train(x=list(range(4)), training_frame=arrests_frame)
    # TODO: fitH2O.show()

    print("Project training data into eigenvector subspace")
    projection = demean_pca.predict(arrests_frame)

    print("H2O Projection:")
    projection.head()
def scale_pca_rf_pipe():
    """Fit an sklearn Pipeline of H2O scaler -> H2O PCA (k=2) -> H2O random forest on iris.

    Columns 0-3 are used as features and column 4 as the target.
    """
    from sklearn.pipeline import Pipeline
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    # this should work below, but it's not yet: https://0xdata.atlassian.net/browse/PUBDEV-5236
    # from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Transformation pipeline built from sklearn's Pipeline and H2O transforms.
    scaler_step = ("standardize", H2OScaler())
    pca_step = ("pca", H2OPCA(k=2))
    forest_step = ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))

    pipe = Pipeline([scaler_step, pca_step, forest_step])
    pipe.fit(iris[:4], iris[4])
def scale_pca_rf_pipe():
    """Randomised hyper-parameter search over a scaler -> PCA -> random forest pipeline.

    The pipeline is searched with ``RandomizedSearchCV`` (5 iterations) using an
    ``H2OKFold`` splitter and an R2 scorer, with column 0 of iris as the target
    and the remaining columns as features. The best estimator is printed.
    """
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    try:
        # Current location of RandomizedSearchCV.
        from sklearn.model_selection import RandomizedSearchCV
    except ImportError:
        # Older scikit-learn releases that predate sklearn.model_selection.
        from sklearn.grid_search import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    # make_scorer is exported from sklearn.metrics; the private
    # sklearn.metrics.scorer module no longer exists in recent releases.
    from sklearn.metrics import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(n_components=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])

    # Parameters to test
    params = {
        "standardize__center": [True, False],
        "standardize__scale": [True, False],
        "pca__n_components": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(iris[1:], iris[0])

    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(random_search.best_estimator_)
def pca_arrests():
    """Train PCA with k = 1..4 on USArrests through both PCA entry points.

    ``H2OPCA`` (imported from h2o.transforms.decomposition) is exercised first,
    then ``H2OPrincipalComponentAnalysisEstimator`` (imported from
    h2o.estimators.pca). The test passes when every model trains without
    raising; no output values are asserted.
    """
    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests.describe()

    # Same training loop for both classes, in the original order.
    for estimator_cls in (H2OPCA, H2OPrincipalComponentAnalysisEstimator):
        for k in range(1, 5):
            # The banner reports k, the number of components actually requested.
            print("H2O PCA with " + str(k) + " dimensions:\n")
            print("Using these columns: {0}".format(arrests.names))
            pca_h2o = estimator_cls(k=k)
            pca_h2o.train(x=list(range(4)), training_frame=arrests)
            # TODO: pca_h2o.show()
def pca_prostate():
    """Train a Power-method PCA on prostate.csv and project the data with it.

    CAPSULE, RACE, DPROS and DCAPS are converted to factors first. A k=3 model
    is trained on column indices 2..8 and the projection of the frame is
    displayed; nothing is asserted.
    """
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # NOTE(review): the banner mentions transform = 'STANDARDIZE' while the model is
    # built with transform="NONE" - confirm which of the two is intended.
    print("PCA on columns 3 to 9 with k = 3, retx = FALSE, transform = 'STANDARDIZE'")
    fitPCA = H2OPCA(k=3, transform="NONE", pca_method="Power")
    # Pass a real list: range() is a lazy object on Python 3.
    fitPCA.train(x=list(range(2, 9)), training_frame=prostate)

    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    pred.head()
def setup_model(self):
    """
    Set up the grid-search hyper-parameters that are used later on:

    1. Grab all parameters that are griddable and all parameters used by PCA.
    2. Find the intersection of parameters that are both griddable and used by PCA.
    3. Several extra parameters used by PCA are denoted as griddable but actually are not.
       These have to be discovered manually and are captured in self.exclude_parameter_lists.
    4. Generate the grid-search hyper-parameters. Numerical parameters are generated randomly;
       for enums, all values are included.

    :return: None
    """
    # A bare-bones model gives access to the complete parameter list.
    base_model = H2OPCA(k=10, transform="NONE", pca_method=self.pca_method)
    base_model.train(x=self.x_indices, training_frame=self.training1_data)

    # Training time of the base model, used further down to scale max_runtime_secs.
    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # All griddable parameters together with their types and defaults.
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.get_gridables(base_model._model_json["parameters"])

    # Randomly generate griddable parameter values, including values outside the legal
    # range (e.g. alpha values outside of [0, 1]).
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(
            base_model.full_parameters.keys(),
            self.hyper_params,
            self.exclude_parameter_lists,
            self.gridable_parameters,
            self.gridable_types,
            self.gridable_defaults,
            random.randint(1, self.max_int_number),
            self.max_int_val,
            self.min_int_val,
            random.randint(1, self.max_real_number),
            self.max_real_val,
            self.min_real_val)

    # Scale the max_runtime_secs candidates by the measured base-model run time.
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in self.hyper_params:
        scaled_secs = []
        for secs in self.hyper_params["max_runtime_secs"]:
            scaled_secs.append(time_scale * secs)
        self.hyper_params["max_runtime_secs"] = scaled_secs

    if 'max_iterations' in self.hyper_params:
        scaled_iters = []
        for iters in self.hyper_params['max_iterations']:
            scaled_iters.append(self.max_iter_scale * iters)
        self.hyper_params['max_iterations'] = scaled_iters

    # final_hyper_params takes only a subset of all griddable parameters, while
    # hyper_params holds all of them.
    [self.possible_number_models, self.final_hyper_params] = \
        pyunit_utils.check_and_count_models(
            self.hyper_params,
            self.params_zero_one,
            self.params_more_than_zero,
            self.params_more_than_one,
            self.params_zero_positive,
            self.max_grid_model)

    # max_runtime_secs must be part of the grid to restrict unit test run time
    # (and as a promise to Arno to test for this).
    runtime_missing = "max_runtime_secs" not in self.final_hyper_params
    if runtime_missing and "max_runtime_secs" in self.hyper_params:
        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
        good_time_count = sum(1 for secs in self.hyper_params["max_runtime_secs"] if secs >= 0)
        self.possible_number_models = self.possible_number_models * good_time_count

    # k must be part of the hyper-parameters as well.
    k_missing = 'k' not in self.final_hyper_params
    if k_missing and 'k' in self.hyper_params:
        self.final_hyper_params["k"] = self.hyper_params["k"]
        good_k_count = sum(1 for k_val in self.hyper_params["k"] if k_val > 0)
        self.possible_number_models = self.possible_number_models * good_k_count

    # Write the hyper-parameters that are used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def algo_max_runtime_secs():
    '''
    Ensure that max_runtime_secs can restrict the model training time for all
    h2o algos.  See PUBDEV-4702.

    Each algorithm is handed to grabRuntimeInfo together with its data, and the
    process exits with status 1 when the sum of model_within_max_runtime is
    positive at the end.
    '''
    global model_within_max_runtime
    global err_bound

    locate = pyunit_utils.locate
    seed = 12345

    # deeplearning
    frame = h2o.import_file(path=locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = frame.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, frame, x_indices, y_index)
    cleanUp([frame, model])

    # stack ensemble, stacking part is not iterative
    print("******************** Skip testing stack ensemble.  Not an iterative algo.")

    # GBM run
    frame = h2o.import_file(path=locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = frame.ncol - 1
    x_indices = list(range(y_index))
    frame[y_index] = frame[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, frame, x_indices, y_index)
    cleanUp([model])

    # GLM run (re-uses the multinomial frame)
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, frame, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print("******************** Skip testing Naives Bayes.  Not an iterative algo.")

    # random forest (re-uses the multinomial frame; no y_index is passed here)
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(err_bound, 2.0, model, frame, x_indices)
    cleanUp([model, frame])

    # deepwater, only when it is available
    if H2ODeepWaterEstimator.available():
        frame = h2o.import_file(path=locate("smalldata/gbm_test/ecology_model.csv"))
        frame = frame.drop('Site')
        frame['Angaus'] = frame['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, frame.ncol))
        model = H2ODeepWaterEstimator(epochs=50, hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, model, frame, x_indices, y_index)
        cleanUp([frame, model])

    # GLRM, do not make sense to stop in the middle of an iteration
    frame = h2o.import_file(path=locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(frame.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3,
                                           transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, model, frame, x_indices)
    cleanUp([frame, model])

    # PCA (wider error bound and a smaller second argument than the other algos)
    frame = h2o.import_file(path=locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(frame.ncol))
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    grabRuntimeInfo(err_bound * 3, 1.2, model, frame, x_indices)
    cleanUp([frame, model])

    # kmeans
    frame = h2o.import_file(path=locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(frame.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, model, frame, x_indices)
    cleanUp([frame, model])

    # word2vec
    text_frame = h2o.import_file(locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    text_subset = text_frame[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, text_subset, [], 0)
    cleanUp([text_frame, text_subset, w2v_model])

    # Any recorded violation fails the whole test run.
    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
def javapredict(algo, equality, train, test, x, y, compile_only=False, **kwargs):
    """Train a model, compile its Java POJO and compare POJO predictions with H2O's.

    :param algo: one of "gbm", "random_forest", "deeplearning", "glm",
        "naive_bayes", "kmeans" or "pca".
    :param equality: "numeric" (first prediction column compared within 1e-4
        after parsing the POJO value with float.fromhex) or "class" (exact
        equality).
    :param train: training frame.
    :param test: frame that is scored by both H2O and the POJO.
    :param x: predictor columns.
    :param y: response column; not used for "kmeans" and "pca".
    :param compile_only: when True, stop after the POJO has been compiled.
    :param kwargs: forwarded to the estimator constructor.
    :raises ValueError: if ``algo`` or ``equality`` is not supported.
    """
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        # Raise an exception instance: raising a (class, message) tuple drops the
        # message on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))

    # Unsupervised algos take no response column.
    if algo == "kmeans" or algo == "pca":
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)

    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), \
        "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))

    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(java_file), \
        "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if compile_only:
        return

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), \
        "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The context manager closes the file even if the rewrite fails.
    with open(in_csv, 'r+') as csv_file:
        contents = csv_file.read().replace('"', '')
        csv_file.seek(0)
        csv_file.write(contents)
        csv_file.truncate()
    assert os.path.exists(in_csv), \
        "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g",
                "-XX:MaxPermSize=2g", "-XX:ReservedCodeCacheSize=256m",
                "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
                "--input", in_csv, "--output", out_pojo_csv]
    # stderr is merged into stdout, so only the first element of communicate() carries output.
    p = subprocess.Popen(java_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), \
        "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.upload_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value: only the first column of each prediction frame is compared.
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                    r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, \
                "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
prostate_df.describe() glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5) glm_classifier.train(x=["AGE", "RACE", "PSA", "DCAPS"], y="CAPSULE", training_frame=prostate_df) glm_classifier from h2o.estimators.kmeans import H2OKMeansEstimator cluster_estimator = H2OKMeansEstimator(k=3) cluster_estimator.train(x=[0, 1, 2, 3], training_frame=iris_df) cluster_estimator from h2o.transforms.decomposition import H2OPCA pca_decomp = H2OPCA(k=2, transform="NONE", pca_method="Power") pca_decomp.train(x=range(0, 4), training_frame=iris_df) pca_decomp pred = pca_decomp.predict(iris_df) pred.head() # Projection results # Grid Search ntrees_opt = [5, 10, 15] max_depth_opt = [2, 3, 4] learn_rate_opt = [0.1, 0.2] hyper_parameters = { "ntrees": ntrees_opt, "max_depth": max_depth_opt, "learn_rate": learn_rate_opt
def pca_wideDataset_rotterdam():
    """Compare default PCA against one randomly chosen alternative on Rotterdam data.

    One of three experiments is drawn at random: a standalone GLRM model, PCA
    with pca_method="Power", or PCA with pca_method="Randomized". The chosen
    model and a default PCA are trained with k=8 and a randomly drawn
    transform on every column except "relapse", and their importance and
    eigenvector tables are compared.
    """
    h2o.remove_all()

    print("Importing Rotterdam.csv data...")
    rotterdam_frame = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))

    # Predictors are all columns but the response; set difference gives no fixed order.
    predictors = list(set(rotterdam_frame.names) - {"relapse"})

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    # Exactly one of the three experiments (0: GLRM, 1: Power, 2: Randomized) is run.
    experiment = randint(0, 2)

    if experiment == 0:
        # special test with GLRM.  Need use_all_levels to be true
        print("------  Testing GLRM PCA --------")
        gram_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform, seed=12345,
                            use_all_factor_levels=True)
        gram_model.train(x=predictors, training_frame=rotterdam_frame)

        glrm_model = H2OGeneralizedLowRankEstimator(
            k=8, transform=chosen_transform, seed=12345, init="Random", max_iterations=10,
            recover_svd=True, regularization_x="None", regularization_y="None")
        glrm_model.train(x=predictors, training_frame=rotterdam_frame)

        gram_out = gram_model._model_json["output"]
        glrm_out = glrm_model._model_json["output"]

        # Both banners are printed before the first comparison.
        print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
        print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gram_out["importance"],
            glrm_out["importance"],
            ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
            tolerance=1,
            check_all=False)

        # Singular vectors.
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gram_out["eigenvectors"],
            glrm_out["eigenvectors"],
            glrm_out["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

        h2o.remove(gram_model)
        h2o.remove(glrm_model)

    if experiment == 1:
        print("------  Testing Power PCA --------")
        gram_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform, seed=12345)
        gram_model.train(x=predictors, training_frame=rotterdam_frame)

        power_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform,
                             pca_method="Power", seed=12345)
        power_model.train(x=predictors, training_frame=rotterdam_frame)

        gram_out = gram_model._model_json["output"]
        power_out = power_model._model_json["output"]

        # Singular values and variance metrics.
        print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gram_out["importance"],
            power_out["importance"],
            ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
            tolerance=1e-6,
            check_all=False)

        # Singular vectors.
        print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gram_out["eigenvectors"],
            power_out["eigenvectors"],
            power_out["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

    if experiment == 2:
        print("------  Testing Randomized PCA --------")
        gram_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform, seed=12345)
        gram_model.train(x=predictors, training_frame=rotterdam_frame)

        randomized_model = H2OPCA(k=8, impute_missing=True, transform=chosen_transform,
                                  pca_method="Randomized", seed=12345, max_iterations=5)
        randomized_model.train(x=predictors, training_frame=rotterdam_frame)

        gram_out = gram_model._model_json["output"]
        randomized_out = randomized_model._model_json["output"]

        # Singular values and variance metrics (looser tolerance than the Power experiment).
        print("@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gram_out["importance"],
            randomized_out["importance"],
            ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
            tolerance=1e-1,
            check_all=False)

        # Singular vectors.
        # NOTE(review): the banner says "Power" although the Randomized model is compared.
        print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gram_out["eigenvectors"],
            randomized_out["eigenvectors"],
            randomized_out["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

    h2o.remove_all()
def pca_scoring_history_importance():
    """
    This test aims to check and make sure PCA returns the scoring history and importance which are
    reported missing for certain PCA mode.  Apart from changing the PCA mode, the transform type is
    also chosen at random.

    A GramSVD model serves as the reference.  Randomized, Power and GLRM models are trained on the
    same frame and their "importance" and "eigenvectors" tables are compared with the reference.
    Finally the scoring history of every model is asserted to be non-empty.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))

    print("transform is {0}.\n".format(transformN))

    # columns of the importance table handed to the comparison helper
    importance_cols = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]

    # checking out PCA with GramSVD
    print("@@@@@@ Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)

    # check PCA with PCA set to Randomized
    print("@@@@@@ Building PCA with Randomized...\n")
    randomizedPCA = H2OPCA(k=3, transform=transformN, pca_method="Randomized", compute_metrics=True,
                           use_all_factor_levels=True)
    randomizedPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             randomizedPCA._model_json["output"]["importance"],
                                             importance_cols, tolerance=1e-3)

    print("@@@@@@ Comparing eigenvectors between GramSVD and Randomized...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["names"],
                                             tolerance=5e-2, check_sign=True)

    # check PCA with PCA set to Power
    print("@@@@@@ Building PCA with Power...\n")
    powerPCA = H2OPCA(k=3, transform=transformN, pca_method="Power", compute_metrics=True,
                      use_all_factor_levels=True)
    powerPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD (helper's default tolerance is used here)
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             powerPCA._model_json["output"]["importance"],
                                             importance_cols)

    print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["names"],
                                             tolerance=1e-5, check_sign=True)

    # check PCA with PCA set to GLRM
    print("@@@@@@ Building PCA with GLRM...\n")
    glrmPCA = H2OPCA(k=3, transform=transformN, pca_method="GLRM", compute_metrics=True,
                     use_all_factor_levels=True)
    glrmPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             glrmPCA._model_json["output"]["importance"],
                                             importance_cols, tolerance=2e-2)

    print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["names"],
                                             tolerance=2e-1, check_sign=True)

    # make sure we find the scoring history and it is not empty for all the PCA modes
    # just check and make sure the cell_values exceed 0
    models_by_method = [("GramSVD", gramSVD), ("Power", powerPCA),
                        ("Randomized", randomizedPCA), ("GLRM", glrmPCA)]
    for method_name, model in models_by_method:
        assert len(model._model_json["output"]["scoring_history"].cell_values) > 0, \
            "PCA Scoring history setting pca_method to {0} is empty.".format(method_name)
# Binomial GLM on the bundled prostate data, with 10-fold cross-validation.
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
prostate_data_path = h2o.system_file("prostate.csv")
prostate_df = h2o.import_file(path=prostate_data_path)
prostate_df["RACE"] = prostate_df["RACE"].asfactor()
prostate_df.describe()
glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5)
glm_classifier.train(x=["AGE", "RACE", "PSA", "DCAPS"], y="CAPSULE", training_frame=prostate_df)
glm_classifier

# K-means with three clusters on the first four columns of iris_df
# (iris_df is created elsewhere in this file).
from h2o.estimators.kmeans import H2OKMeansEstimator
cluster_estimator = H2OKMeansEstimator(k=3)
cluster_estimator.train(x=[0, 1, 2, 3], training_frame=iris_df)
cluster_estimator

# PCA (power method, no transform) on the same four columns.
from h2o.transforms.decomposition import H2OPCA
pca_decomp = H2OPCA(k=2, transform="NONE", pca_method="Power")
# list(...) so that a concrete list, not a lazy range object, is passed on Python 3
pca_decomp.train(x=list(range(0, 4)), training_frame=iris_df)
pca_decomp
pred = pca_decomp.predict(iris_df)
pred.head()  # Projection results

# Grid Search
ntrees_opt = [5, 10, 15]
max_depth_opt = [2, 3, 4]
learn_rate_opt = [0.1, 0.2]
hyper_parameters = {"ntrees": ntrees_opt, "max_depth": max_depth_opt, "learn_rate": learn_rate_opt}
from h2o.grid.grid_search import H2OGridSearch
gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters)
# the cross-validation keyword is spelled "nfolds" (as in the GLM above), not "nfold"
gs.train(x=list(range(0, iris_df.ncol - 1)), y=iris_df.ncol - 1, training_frame=iris_df, nfolds=10)
def test_PCA_grid_search_over_params(self):
    """
    test_pca_grid_search_over_params: test for condition 1 and performs the following:

    a. build H2O PCA models using grid search.  Count the models built and compare the count
       with self.possible_number_models; a mismatch is recorded as a test failure.
    b. for each model built using grid search, extract the hyper-parameters used in building that
       model and manually build a H2O PCA model with them.  A metric is taken from the model
       summary of both models.  If the two metrics differ by more than self.allowed_diff
       (relative), print a warning message but don't fail the test.
    c. check that the total time spent building the manual models stays within the limit derived
       from max_runtime_secs (plus self.extra_time_fraction).  If it is exceeded, record a test
       failure.

    Failures are recorded by incrementing self.test_failed; nothing is raised or returned.
    """
    print("*******************************************************************************************")
    print("test_PCA_grid_search_over_params for PCA ")
    h2o.cluster_info()

    try:
        print("Hyper-parameters used here is {0}".format(self.final_hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2OPCA(pca_method=self.pca_method), hyper_params=self.final_hyper_params)
        grid_model.train(x=self.x_indices, training_frame=self.training1_data)

        self.correct_model_number = len(grid_model)  # store number of models built

        # make sure the correct number of models are built by gridsearch
        if not (self.correct_model_number == self.possible_number_models):  # wrong grid model number
            self.test_failed += 1
            print("test_PCA_grid_search_over_params for PCA failed: number of models built by gridsearch "
                  "does not equal to all possible combinations of hyper-parameters")
        else:
            # add parameters into params_dict.  Use this to manually build model
            params_dict = dict()
            params_dict["pca_method"] = self.pca_method
            total_run_time_limits = 0.0  # calculate upper bound of max_runtime_secs
            true_run_time_limits = 0.0
            manual_run_runtime = 0.0

            # compare performance metric of model built by gridsearch with manually built model
            for each_model in grid_model:
                params_list = grid_model.get_hyperparams_dict(each_model._id)
                params_list.update(params_dict)

                model_params = dict()

                # need to taken out max_runtime_secs from model parameters, it is now set in .train()
                if "max_runtime_secs" in params_list:
                    model_params["max_runtime_secs"] = params_list["max_runtime_secs"]
                    max_runtime = params_list["max_runtime_secs"]
                    del params_list["max_runtime_secs"]
                else:
                    max_runtime = 0

                # NOTE(review): this value is never read afterwards; the call is retained because
                # the helper is not visible from here - confirm it can be dropped.
                each_model_runtime = pyunit_utils.find_grid_runtime([each_model])

                # make sure manual model was provided the same max_runtime_secs as the grid model
                manual_model = H2OPCA(**params_list)
                manual_model.train(x=self.x_indices, training_frame=self.training1_data, **model_params)

                # collect the time taken to manually built all models
                model_runtime = pyunit_utils.find_grid_runtime([manual_model])  # time taken to build this model
                manual_run_runtime += model_runtime

                if max_runtime > 0:
                    # shortest possible time it takes to build this model
                    if max_runtime < self.model_run_time:
                        total_run_time_limits += model_runtime
                    else:
                        total_run_time_limits += max_runtime

                true_run_time_limits += max_runtime

                # compute and compare test metrics between the two models
                grid_model_metrics = \
                    sum(each_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])
                manual_model_metrics = \
                    sum(manual_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])

                # just compare the metrics in this case within tolerance:
                if not (isinstance(grid_model_metrics, str) or isinstance(manual_model_metrics, str)):
                    # relative difference is taken against the magnitude of the grid metric, so the
                    # check is also meaningful when that metric is negative
                    grid_magnitude = abs(grid_model_metrics)
                    if (grid_magnitude > 0) and \
                            (abs(grid_model_metrics - manual_model_metrics) / grid_magnitude > self.allowed_diff):
                        print("test_PCA_grid_search_over_params for PCA warning: grid search model metric ({0}) "
                              "and manually built H2O model metric ({1}) differ too much"
                              "!".format(grid_model_metrics, manual_model_metrics))

            total_run_time_limits = max(total_run_time_limits, true_run_time_limits) * (1 + self.extra_time_fraction)

            # make sure the max_runtime_secs is working to restrict model built time
            if not (manual_run_runtime <= total_run_time_limits):
                self.test_failed += 1
                print("test_PCA_grid_search_over_params for PCA failed: time taken to manually build models is {0}."
                      "  Maximum allowed time is {1}".format(manual_run_runtime, total_run_time_limits))
            else:
                print("time taken to manually build all models is {0}. Maximum allowed time is "
                      "{1}".format(manual_run_runtime, total_run_time_limits))

            if self.test_failed == 0:
                print("test_PCA_grid_search_over_params for PCA has passed!")
    except Exception as e:
        # an exception only counts as a failure when at least one model was expected to be built
        if self.possible_number_models > 0:
            print("test_PCA_grid_search_over_params for PCA failed: exception ({0}) was thrown for no reason.".format(e))
            self.test_failed += 1
#gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters) #gs.train(x=range(0, iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10) ## ## Pipeline ## from h2o.transforms.preprocessing import H2OScaler from h2o.transforms.decomposition import H2OPCA from sklearn.pipeline import Pipeline h2o.no_progress() pipeline = Pipeline([ ("standardize", H2OScaler()), ("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="multinomial")) ]) print pipeline.fit(iris_df[:4], iris_df[4]) ## ## Randomized Grid Search ## from sklearn.grid_search import RandomizedSearchCV from h2o.cross_validation import H2OKFold from h2o.model.regression import h2o_r2_score from sklearn.metrics.scorer import make_scorer params = { "standardize__center": [True, False],
def pca_max_k():
    """
    Check that PCA built with k=-1 reports as many eigenvalues as its resolved k.

    GramSVD and Power are exercised on the wide Rotterdam dataset; Randomized, Power and GLRM are
    exercised on the smaller prostate_cat dataset.  For GramSVD, Power (Rotterdam) and GLRM the
    number of value columns in the first row of the "importance" table is asserted to equal the
    "actual_value" of the model's k parameter.  Importance tables of Randomized and Power are also
    compared with each other, with and without use_all_factor_levels.
    """
    importance_cols = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
    # str.format is used so that a failing assert reports the counts instead of raising
    # TypeError from concatenating str and int
    fail_message = "PCA {0} FAIL: expected number of eigenvalues: {1}, actual: {2}."

    data = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(data.names) - y)

    pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True,
                        max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True,
                      max_iterations=100, seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_cols, tolerance=1)

    # the first cell of the row is its label, hence the "- 1"
    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    gramSVDNum = len(pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
    powerNum = len(pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == gramSVDNum, fail_message.format("GramSVD", correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, fail_message.format("Power", correctEigNum, powerNum)

    # Randomized and GLRM does not have wide dataset implementation.  Check with smaller datasets
    data = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))

    pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", impute_missing=True,
                           max_iterations=100, seed=12345)
    pcaRandomized.train(x, training_frame=data)

    # should still work with rank deficient dataset
    pcaRandomizedF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized",
                            use_all_factor_levels=True, impute_missing=True, max_iterations=100,
                            seed=12345)
    pcaRandomizedF.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True,
                      max_iterations=100, seed=12345)
    pcaPower.train(x, training_frame=data)

    # should still work with rank deficient dataset
    pcaPowerF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", use_all_factor_levels=True,
                       impute_missing=True, max_iterations=100, seed=12345)
    pcaPowerF.train(x, training_frame=data)

    # eigenvalues between the PCA and Randomize should be close, I hope...
    print("@@@@@@ Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_cols)

    # eigenvalues between the PCA and Randomize should be close with rank deficient dataset, I hope...
    print("@@@@@@ Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomizedF._model_json["output"]["importance"],
                                             pcaPowerF._model_json["output"]["importance"],
                                             importance_cols)

    pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                     max_iterations=100, seed=12345)
    pcaGLRM.train(x, training_frame=data)

    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = len(pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == glrmNum, fail_message.format("GLRM", correctEigNum, glrmNum)