def checkpoint_new_category_in_predictor():
    """Checkpoint restart must fail when a categorical predictor gains new levels.

    Trains on setosa/versicolor, restarts from checkpoint on the same domain
    (allowed), then restarts on virginica (expanded predictor domain — must
    raise). Prediction on the expanded domain is still allowed.
    """
    sv1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    print("checkpoint_new_category_in_predictor-1")

    m1 = H2ODeepLearningEstimator(epochs=100)
    m1.train(x=[0, 1, 2, 4], y=3, training_frame=sv1)

    # Same predictor domain: continuing from the checkpoint is fine.
    m2 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
    m2.train(x=[0, 1, 2, 4], y=3, training_frame=sv2)
    print("checkpoint_new_category_in_predictor-2")

    # Expanded categorical predictor domain: continued model-building must fail.
    try:
        m3 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
        m3.train(x=[0, 1, 2, 4], y=3, training_frame=vir)
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
    except EnvironmentError:
        pass
    print("checkpoint_new_category_in_predictor-3")

    # Scoring observations with an expanded predictor domain is permitted.
    predictions = m2.predict(vir)
    print("checkpoint_new_category_in_predictor-4")
def weights_and_distributions():
    """Smoke-test DL training with a weights column across several distributions.

    Fits gamma, gaussian, poisson and tweedie models on the moppe dataset,
    weighting rows by "antskad", and scores each model on the training frame.
    """
    htable = h2o.upload_file(pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    # NOTE(review): self-assignment is a no-op — presumably .asfactor() was
    # intended here, like the two columns above; confirm before changing.
    htable["zon"] = htable["zon"]

    # gamma
    dl = H2ODeepLearningEstimator(distribution="gamma")
    dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
    predictions = dl.predict(htable)

    # gaussian
    dl = H2ODeepLearningEstimator(distribution="gaussian")
    dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
    predictions = dl.predict(htable)

    # poisson
    dl = H2ODeepLearningEstimator(distribution="poisson")
    dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
    predictions = dl.predict(htable)

    # tweedie
    dl = H2ODeepLearningEstimator(distribution="tweedie")
    dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
    predictions = dl.predict(htable)
def deep_learning(xval=None, sample_size=None, nfolds=None, hparams=None, for_stacking=None):
    """
    create a deep learning algorithm estimator

    :param xval: if for cross-validation
    :param sample_size: training set sample amount (None is treated as small)
    :param nfolds: k value for k-fold cross-validation
    :param hparams: hyper parameters for grid search
    :param for_stacking: if it is used for stacking
    :return: a constructed deep learning estimator, a parameters' dict for grid search
    """
    # Fix: on Python 3, `None <= 10000` raises TypeError; treat the default
    # sample_size=None as the "small" case (same branch Python 2 took).
    if sample_size is None or sample_size <= 10000:
        default_nfolds = 3
        hidden_opts = [[30, 30], [20, 20], [10, 10]]
        input_dropout_ratio_opts = [0, 0.05, 0.1]
        l1_opts = [0, 1e-4, 1e-6]
        l2_opts = [0, 1e-4, 1e-6]
    elif 10000 < sample_size <= 100000:
        default_nfolds = 3
        hidden_opts = [[20, 20], [30, 30]]
        input_dropout_ratio_opts = [0, 0.05]
        l1_opts = [0, 1e-6]
        l2_opts = [0, 1e-6]
    else:
        # Very large samples get fewer folds to keep runtime bounded.
        default_nfolds = 1 if sample_size > 500000 else 2
        hidden_opts = [[20, 20], [10, 10]]
        input_dropout_ratio_opts = [0, 0.05]
        l1_opts = [1e-6]
        l2_opts = [1e-6]

    default_hparams = {'hidden': hidden_opts,
                       'input_dropout_ratio': input_dropout_ratio_opts,
                       'l1': l1_opts,
                       'l2': l2_opts}
    if nfolds is None:
        nfolds = default_nfolds
    if hparams is None:
        hparams = default_hparams

    if xval:
        if for_stacking:
            # Modulo fold assignment + kept CV predictions are required by
            # the downstream stacked-ensemble step.
            dl_estimator = H2ODeepLearningEstimator(nfolds=nfolds,
                                                    fold_assignment="Modulo",
                                                    seed=1,
                                                    keep_cross_validation_predictions=True,
                                                    shuffle_training_data=True)
        else:
            dl_estimator = H2ODeepLearningEstimator(nfolds=nfolds,
                                                    shuffle_training_data=True)
    else:
        dl_estimator = H2ODeepLearningEstimator(shuffle_training_data=True)
    return dl_estimator, hparams
def offsets_and_distributions():
    """Smoke-test DL training with an offset column across several distributions.

    Builds an offset column on the cars frame (unused: bernoulli + offset is
    unsupported, kept commented for the record) and a log(Holders) offset on
    the insurance frame, then fits gamma/gaussian/poisson/tweedie models.
    """
    # cars
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)])
    offset.set_name(0, "x1")
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                      training_frame=cars)
    #predictions = dl.predict(cars)

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator

    # gamma
    dl = H2ODeepLearningEstimator(distribution="gamma")
    dl.train(x=list(range(3)), y="Claims", training_frame=insurance, offset_column="offset")
    predictions = dl.predict(insurance)

    # gaussian
    dl = H2ODeepLearningEstimator(distribution="gaussian")
    dl.train(x=list(range(3)), y="Claims", training_frame=insurance, offset_column="offset")
    predictions = dl.predict(insurance)

    # poisson
    dl = H2ODeepLearningEstimator(distribution="poisson")
    dl.train(x=list(range(3)), y="Claims", training_frame=insurance, offset_column="offset")
    predictions = dl.predict(insurance)

    # tweedie
    dl = H2ODeepLearningEstimator(distribution="tweedie")
    dl.train(x=list(range(3)), y="Claims", training_frame=insurance, offset_column="offset")
    predictions = dl.predict(insurance)
def pubdev_2041():
    """Regression test: continue training a DL model (checkpoint) on new data."""
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    split = iris.runif(seed=12345)
    first_half = iris[split >= 0.5]
    second_half = iris[split < 0.5]

    base = H2ODeepLearningEstimator(epochs=100)
    base.train(x=list(range(4)), y=4, training_frame=first_half)

    # Resume from the checkpoint with fresh training data.
    resumed = H2ODeepLearningEstimator(checkpoint=base.model_id, epochs=200)
    resumed.train(x=list(range(4)), y=4, training_frame=second_half)
def imbalance():
    """Check that balance_classes does not hurt logloss on an imbalanced dataset.

    Trains two reproducible DL models on covtype (imbalanced multiclass),
    with and without class balancing, and asserts the balanced model's
    logloss is no worse.
    """
    # Fix: Python-2 print statements converted to print() calls and bare
    # range() wrapped in list(), consistent with the rest of the file.
    print("Test checks if Deep Learning works fine with an imbalanced dataset")
    covtype = h2o.upload_file(
        pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    hh_imbalanced = H2ODeepLearningEstimator(l1=1e-5,
                                             activation="Rectifier",
                                             loss="CrossEntropy",
                                             hidden=[200, 200],
                                             epochs=1,
                                             balance_classes=False,
                                             reproducible=True,
                                             seed=1234)
    hh_imbalanced.train(x=list(range(54)), y=54, training_frame=covtype)
    print(hh_imbalanced)
    hh_balanced = H2ODeepLearningEstimator(l1=1e-5,
                                           activation="Rectifier",
                                           loss="CrossEntropy",
                                           hidden=[200, 200],
                                           epochs=1,
                                           balance_classes=True,
                                           reproducible=True,
                                           seed=1234)
    hh_balanced.train(x=list(range(54)), y=54, training_frame=covtype)
    print(hh_balanced)

    # compare overall logloss
    class_6_err_imbalanced = hh_imbalanced.logloss()
    class_6_err_balanced = hh_balanced.logloss()
    if class_6_err_imbalanced < class_6_err_balanced:
        print("--------------------")
        print("")
        print("FAIL, balanced error greater than imbalanced error")
        print("")
        print("")
        print("class_6_err_imbalanced")
        print(class_6_err_imbalanced)
        print("")
        print("class_6_err_balanced")
        print(class_6_err_balanced)
        print("")
        print("--------------------")
    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
def deeplearning_demo():
    """Demo: train a DL classifier on the ecology dataset with a validation frame.

    Fix: Python-2 print statements converted to print() calls and bare range()
    wrapped in list(), consistent with the rest of the file.
    """
    # Training data
    train_data = h2o.import_file(
        path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    print(train_data.describe())
    train_data.head()

    # Testing data
    test_data = h2o.import_file(
        path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run DeepLearning
    dl = H2ODeepLearningEstimator(loss="CrossEntropy",
                                  epochs=1000,
                                  hidden=[20, 20, 20])
    dl.train(x=list(range(1, train_data.ncol)),
             y="Angaus",
             training_frame=train_data,
             validation_frame=test_data)
    dl.show()
def generate_dataset(family, nrow, ncol, networkStructure, activation, realFrac, intFrac, enumFrac,
                     missingFrac, factorRange, numericRange, targetFactor):
    """Generate a synthetic dataset whose response comes from an untrained DL net.

    Builds a random predictor frame, scores it through a freshly initialized
    (epochs=0) deep-learning model for the given family/architecture, and
    returns predictors plus the model's prediction as the 'response' column.
    """
    # Response cardinality by family: binary, continuous, or caller-specified.
    if family == "bernoulli":
        responseFactor = 2
    elif family == 'gaussian':
        responseFactor = 1
    else:
        responseFactor = targetFactor

    trainData = random_dataset(nrow, ncol, realFrac=realFrac, intFrac=intFrac, enumFrac=enumFrac,
                               factorR=factorRange, integerR=numericRange,
                               responseFactor=responseFactor, misFrac=missingFrac)

    myY = 'response'
    myX = trainData.names
    myX.remove(myY)

    # epochs=0: the network keeps its normal-initialized weights untouched.
    m = H2ODeepLearningEstimator(distribution=family, hidden=networkStructure,
                                 activation=activation, epochs=0,
                                 initial_weight_distribution='normal')
    m.train(training_frame=trainData, x=myX, y=myY)
    f2 = m.predict(trainData)

    # Replace the original response with the model's first prediction column.
    finalDataset = trainData[myX].cbind(f2[0])
    finalDataset.set_name(col=finalDataset.ncols - 1, name='response')
    return finalDataset
def test_dl():
    """Grid fault-tolerance test: resume a DL grid seeded with pretrained weights.

    Pretrains an autoencoder, extracts weights/biases from a supervised DL
    model, and hands both to grid_ft_resume as initial parameters.
    """
    train = import_iris2()

    ae_model = H2OAutoEncoderEstimator(
        activation="Tanh",
        hidden=[40, 80],
        model_id="ae_model",
        epochs=1,
        ignore_const_cols=False
    )
    ae_model.train(list(range(4)), training_frame=train)

    dl1 = H2ODeepLearningEstimator(hidden=[10, 10], export_weights_and_biases=True)
    dl1.train(x=list(range(4)), y=4, training_frame=train)

    # Mix explicit tensors with None to exercise partial initialization.
    params = {
        "initial_weights": [dl1.weights(0), None, dl1.weights(2)],
        "initial_biases": [dl1.biases(0), dl1.biases(1), None],
        "pretrained_autoencoder": "ae_model",
        "hidden": [40, 80],
        "ignore_const_cols": False
    }
    hyper_params = {
        "epochs": [2, 4, 6, 10, 20, 50],
        "rate": [.005, .006, .007]
    }
    grid_ft_resume(
        train,
        "DEEP_LEARNING",
        params,
        hyper_params,
        dl_start,
        dl_resume
    )
def deeplearning_basic():
    """Minimal DL smoke test: train a cross-entropy model on iris and show it."""
    iris_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    # NOTE(review): only columns 0-2 are used as predictors (column 3 is
    # excluded) — confirm this is intentional.
    model = H2ODeepLearningEstimator(loss="CrossEntropy")
    model.train(x=list(range(3)), y=4, training_frame=iris_hex)
    model.show()
def tweedie_weights():
    """Train a tweedie DL model on cancar loss data, without and with row weights."""
    data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/cancar_logIn.csv"))

    # Interaction indicator columns between Class and Merit.
    data["C1M3"] = ((data["Class"] == 1) & (data["Merit"] == 3)).asfactor()
    data["C3M3"] = ((data["Class"] == 3) & (data["Merit"] == 3)).asfactor()
    data["C4M3"] = ((data["Class"] == 4) & (data["Merit"] == 3)).asfactor()
    data["C1M2"] = ((data["Class"] == 1) & (data["Merit"] == 2)).asfactor()
    data["Merit"] = data["Merit"].asfactor()
    data["Class"] = data["Class"].asfactor()

    # Response: loss ratio = Cost / Insured (old_div keeps Py2 semantics).
    loss = old_div(data["Cost"], data["Insured"])
    loss.set_name(0, "Loss")
    cancar = loss.cbind(data)

    predictors = ["Merit", "Class", "C1M3", "C4M3"]
    dl = H2ODeepLearningEstimator(distribution="tweedie", hidden=[1], epochs=1000,
                                  train_samples_per_iteration=-1, reproducible=True,
                                  activation="Tanh", balance_classes=False,
                                  force_load_balance=False, seed=2353123,
                                  tweedie_power=1.5, score_training_samples=0,
                                  score_validation_samples=0)

    # Without weights
    dl.train(x=predictors, y="Loss", training_frame=cancar)
    mean_residual_deviance = dl.mean_residual_deviance()

    # With weights
    dl.train(x=predictors, y="Loss", training_frame=cancar, weights_column="Insured")
def deep_learning_metrics_test():
    """Train a binomial DL model on prostate data and print model metrics.

    Fix: H2OFrame.drop() returns a new frame rather than mutating in place,
    so the original `df.drop("ID")` silently kept the ID column; the result
    is now assigned back.
    """
    # connect to existing cluster
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    df = df.drop("ID")  # remove ID (drop is not in-place)
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    train.tail()
    test.describe()
    test.head()
    test.tail()

    # Run DeepLearning
    print("Train a Deeplearning model: ")
    dl = H2ODeepLearningEstimator(epochs=100,
                                  hidden=[10, 10, 10],
                                  loss="CrossEntropy")
    # NOTE(review): predictors previously started at column 2 to skip ID and
    # CAPSULE; with ID actually dropped, column indices shift by one — the
    # range below starts at 1 to keep the same predictor set.
    dl.train(x=list(range(1, train.ncol)), y="CAPSULE", training_frame=train)
    print("Binomial Model Metrics: ")
    print()
    dl.show()
    p = dl.model_performance(test)
    p.show()
def pubdev_2223():
    """Regression test: normalization/standardization stats are retrievable."""
    covtype = h2o.import_file(
        pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    model = H2ODeepLearningEstimator(hidden=[17, 191],
                                     epochs=1,
                                     balance_classes=False,
                                     reproducible=True,
                                     seed=1234,
                                     export_weights_and_biases=True)
    model.train(x=list(range(54)), y=54, training_frame=covtype)

    # Dump each group of preprocessing statistics exposed by the model.
    print(
        "Normalization/Standardization multipliers for numeric predictors: {0}\n"
        .format(model.normmul()))
    print(
        "Normalization/Standardization offsets for numeric predictors: {0}\n".
        format(model.normsub()))
    print(
        "Normalization/Standardization multipliers for numeric response: {0}\n"
        .format(model.respmul()))
    print("Normalization/Standardization offsets for numeric response: {0}\n".
          format(model.respsub()))
    print("Categorical offsets for one-hot encoding: {0}\n".format(
        model.catoffsets()))
def dl_demo():
    """Demo: train/validate/test split on a global frame `df`, then inspect metrics.

    Fix: `df[0.6 <= random < 0.9]` used a Python chained comparison, which
    evaluates `(0.6 <= random) and (random < 0.9)` and needs the truth value
    of an H2OFrame — invalid. Replaced with an explicit elementwise `&`.
    Comments translated to English; printed strings unchanged.
    """
    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    # NOTE(review): `df` is a free (global) variable — confirm it is defined
    # by the caller before this runs.
    df[1] = df[1].asfactor()
    # uniform random number, one per row
    random = df[0].runif()
    # 60% training set
    train = df[random < 0.6]
    # 30% validation set
    valid = df[(random >= 0.6) & (random < 0.9)]
    # 10% test set
    test = df[random >= 0.9]

    m = H2ODeepLearningEstimator()
    print(
        'm.train_print:',
        m.train(x=train.names[2:],
                y=train.names[1],
                training_frame=train,
                validation_frame=valid))
    print('m.train_print_end')
    print('m_print:', m)

    # predict
    print('m.predict_print:\n', m.predict(test))
    print('m.predict_print_end')

    # show performance on the training data
    m.model_performance()
    # show performance on the validation data
    m.model_performance(valid=True)
    # score and compute new metrics on the test data!
    print('m.model_performance(test_data=test)_print:',
          m.model_performance(test_data=test))
    print('m.model_performance(test_data=test)_print_end')

    # mean squared error on the training data
    m.mse()
    # mean squared error on the validation data
    print('m.mse_print:', m.mse(valid=True))

    m.r2()
    print('m.r2_print:', m.r2(valid=True))
    print('m.confusion_matrix_print:', m.confusion_matrix())

    # confusion matrix at max accuracy
    m.confusion_matrix(metrics="accuracy")
    # check out the help for more!
    m.confusion_matrix("min_per_class_accuracy")
def hyperopt_train_test(params):
    """Hyperopt objective: train a DL model with `params` and return validation logloss.

    List-valued hyperparameters arrive as tuples from hyperopt's sampler, so
    'hidden' and 'hidden_dropout_ratios' are coerced back to lists after
    construction. Uses globals X_vars, y_var, t (train) and v (validation).

    Fix: `key in d.keys()` replaced with the idiomatic `key in d`.
    """
    dl = H2ODeepLearningEstimator(**params)
    if 'hidden' in params:
        dl.hidden = list(params['hidden'])
    if 'hidden_dropout_ratios' in params:
        dl.hidden_dropout_ratios = list(params['hidden_dropout_ratios'])
    dl.train(x=X_vars, y=y_var, training_frame=t, validation_frame=v)
    return dl.model_performance(v).logloss()
def deep_learning(name):
    """
    Get the Deep Learning Model

    :param name: model name, will determine filename
    :return: an H2ODeepLearningEstimator configured from stored parameters
    """
    # Configuration comes from the project's parameter store.
    config = get_params("deep_learning")
    return H2ODeepLearningEstimator(model_id=name, **config)
def algo_pr_auc_test():
    '''
    This pyunit test is written to make sure we can call pr_auc() on all binomial models,
    and that pr_auc() raises for a non-binomial (regression) model.

    Fixes: bare `except:` narrowed to `except Exception:` so system-exiting
    exceptions are not swallowed; the final assertion message said
    "multinomial" where the model under test is a gaussian regression.
    '''
    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1, max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing GBM model")
    print(gbm_h2o)
    print("pr_auc for GBM model is {0}".format(gbm_h2o.pr_auc()))

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc
    print("pr_auc for GLM model is {0}".format(glm_h2o.pr_auc()))

    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing random forest model")
    print(rf_h2o)
    print("pr_auc for Random Forest model is {0}".format(rf_h2o.pr_auc()))

    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2, 2])
    dl_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("*************************** Printing deeplearning model")
    print(dl_h2o)
    print("pr_auc for deeplearning model is {0}".format(dl_h2o.pr_auc()))

    # Sanity checks: pr_auc values of the binomial models stay within 0.9 of
    # each other.
    assert abs(gbm_h2o.pr_auc() - glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"
    assert abs(rf_h2o.pr_auc() - dl_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"
    assert abs(rf_h2o.pr_auc() - glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    # try to call pr_auc() for regression. Should encounter error.
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity", alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    try:
        print(h2o_model.pr_auc())
        assert 1 == 2, "pr_auc() should raise an error for regression but did not."
    except Exception:
        pass
def deeplearning_multi():
    """DL multiclass smoke test with identical training and validation frames."""
    print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

    prostate = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[4] = prostate[4].asfactor()

    model = H2ODeepLearningEstimator(loss="CrossEntropy")
    model.train(x=[0, 1], y=4, training_frame=prostate, validation_frame=prostate)
    model.show()
def weights_and_biases():
    """Check exported DL weight/bias frames have the expected dimensions.

    Fix: Python-2 print statements converted to print() calls and bare
    range() wrapped in list(), consistent with the rest of the file.
    """
    print("Test checks if Deep Learning weights and biases are accessible from R")

    covtype = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    dlmodel = H2ODeepLearningEstimator(hidden=[17, 191],
                                       epochs=1,
                                       balance_classes=False,
                                       reproducible=True,
                                       seed=1234,
                                       export_weights_and_biases=True)
    dlmodel.train(x=list(range(54)), y=54, training_frame=covtype)
    print(dlmodel)

    weights1 = dlmodel.weights(0)
    weights2 = dlmodel.weights(1)
    weights3 = dlmodel.weights(2)

    biases1 = dlmodel.biases(0)
    biases2 = dlmodel.biases(1)
    biases3 = dlmodel.biases(2)

    # Layer 1: 52 inputs -> 17 units; layer 2: 17 -> 191; output: 191 -> 7.
    w1c = weights1.ncol
    w1r = weights1.nrow
    assert w1c == 52, "wrong dimensionality! expected {0}, but got {1}.".format(52, w1c)
    assert w1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w1r)

    w2c = weights2.ncol
    w2r = weights2.nrow
    assert w2c == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w2c)
    assert w2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w2r)

    w3c = weights3.ncol
    w3r = weights3.nrow
    assert w3c == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w3c)
    assert w3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, w3r)

    # Bias frames are column vectors sized to each layer's unit count.
    b1c = biases1.ncol
    b1r = biases1.nrow
    assert b1c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b1c)
    assert b1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, b1r)

    b2c = biases2.ncol
    b2r = biases2.nrow
    assert b2c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b2c)
    assert b2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, b2r)

    b3c = biases3.ncol
    b3r = biases3.nrow
    assert b3c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b3c)
    assert b3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, b3r)
def deep_1(K, dfs, dfs_collector, test, test_collector):
    """K-fold out-of-fold DL predictions plus averaged test-set predictions.

    For each fold i: trains on the other K-1 folds, stores out-of-fold
    predictions into dfs_collector[i], and accumulates test predictions,
    which are averaged into test_collector at the end.

    Fixes: `features = on_top2` aliased (and `.remove` then mutated) the
    shared global list — now copied; unused `ntrees`/`seed` locals removed;
    the fold-frame list no longer shares the name `c` with the later
    column-printing loop variable.
    """
    r = 'deep_1'
    features = list(on_top2)  # copy: do not mutate the shared global list
    val_hf = h2o.H2OFrame(test)

    v = np.zeros(shape=[len(test)])
    for i in range(K):
        print()
        print('in model:', r, ' k-fold:', i + 1, '/', K)
        print()

        # All fold indices except the held-out fold i.
        b = [i for i in range(K)]
        b.remove(i)
        fold_frames = [dfs[b[j]] for j in range(K - 1)]
        dt = pd.concat(fold_frames)
        train_hf = h2o.H2OFrame(dt)
        del dt
        dfs_i = h2o.H2OFrame(dfs[i])

        # features = list(train_hf.columns)
        if 'target' in features:
            features.remove('target')
        print('- ' * 10)
        for col in features:
            print("'{}',".format(col))
        print('- ' * 10)

        model = H2ODeepLearningEstimator(hidden=[200, 200], epochs=500)
        model.train(x=features, y='target', training_frame=train_hf)
        del train_hf

        # Out-of-fold predictions for fold i.
        p = model.predict(dfs_i)
        dfs_collector[i][r] = h2o.as_list(p, use_pandas=True).values
        print(dfs_collector[i].head())
        print(dfs_collector[i].head().dtypes)

        # Accumulate test-set predictions across folds.
        q = model.predict(val_hf)
        dd = h2o.as_list(q, use_pandas=True)
        a = dd['predict']
        a = np.array(a, dtype=pd.Series).tolist()
        v += a
        print('# ' * 10)
        for show_v in range(5):
            print(v[show_v])
        print('# ' * 10)

    # Average the accumulated test predictions over the K fold-models.
    test_collector[r] = v / K
    print(test_collector.head())
    return dfs_collector, test_collector, r
def checkpoint_new_category_in_response():
    """Checkpoint restart must fail when the response gains new categorical levels.

    Fix: both input frames were uploaded twice back-to-back; the redundant
    second pair of uploads is removed.
    """
    sv = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    m1 = H2ODeepLearningEstimator(epochs=100)
    m1.train(x=[0, 1, 2, 3], y=4, training_frame=sv)

    # attempt to continue building model, but with an expanded categorical response domain.
    # this should fail
    try:
        m2 = H2ODeepLearningEstimator(checkpoint=m1.model_id, epochs=200)
        m2.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
        assert False, "Expected continued model-building to fail with new categories introduced in response"
    except EnvironmentError:
        pass
def deeplearning_no_hidden():
    """A DL model with no hidden layers reduces to a linear map: weights are 3x4."""
    iris_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = H2ODeepLearningEstimator(hidden=[],
                                     loss="CrossEntropy",
                                     export_weights_and_biases=True)
    model.train(x=list(range(4)), y=4, training_frame=iris_hex)
    model.show()

    # 3 output classes x 4 input features, connected directly.
    layer0 = model.weights(0)
    assert layer0.shape[0] == 3
    assert layer0.shape[1] == 4
def varimp_plot_test():
    """Exercise varimp_plot() on DRF, GBM, DL and GLM models, incl. plot saving.

    Fix: the `kwargs` dict was assigned but never used — removed.
    """
    # import data set
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Constructing validation and train sets by sampling (20/80)
    s = cars[0].runif()
    cars_train = cars[s <= 0.8]
    cars_valid = cars[s > 0.8]

    # set list of features, target, and convert target to factor
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"
    # NOTE(review): this converts the column on `cars` after the train/valid
    # split frames were created — confirm the conversion reaches them.
    cars[response_col] = cars[response_col].asfactor()

    # Build and train a DRF model
    # to do: comment this out
    cars_rf = H2ORandomForestEstimator()
    cars_rf.train(x=predictors, y=response_col,
                  training_frame=cars_train, validation_frame=cars_valid)

    # Plot DRF Variable Importances, check that num_of_features accepts input
    cars_rf.varimp_plot(server=True)
    cars_rf.varimp_plot(num_of_features=2, server=True)

    # test saving:
    tmpdir = tempfile.mkdtemp(prefix="h2o-func")
    path = "{}/plot1.png".format(tmpdir)
    test_plot_result_saving(cars_rf.varimp_plot(server=True),
                            "{}/plot2.png".format(tmpdir),
                            cars_rf.varimp_plot(server=True, save_plot_path=path),
                            path)

    # Build and train a GBM model
    cars_gbm = H2OGradientBoostingEstimator()
    cars_gbm.train(x=predictors, y=response_col,
                   training_frame=cars_train, validation_frame=cars_valid)

    # Plot GBM Variable Importances
    cars_gbm.varimp_plot(server=True)
    cars_gbm.varimp_plot(num_of_features=2, server=True)

    # Build and train a Deep Learning model
    cars_dl = H2ODeepLearningEstimator(variable_importances=True)
    cars_dl.train(x=predictors, y=response_col,
                  training_frame=cars_train, validation_frame=cars_valid)

    # Plot Deep Learning Variable Importances
    cars_dl.varimp_plot(server=True)
    cars_dl.varimp_plot(num_of_features=2, server=True)

    # check that varimp_plot() uses std_coef_plot() for a glm
    cars_glm = H2OGeneralizedLinearEstimator()
    cars_glm.train(x=predictors, y=response_col,
                   training_frame=cars_train, validation_frame=cars_valid)
    cars_glm.varimp_plot(server=True)
    cars_glm.varimp_plot(num_of_features=2, server=True)
def tweedie_offset():
    """Check tweedie DL deviance/prediction stats without and with an offset column.

    Fix: bare range(3) passed to train() wrapped in list(range(3)) for
    Python-3 compatibility, consistent with the rest of the file.
    """
    insurance = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    # without offset
    dl = H2ODeepLearningEstimator(distribution="tweedie", hidden=[1], epochs=1000,
                                  train_samples_per_iteration=-1, reproducible=True,
                                  activation="Tanh", single_node_mode=False,
                                  balance_classes=False, force_load_balance=False,
                                  seed=23123, tweedie_power=1.5,
                                  score_training_samples=0, score_validation_samples=0)
    dl.train(x=list(range(3)), y="Claims", training_frame=insurance)
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.556 - mean_residual_deviance) < 1e-3, "Expected mean residual deviance to be 0.556, but got " \
                                                       "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(47.61 - predictions[0].mean()) < 1e-2, "Expected mean of predictions to be 47.61, but got " \
                                                      "{0}".format(predictions[0].mean())
    assert abs(1.94 - predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.94, but got " \
                                                    "{0}".format(predictions[0].min())
    assert abs(284.6 - predictions[0].max()) < 28, "Expected max of predictions to be 284.6, but got " \
                                                   "{0}".format(predictions[0].max())

    # with offset
    dl.train(x=list(range(3)), y="Claims", training_frame=insurance, offset_column="offset")
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.261 - mean_residual_deviance) < 1e-2, "Expected mean residual deviance to be 0.261, but got " \
                                                       "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(49.53 - predictions[0].mean()) < 1e-1, "Expected mean of predictions to be 49.53, but got " \
                                                      "{0}".format(predictions[0].mean())
    assert abs(1.074 - predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.074, but got " \
                                                     "{0}".format(predictions[0].min())
    assert abs(397.3 - predictions[0].max()) < 40, "Expected max of predictions to be 397.3, but got " \
                                                   "{0}".format(predictions[0].max())
def missing():
    """Train DL on weather data with increasing missing-value fractions.

    Fixes: Python-2 print statements converted to print() calls;
    `range(23) + range(24, ...)` (Py2 list concatenation of ranges, a
    TypeError on Py3) rewritten with explicit list(); x-range wrapped in
    list() for consistency.
    """
    # Connect to a pre-existing cluster
    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = [0, 0, 0, 0, 0, 0]
    for i in range(len(missing_ratios)):
        data = h2o.upload_file(
            pyunit_utils.locate("smalldata/junit/weather.csv"))
        data[15] = data[15].asfactor()  # ChangeTempDir
        data[16] = data[16].asfactor()  # ChangeTempMag
        data[17] = data[17].asfactor()  # ChangeWindDirect
        data[18] = data[18].asfactor()  # MaxWindPeriod
        data[19] = data[19].asfactor()  # RainToday
        data[21] = data[21].asfactor()  # PressureChange
        data[23] = data[23].asfactor()  # RainTomorrow

        print("For missing {0}%".format(missing_ratios[i] * 100))

        # add missing values to the data section of the file (leave the response alone)
        if missing_ratios[i] > 0:
            resp = data[23]
            pred = data[:, list(range(23)) + list(range(24, data.ncol))]
            data_missing = pred.insert_missing_values(
                fraction=missing_ratios[i])
            data_fin = data_missing.cbind(resp)
        else:
            data_fin = data

        # split into train + test datasets
        ratio = data_fin[0].runif()
        train = data_fin[ratio <= .75]
        test = data_fin[ratio > .75]

        from h2o.estimators.deeplearning import H2ODeepLearningEstimator
        hh = H2ODeepLearningEstimator(epochs=5,
                                      reproducible=True,
                                      seed=12345,
                                      activation='RectifierWithDropout',
                                      l1=1e-5,
                                      input_dropout_ratio=0.)
        hh.train(x=list(range(2, 22)), y=23, training_frame=train, validation_frame=test)
        errors[i] = hh.error()[0][1]

    for i in range(len(missing_ratios)):
        print("missing ratio: {0}% --> classification error: {1}".format(
            missing_ratios[i] * 100, errors[i]))
    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
def _get_mlp_model(predictor_col, response_col, train_f, val_f):
    """Build and train a small fixed-seed MLP (plain SGD, no adaptive rate)."""
    from h2o.estimators.deeplearning import H2ODeepLearningEstimator

    estimator = H2ODeepLearningEstimator(activation='tanh',
                                         adaptive_rate=False,
                                         nesterov_accelerated_gradient=False,
                                         hidden=[10, 10],
                                         seed=123,
                                         epochs=10)
    estimator.train(x=predictor_col,
                    y=response_col,
                    training_frame=train_f,
                    validation_frame=val_f)
    return estimator
def main():
    """Fit a wide single-layer DL net to a sine wave and plot extrapolation.

    Trains on glob_train_periods worth of data, then predicts on a wider
    test domain (glob_test_periods) to visualize out-of-domain behavior.
    """
    # Generate dataset for y = x^2
    df = sine_df(glob_train_periods, glob_density)

    # Start h2o
    # NOTE(review): hard-coded cluster address — confirm it matches the
    # intended environment before running.
    h2o.init(ip='192.168.0.41', port=65432, max_mem_size_GB=128)

    # Create H2OFrame
    column_types = ['real', 'real']
    hf = h2o.H2OFrame(df, column_types=column_types)
    train, val = hf.split_frame(ratios=[0.8])

    # Create model
    predictors = 'x'
    response = 'y'
    model = H2ODeepLearningEstimator(
        model_id='dnn_sine',
        epochs=5000,
        hidden=[800],
        activation='rectifier',
        # hidden_dropout_ratios=[0.0],
        l1=1e-4,
        l2=1e-4,
        max_w2=0.55,
        stopping_rounds=8,
        # stopping_tolerance=1e-4,
        stopping_metric='rmse',
        # Control scoring epochs
        score_interval=0,
        score_duty_cycle=1,
        shuffle_training_data=False,
        replicate_training_data=True,
        train_samples_per_iteration=int(0.5 * len(df) / 1.258),
    )
    model.train(x=predictors, y=response,
                training_frame=train, validation_frame=val)

    # Create test set with domain outside training
    test_df = sine_df(glob_test_periods, glob_density)
    test = h2o.H2OFrame(test_df, column_types=column_types)
    test_df['predict'] = model.predict(test).as_data_frame()

    # Plot results
    plt.plot(test_df['x'], test_df['y'])
    plt.plot(test_df['x'], test_df['predict'])
    plt.xlim(-glob_test_periods, glob_test_periods)
    plt.show()
def offset_init_train_deeplearning():
    """offset_column given in train(), in the constructor, or both, must agree."""
    # Connect to a pre-existing cluster
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    offset = h2o.H2OFrame([[.5]] * 398)
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    predictors = list(range(2, 8))

    # offset_column passed in the train method
    dl_train = H2ODeepLearningEstimator(hidden=[20, 20], epochs=10)
    dl_train.train(x=predictors, y="economy_20mpg", training_frame=cars, offset_column="x1")
    predictions_train = dl_train.predict(cars)

    # test offset_column passed in estimator init
    dl_init = H2ODeepLearningEstimator(hidden=[20, 20], epochs=10, offset_column="x1")
    dl_init.train(x=predictors, y="economy_20mpg", training_frame=cars)
    predictions_init = dl_init.predict(cars)

    # case the both offset column parameters are set and only the parameter in train will be used
    dl_init_train = H2ODeepLearningEstimator(hidden=[20, 20], epochs=10, offset_column="x1")
    dl_init_train.train(x=predictors, y="economy_20mpg", training_frame=cars, offset_column="x1")
    predictions_init_train = dl_init_train.predict(cars)

    # NOTE(review): these compare H2OFrames with ==; presumably h2o reduces
    # the elementwise result to a truth value — confirm this asserts what is
    # intended.
    assert predictions_train == predictions_init, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in constructor."
    assert predictions_train == predictions_init_train, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in both constructor and init."
def deeplearning_multi():
    """DL smoke test on prostate with several categorical predictors."""
    print("Test checks if Deep Learning works fine with a categorical dataset")
    # print(locate("smalldata/logreg/protstate.csv"))

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    # Convert target and several predictors to factors.
    for col in (1, 2, 3, 4, 5):  # CAPSULE, AGE, RACE, DPROS, DCAPS
        prostate[col] = prostate[col].asfactor()
    prostate = prostate.drop('ID')  # remove ID
    prostate.describe()

    model = H2ODeepLearningEstimator(loss="CrossEntropy",
                                     hidden=[10, 10],
                                     use_all_factor_levels=False)
    # Predictors: every column except the response.
    model.train(x=list(set(prostate.names) - {"CAPSULE"}),
                y="CAPSULE",
                training_frame=prostate)
    model.show()
def deeplearning_export():
    """Train a cross-validated DL model on a random problem type and export POJO+MOJO.

    Fix: `random.sample(list(range(3)), 1)[0]` replaced with the idiomatic
    `random.randrange(3)` (same uniform choice over {0, 1, 2}).
    """
    print("###### DEEPLEARNING ######")
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Randomly pick regression (0), binomial (1) or multinomial (2).
    problem = random.randrange(3)
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        frame[response_col] = frame[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        frame[response_col] = frame[response_col].asfactor()
    else:
        response_col = "economy"

    print("Response column: {0}".format(response_col))
    model = H2ODeepLearningEstimator(nfolds=random.randint(3, 10),
                                     fold_assignment="Modulo",
                                     hidden=[20, 20],
                                     epochs=10)
    model.train(x=predictors, y=response_col, training_frame=frame)

    h2o.download_pojo(model, path=RESULT_DIR)
    model.download_mojo(path=RESULT_DIR)