def mojo_predict_api_test(sandbox_dir):
    """Check how ``h2o.mojo_predict_csv`` resolves genmodel and output paths.

    A Bernoulli GBM is trained on the prostate dataset and exported through
    ``download_mojo``.  The test then verifies that:

    * prediction works when ``h2o-genmodel.jar`` sits next to the MOJO zip,
      both with and without an explicit ``genmodel_jar_path``;
    * prediction raises ``RuntimeError`` when the jar only exists under a
      custom name in another directory and no ``genmodel_jar_path`` is given;
    * prediction succeeds with the custom jar path, writing either to
      ``prediction.csv`` in ``sandbox_dir`` or to an explicit
      ``output_csv_path``.

    :param sandbox_dir: directory used for the input CSV, the MOJO zip, the
        genmodel jar and the ``prediction.csv`` output.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    # Row index 1, columns 2 onward, becomes the scoring input.
    h2o.export_file(prostate[1, 2:], input_csv)

    prostate[1] = prostate[1].asfactor()
    gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
    gbm.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=prostate)

    # Export the MOJO; the jar is expected to appear alongside it.
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(gbm, model_zip_path)
    for expected_file in (model_zip_path, genmodel_path):
        assert os.path.isfile(expected_file)

    # Scenario 1: jar next to the MOJO, implicit and explicit jar path.
    base_args = dict(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
    h2o.mojo_predict_csv(**base_args)
    h2o.mojo_predict_csv(genmodel_jar_path=genmodel_path, **base_args)
    assert os.path.isfile(output_csv)
    for leftover in (model_zip_path, genmodel_path, output_csv):
        os.remove(leftover)

    # Scenario 2: jar stored under a custom name in a separate directory.
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(gbm, model_zip_path, genmodel_path)
        for expected_file in (model_zip_path, genmodel_path):
            assert os.path.isfile(expected_file)

        # Without an explicit jar path the call must fail.
        try:
            h2o.mojo_predict_csv(**base_args)
        except RuntimeError:
            pass
        else:
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        assert not os.path.isfile(output_csv)

        # With the custom jar path the default output file is produced.
        h2o.mojo_predict_csv(genmodel_jar_path=genmodel_path, **base_args)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        # Scenario 3: custom jar path combined with a custom output path.
        output_csv = "%s/out.prediction" % other_sandbox_dir
        h2o.mojo_predict_csv(genmodel_jar_path=genmodel_path, output_csv_path=output_csv,
                             **base_args)
        assert os.path.isfile(output_csv)
        for leftover in (model_zip_path, genmodel_path, output_csv):
            os.remove(leftover)
    finally:
        shutil.rmtree(other_sandbox_dir)
def mojo_predict_csv_test(target_dir):
    """Compare in-cluster Isolation Forest predictions with MOJO CSV scoring.

    Trains an Isolation Forest on the prostate dataset, scores the same data
    in H2O and through ``h2o.mojo_predict_csv``, and asserts that both
    results are equal as pandas frames (dtype differences are ignored).

    :param target_dir: directory that receives the MOJO zip and the
        ``prediction.csv`` output file.
    """
    mojo_zip_path = os.path.join(target_dir, "prostate_isofor_model.zip")

    data_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=data_path)

    # =================================================================
    # Isolation Forest
    # =================================================================
    forest = H2OIsolationForestEstimator()
    forest.train(training_frame=prostate)

    # Reference predictions computed by the live model.
    pred_h2o_df = forest.predict(prostate).as_data_frame(use_pandas=True)
    download_mojo(forest, mojo_zip_path)

    output_csv = "%s/prediction.csv" % target_dir
    print("\nPerforming Isolation Forest Prediction using MOJO @... " + target_dir)
    mojo_rows = h2o.mojo_predict_csv(
        input_csv_path=data_path,
        mojo_zip_path=mojo_zip_path,
        output_csv_path=output_csv,
    )
    # Convert the MOJO rows to floats so they are comparable to the H2O frame.
    pred_mojo_df = pd.DataFrame(
        mojo_rows,
        dtype=np.float64,
        columns=["predict", "mean_length"],
    )

    for banner, frame in (("*** pred_h2o_df ***", pred_h2o_df),
                          ("***pred_mojo_df ***", pred_mojo_df)):
        print(banner)
        print(frame)

    assert_frame_equal(pred_h2o_df, pred_mojo_df, check_dtype=False)
def demo_xgboost_concurrent_contributions():
    """Check that concurrent XGBoost MOJO contribution scoring is consistent.

    Trains an XGBoost classifier on the prostate dataset, downloads its MOJO
    (with the genmodel jar) into ``results/xgb_concurrent`` and runs
    ``h2o.mojo_predict_csv`` with ``predict_contributions=True`` and the
    extra arguments ``--testConcurrent <concurrency>``.  The rows returned
    for ``predictions.csv`` are the reference; each
    ``predictions.csv.<id>`` file for ``id`` in ``range(concurrency)`` must
    contain exactly the same rows.
    """
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=prostate_path)
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    xgb_model = H2OXGBoostEstimator()
    xgb_model.train(
        x=["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        y="CAPSULE",
        training_frame=prostate,
    )

    results_dir = os.path.join(pyunit_utils.locate("results"), "xgb_concurrent")
    os.mkdir(results_dir)
    mojo_path = xgb_model.download_mojo(results_dir, get_genmodel_jar=True)

    # how many parallel threads to run
    concurrency = 4

    reference_result = h2o.mojo_predict_csv(
        input_csv_path=prostate_path,
        mojo_zip_path=mojo_path,
        output_csv_path=os.path.join(results_dir, "predictions.csv"),
        predict_contributions=True,
        extra_cmd_args=["--testConcurrent", str(concurrency)],
    )
    print(reference_result)

    # Iterate over `concurrency` (not a repeated literal) so the number of
    # files checked always matches the value handed to --testConcurrent.
    # NOTE(review): presumably the predictor writes one
    # "predictions.csv.<id>" file per concurrent worker -- confirm.
    for test_id in range(concurrency):
        concurrent_path = os.path.join(results_dir, "predictions.csv." + str(test_id))
        with open(concurrent_path) as csv_file:
            concurrent_result = list(csv.DictReader(csv_file))
        assert reference_result == concurrent_result, \
            "Concurrent result #%d differs from the reference result" % test_id
def build_mojo_pipeline():
    """Build a PCA -> K-Means MOJO pipeline and compare it with H2O scoring.

    Steps:

    1. Train a 2-component PCA on the iris training set and a 3-cluster
       K-Means on the PCA output.
    2. Download both MOJOs (plus ``h2o-genmodel.jar``) into the results
       directory.
    3. Run ``hex.genmodel.tools.BuildPipeline`` from the genmodel jar to
       combine the two MOJOs into ``pipe.zip``, mapping every principal
       component column to ``<pca mojo name>:<component index>``.
    4. Score the raw iris CSV through ``pipe.zip`` and assert that every
       predicted cluster equals the in-cluster K-Means prediction.

    :raises AssertionError: if the BuildPipeline process exits with a
        non-zero status or if any prediction differs.
    """
    results_dir = pyunit_utils.locate("results")
    iris_csv = pyunit_utils.locate('smalldata/iris/iris_train.csv')
    iris = h2o.import_file(iris_csv)

    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.train(training_frame=iris)
    principal_components = pca.predict(iris)

    km = H2OKMeansEstimator(k=3)
    km.train(training_frame=principal_components)

    pca_mojo_path = pca.download_mojo(path=results_dir)
    km_mojo_path = km.download_mojo(get_genmodel_jar=True, path=results_dir)

    pipeline_path = os.path.join(results_dir, "pipe.zip")

    # The PCA MOJO is referenced in the mappings by its file name up to the
    # first dot.
    pca_mojo_name = os.path.basename(pca_mojo_path).split('.')[0]
    mappings = [
        pc + '=' + pca_mojo_name + ':' + str(i)
        for i, pc in enumerate(principal_components.columns)
    ]

    java_cmd = [
        "java", "-cp", os.path.join(results_dir, "h2o-genmodel.jar"),
        "hex.genmodel.tools.BuildPipeline", "--mapping",
    ]
    java_cmd += mappings
    java_cmd += ["--output", pipeline_path, "--input", km_mojo_path, pca_mojo_path]

    # Capture the combined stdout/stderr and fail right here if the tool
    # reports an error, instead of failing later on a missing pipe.zip.
    builder = subprocess.Popen(java_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    builder_output, _ = builder.communicate()
    assert builder.returncode == 0, "BuildPipeline exited with code %d:\n%s" % (
        builder.returncode, builder_output.decode("utf-8", "replace"))

    h2o_preds = km.predict(principal_components)
    mojo_preds_raw = h2o.mojo_predict_csv(input_csv_path=iris_csv, mojo_zip_path=pipeline_path)
    mojo_preds = h2o.H2OFrame([c['cluster'] for c in mojo_preds_raw], column_names=['predict'])

    # A mean of 1 over the element-wise equality means every row matched.
    assert (mojo_preds == h2o_preds).mean()[0, "predict"] == 1
def test_setInvNumNA():
    """Compare GLM MOJO scoring with ``setInvNumNA=True`` against H2O scoring.

    A gaussian GLM with mean imputation is built and saved as a MOJO through
    ``pyunit_utils.build_save_model_GLM``.  The MOJO scores the
    ``..._test.csv`` file via ``h2o.mojo_predict_csv(..., setInvNumNA=True)``
    while the live model scores the imported ``..._test_model.csv`` frame.
    The first five predictions must agree within ``1e-6``.

    NOTE(review): presumably ``setInvNumNA`` makes the MOJO predictor treat
    invalid numeric input as NA, and the two test files differ only in how
    those values are encoded -- confirm against the datasets.
    """
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/pubdev_6617_setInvNumNA_train.csv"))
    testdata = pyunit_utils.locate("smalldata/glm_test/pubdev_6617_setInvNumNA_test.csv")
    testdataModel = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/pubdev_6617_setInvNumNA_test_model.csv"))

    response = "C2"
    x = ["C1"]
    params = {'missing_values_handling': "MeanImputation", 'family': 'gaussian'}

    # build and save mojo model
    glm_model = pyunit_utils.build_save_model_GLM(params, x, train, response)
    MOJONAME = pyunit_utils.getMojoName(glm_model._id)

    # NOTE(review): '__file__' is a string literal here, so realpath()
    # resolves it against the current working directory rather than this
    # module's location.  Left untouched because the save helper presumably
    # writes the MOJO to the same place -- verify before changing.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    mojoLoco = os.path.normpath(os.path.join(TMPDIR, MOJONAME + '.zip'))
    mojoOut = os.path.normpath(os.path.join(TMPDIR, "mojo_out.csv"))

    # locate directory of genmodel.jar: everything above the 'h2o-py' folder
    path_parts = str(TMPDIR).split('/')
    genJarDir = '/'.join(path_parts[0:path_parts.index('h2o-py')])
    jarpath = os.path.join(genJarDir, "h2o-assemblies/genmodel/build/libs/genmodel.jar")

    mojoPredict = h2o.mojo_predict_csv(input_csv_path=testdata, mojo_zip_path=mojoLoco,
                                       output_csv_path=mojoOut, genmodel_jar_path=jarpath,
                                       verbose=True, setInvNumNA=True)
    modelPred = glm_model.predict(testdataModel)

    for ind in range(5):
        mojo_value = float(mojoPredict[ind]['predict'])
        model_value = modelPred[ind, 0]
        # Report the MOJO value of the row being compared (not always row 0).
        assert abs(mojo_value - model_value) < 1e-6, \
            "model predict {1} and mojo predict {0} differs too much".format(mojo_value, model_value)
def mojo_predict_csv_test(target_dir):
    """Verify GBM MOJO predictions and contributions against in-cluster scoring.

    Three GBM models are trained in turn (gaussian regression and Bernoulli
    classification on prostate, multinomial classification on iris).  For
    each one a single-row frame is scored both by the live model and by
    ``h2o.mojo_predict_csv`` on the downloaded MOJO, and the values must be
    exactly equal.  For the regression and binomial models the per-feature
    contributions written to the output CSV are also compared with
    ``predict_contributions`` of the live model.

    :param target_dir: directory that receives the MOJO zip, the exported
        input CSV files and ``output.csv``.
    """
    mojo_zip_path = os.path.join(target_dir, "prostate_gbm_model.zip")

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # One-row scoring frame: row index 1 of the test split, columns 2 onward.
    pdf = test[1, 2:]
    input_csv = "%s/in.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    # Keyword arguments shared by every MOJO scoring call on prostate data.
    csv_args = dict(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                    output_csv_path=output_csv)

    # =================================================================
    # Regression
    # =================================================================
    reg_model = H2OGradientBoostingEstimator(distribution="gaussian")
    reg_model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    reg_pred_frame = reg_model.predict(pdf)
    reg_contribs = reg_model.predict_contributions(pdf)
    reg_expected = reg_pred_frame[0, 0]
    print("Regression prediction: " + str(reg_expected))

    download_mojo(reg_model, mojo_zip_path)

    print("\nPerforming Regression Prediction using MOJO @... " + target_dir)
    mojo_rows = h2o.mojo_predict_csv(**csv_args)
    print("Prediction result: " + str(mojo_rows))
    assert reg_expected == float(mojo_rows[0]['predict']), \
        "expected predictions to be the same for binary and MOJO model for regression"

    print("\nComparing Regression Contributions using MOJO @... " + target_dir)
    reg_contrib_rows = h2o.mojo_predict_csv(predict_contributions=True, **csv_args)
    assert reg_contrib_rows is not None
    reg_contribs_csv = pandas.read_csv(output_csv)
    assert_frame_equal(reg_contribs.as_data_frame(use_pandas=True), reg_contribs_csv,
                       check_dtype=False)

    # =================================================================
    # Binomial
    # =================================================================
    train[1] = train[1].asfactor()
    bin_model = H2OGradientBoostingEstimator(distribution="bernoulli")
    bin_model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    bin_pred_frame = bin_model.predict(pdf)
    bin_contribs = bin_model.predict_contributions(pdf)

    # Columns 1 and 2 of the prediction frame hold the class probabilities.
    h2o_p0 = bin_pred_frame[0, 1]
    h2o_p1 = bin_pred_frame[0, 2]
    print("Binomial prediction: p0: " + str(h2o_p0))
    print("Binomial prediction: p1: " + str(h2o_p1))

    download_mojo(bin_model, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    mojo_rows = h2o.mojo_predict_csv(**csv_args)

    mojo_p0 = float(mojo_rows[0]['0'])
    mojo_p1 = float(mojo_rows[0]['1'])
    print("Binomial prediction: p0: " + str(mojo_p0))
    print("Binomial prediction: p1: " + str(mojo_p1))

    assert h2o_p0 == mojo_p0, \
        "expected predictions to be the same for binary and MOJO model for Binomial - p0"
    assert h2o_p1 == mojo_p1, \
        "expected predictions to be the same for binary and MOJO model for Binomial - p1"

    print("\nComparing Binary Classification Contributions using MOJO @... " + target_dir)
    bin_contrib_rows = h2o.mojo_predict_csv(predict_contributions=True, **csv_args)
    assert bin_contrib_rows is not None
    bin_contribs_csv = pandas.read_csv(output_csv)
    print(bin_contribs_csv)
    print(bin_contribs.as_data_frame(use_pandas=True))
    assert_frame_equal(bin_contribs.as_data_frame(use_pandas=True), bin_contribs_csv,
                       check_dtype=False)

    # =================================================================
    # Multinomial
    # =================================================================
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    split = iris[0].runif()
    train = iris[split < 0.90]
    test = iris[split >= 0.10]

    # One-row scoring frame: row index 1 of the test split, first four columns.
    pdf = test[1, 0:4]
    input_csv = "%s/in-multi.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    multi_model = H2OGradientBoostingEstimator()
    multi_model.train(x=['C1', 'C2', 'C3', 'C4'], y='C5', training_frame=train)

    multi_pred_frame = multi_model.predict(pdf)
    h2o_setosa = multi_pred_frame[0, 1]
    h2o_versicolor = multi_pred_frame[0, 2]
    h2o_virginica = multi_pred_frame[0, 3]
    print("Multinomial prediction (Binary): p0: " + str(h2o_setosa))
    print("Multinomial prediction (Binary): p1: " + str(h2o_versicolor))
    print("Multinomial prediction (Binary): p2: " + str(h2o_virginica))

    download_mojo(multi_model, mojo_zip_path)

    print("\nPerforming Multinomial Prediction using MOJO @... " + target_dir)
    mojo_rows = h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                     output_csv_path=output_csv)

    mojo_setosa = float(mojo_rows[0]['Iris-setosa'])
    mojo_versicolor = float(mojo_rows[0]['Iris-versicolor'])
    mojo_virginica = float(mojo_rows[0]['Iris-virginica'])
    print("Multinomial prediction (MOJO): p0: " + str(mojo_setosa))
    print("Multinomial prediction (MOJO): p1: " + str(mojo_versicolor))
    print("Multinomial prediction (MOJO): p2: " + str(mojo_virginica))

    assert h2o_setosa == mojo_setosa, \
        "expected predictions to be the same for binary and MOJO model for Multinomial - p0"
    assert h2o_versicolor == mojo_versicolor, \
        "expected predictions to be the same for binary and MOJO model for Multinomial - p1"
    assert h2o_virginica == mojo_virginica, \
        "expected predictions to be the same for binary and MOJO model for Multinomial - p2"
def mojo_predict_csv_test(target_dir):
    """Verify that GBM MOJO CSV predictions equal in-cluster predictions.

    Covers three model types in sequence: gaussian regression and Bernoulli
    classification on the prostate dataset, then multinomial classification
    on iris.  In each case one row is exported to CSV, scored by the live
    model and by ``h2o.mojo_predict_csv`` on the downloaded MOJO, and the
    resulting values are required to be exactly equal.

    :param target_dir: directory that receives the MOJO zip, the exported
        input CSV files and ``output.csv``.
    """
    mojo_file_name = "prostate_gbm_model.zip"
    mojo_zip_path = os.path.join(target_dir, mojo_file_name)

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Single scoring row: row index 1 of the test split, columns 2 onward.
    pdf = test[1, 2:]
    input_csv = "%s/in.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    # =================================================================
    # Regression
    # =================================================================
    regression_gbm1 = H2OGradientBoostingEstimator(distribution="gaussian")
    regression_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_reg = regression_gbm1.predict(pdf)
    p1 = pred_reg[0, 0]
    print("Regression prediction: " + str(p1))

    download_mojo(regression_gbm1, mojo_zip_path)

    print("\nPerforming Regression Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)
    print("Prediction result: " + str(prediction_result))
    assert p1 == float(prediction_result[0]['predict']), \
        "expected predictions to be the same for binary and MOJO model for regression"

    # =================================================================
    # Binomial
    # =================================================================
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = H2OGradientBoostingEstimator(distribution="bernoulli")
    bernoulli_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_bin = bernoulli_gbm1.predict(pdf)

    # Columns 1 and 2 of the prediction frame hold the class probabilities.
    binary_prediction_0 = pred_bin[0, 1]
    binary_prediction_1 = pred_bin[0, 2]
    print("Binomial prediction: p0: " + str(binary_prediction_0))
    print("Binomial prediction: p1: " + str(binary_prediction_1))

    download_mojo(bernoulli_gbm1, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)

    mojo_prediction_0 = float(prediction_result[0]['0'])
    mojo_prediction_1 = float(prediction_result[0]['1'])
    print("Binomial prediction: p0: " + str(mojo_prediction_0))
    print("Binomial prediction: p1: " + str(mojo_prediction_1))

    assert binary_prediction_0 == mojo_prediction_0, \
        "expected predictions to be the same for binary and MOJO model for Binomial - p0"
    assert binary_prediction_1 == mojo_prediction_1, \
        "expected predictions to be the same for binary and MOJO model for Binomial - p1"

    # =================================================================
    # Multinomial
    # =================================================================
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    r = iris[0].runif()
    train = iris[r < 0.90]
    test = iris[r >= 0.10]

    # Single scoring row: row index 1 of the test split, first four columns.
    pdf = test[1, 0:4]
    input_csv = "%s/in-multi.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    multi_gbm = H2OGradientBoostingEstimator()
    multi_gbm.train(x=['C1', 'C2', 'C3', 'C4'], y='C5', training_frame=train)

    pred_multi = multi_gbm.predict(pdf)
    multinomial_prediction_1 = pred_multi[0, 1]
    multinomial_prediction_2 = pred_multi[0, 2]
    multinomial_prediction_3 = pred_multi[0, 3]
    print("Multinomial prediction (Binary): p0: " + str(multinomial_prediction_1))
    print("Multinomial prediction (Binary): p1: " + str(multinomial_prediction_2))
    print("Multinomial prediction (Binary): p2: " + str(multinomial_prediction_3))

    download_mojo(multi_gbm, mojo_zip_path)

    # This section scores the multinomial model, so the log line says so
    # (it previously repeated the "Binomial" banner).
    print("\nPerforming Multinomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)

    mojo_prediction_1 = float(prediction_result[0]['Iris-setosa'])
    mojo_prediction_2 = float(prediction_result[0]['Iris-versicolor'])
    mojo_prediction_3 = float(prediction_result[0]['Iris-virginica'])
    print("Multinomial prediction (MOJO): p0: " + str(mojo_prediction_1))
    print("Multinomial prediction (MOJO): p1: " + str(mojo_prediction_2))
    print("Multinomial prediction (MOJO): p2: " + str(mojo_prediction_3))

    assert multinomial_prediction_1 == mojo_prediction_1, \
        "expected predictions to be the same for binary and MOJO model for Multinomial - p0"
    assert multinomial_prediction_2 == mojo_prediction_2, \
        "expected predictions to be the same for binary and MOJO model for Multinomial - p1"
    assert multinomial_prediction_3 == mojo_prediction_3, \
        "expected predictions to be the same for binary and MOJO model for Multinomial - p2"
def mojo_predict_csv_test(sandbox_dir):
    """Check CoxPH MOJO CSV scoring: path handling and prediction values.

    Trains a stratified Cox Proportional Hazards model on the heart dataset
    and exports it through ``download_mojo``.  The test verifies that:

    * prediction works when ``h2o-genmodel.jar`` sits next to the MOJO zip,
      both with and without an explicit ``genmodel_jar_path``;
    * prediction raises ``RuntimeError`` when the jar only exists under a
      custom name in another directory and no ``genmodel_jar_path`` is given;
    * prediction with the custom jar and a custom ``output_csv_path``
      returns one row per input row whose ``lp`` values equal the
      in-cluster predictions.

    :param sandbox_dir: directory used for the input CSV, the MOJO zip, the
        genmodel jar and the ``prediction.csv`` output.
    """
    heart = h2o.import_file(path=pyunit_utils.locate("smalldata/coxph_test/heart.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(heart, input_csv)

    heart['transplant'] = heart['transplant'].asfactor()
    coxph = H2OCoxProportionalHazardsEstimator(stratify_by=["transplant"],
                                               start_column="start",
                                               stop_column="stop")
    coxph.train(x=["age", "surgery", "transplant"], y="event", training_frame=heart)

    # Reference predictions computed by the live model.
    h2o_prediction = coxph.predict(heart)

    # Export the MOJO; the jar is expected to appear alongside it.
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(coxph, model_zip_path)
    for expected_file in (model_zip_path, genmodel_path):
        assert os.path.isfile(expected_file)

    # Scenario 1: jar next to the MOJO, implicit and explicit jar path.
    base_args = dict(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
    h2o.mojo_predict_csv(**base_args)
    h2o.mojo_predict_csv(genmodel_jar_path=genmodel_path, **base_args)
    assert os.path.isfile(output_csv)
    for leftover in (model_zip_path, genmodel_path, output_csv):
        os.remove(leftover)

    # Scenario 2: jar stored under a custom name in a separate directory.
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(coxph, model_zip_path, genmodel_path)
        for expected_file in (model_zip_path, genmodel_path):
            assert os.path.isfile(expected_file)

        # Without an explicit jar path the call must fail.
        try:
            h2o.mojo_predict_csv(**base_args)
        except RuntimeError:
            pass
        else:
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        assert not os.path.isfile(output_csv)

        # With the custom jar path the default output file is produced.
        h2o.mojo_predict_csv(genmodel_jar_path=genmodel_path, **base_args)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        # Scenario 3: custom jar path combined with a custom output path;
        # keep the returned rows for the value comparison below.
        output_csv = "%s/out.prediction" % other_sandbox_dir
        mojo_prediction = h2o.mojo_predict_csv(genmodel_jar_path=genmodel_path,
                                               output_csv_path=output_csv,
                                               **base_args)
        assert os.path.isfile(output_csv)
        for leftover in (model_zip_path, genmodel_path, output_csv):
            os.remove(leftover)

        print(h2o_prediction)
        print(mojo_prediction)
        assert len(mojo_prediction) == h2o_prediction.nrows

        # Compare the "lp" column of both results as pandas frames.
        expected_df = h2o_prediction.as_data_frame(use_pandas=True)
        mojo_df = pandas.DataFrame([float(row['lp']) for row in mojo_prediction],
                                   columns=["lp"])
        assert_frame_equal(expected_df, mojo_df, check_dtype=False)
    finally:
        shutil.rmtree(other_sandbox_dir)