def test_mojo_ids():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines, verbose=False)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)
    original_model_id = model.model_id
    print(original_model_id)

    # Import the MOJO from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename, model_id=original_model_id)
    print(mojo_model.model_id)
    assert_equals(mojo_model.model_id, original_model_id, "Ids should be the same.")

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Upload the MOJO from the temporary file
    mojo_model_up = h2o.upload_mojo(original_model_filename, model_id=original_model_id)
    print(mojo_model_up.model_id)
    assert_equals(mojo_model_up.model_id, original_model_id, "Ids should be the same.")

    # Load the MOJO model from file
    mojo_model_from_file = H2OGenericEstimator.from_file(original_model_filename, original_model_id)
    print(mojo_model_from_file.model_id)
    assert_equals(mojo_model_from_file.model_id, original_model_id, "Ids should be the same.")

    # Test that model_id is initialized from the path when none is given;
    # the MOJO file is named after the original model, so the derived id matches it
    mojo_model_up_wid = h2o.upload_mojo(original_model_filename)
    print(mojo_model_up_wid.model_id)
    assert_equals(mojo_model_up_wid.model_id, original_model_id, "Ids should be the same.")

    mojo_model_im_wid = h2o.import_mojo(original_model_filename)
    print(mojo_model_im_wid.model_id)
    assert_equals(mojo_model_im_wid.model_id, original_model_id, "Ids should be the same.")
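# The tests here call assert_equals without showing where it comes from; in the
# H2O test suite it is provided by the shared test utilities. A minimal stand-in
# (an assumption for illustration, not the real helper) would be:
def assert_equals(expected, actual, message=""):
    # The message is only shown when the comparison fails.
    assert expected == actual, "%s Expected: %s, actual: %s" % (message, expected, actual)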
def mojo_convenience():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)

    # Load the model from the temporary file
    # (import_mojo reads a path that is visible to the H2O server)
    mojo_model = h2o.import_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO UPLOAD TEST
    #####

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Load the model from the temporary file
    # (upload_mojo sends a client-local file to the server)
    mojo_model = h2o.upload_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
def titanic():
    df = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"),
                         col_types={'pclass': "enum", 'survived': "enum"})
    x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"
    print(rfit.rule_importance())
    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)
def iris():
    df = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"),
                         col_types={'species': "enum"})
    x = df.columns
    y = "species"
    x.remove(y)

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y=y, validation_frame=test)

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"
    print(rfit.rule_importance())
    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    frame = rfit.predict_rules(train, ['M0T38N5_Iris-virginica'])
    assert frame.sum().getrow()[0] == 49.0

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)

    # Test predict_rules also on linear variable input
    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules_and_linear")
    rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
    print(rfit.rule_importance())
    frame = rfit.predict_rules(train, ['linear.petal_len_Iris-setosa',
                                       'linear.petal_wid_Iris-virginica'])
    assert frame.sum().getrow()[0] == train.nrows
def mojo_convenience():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)

    # Load the model from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO UPLOAD TEST
    #####

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Load the model from the temporary file
    mojo_model = h2o.upload_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # POJO download with re-import as a generic model
    #####
    pojo_directory = os.path.join(pyunit_utils.locate("results"), model.model_id + ".java")
    pojo_path = model.download_pojo(path=pojo_directory)
    pojo_model = h2o.import_mojo(pojo_path)
    predictions2 = pojo_model.predict(airlines)
    assert predictions2 is not None
    assert predictions2.nrows == 24421
    # Predictions from the re-imported POJO must match the MOJO predictions exactly
    assert_frame_equal(predictions.as_data_frame(), predictions2.as_data_frame())
def gbm_mojo_reproducibility_info():
    prostate_hex = h2o.import_file(pyunit_utils.locate("smalldata/testng/prostate.csv"))

    # Note: despite the function name, this test trains an Isolation Forest
    model = H2OIsolationForestEstimator()
    model.train(training_frame=prostate_hex)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    if_model = h2o.upload_mojo(mojo_path=mojo_path)
    # Bare isinstance() calls verify nothing; assert them so the test can fail
    assert isinstance(if_model._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(if_model._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert if_model._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
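# The reproducibility tests above and below index reproducibility_information_table
# positionally, which is brittle if the order of subtables changes. A small helper
# (an assumption for illustration -- it relies on H2OTwoDimTable's `_table_header`
# attribute and is not part of the original tests) could look the subtable up by name:
def get_repro_subtable(model, name):
    # Each entry in the list is a two-dimensional table carrying its own header.
    for table in model._model_json['output']['reproducibility_information_table']:
        if name in table._table_header:
            return table
    return None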
def dl_mojo_reproducibility_info():
    # Training data
    train_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    train_data.describe()  # describe() prints the summary itself
    train_data.head()

    # Testing data
    test_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    test_data.describe()
    test_data.head()

    # Run DeepLearning
    model = H2ODeepLearningEstimator(loss="CrossEntropy", epochs=1000, hidden=[20, 20, 20])
    model.train(x=list(range(1, train_data.ncol)), y="Angaus",
                training_frame=train_data, validation_frame=test_data)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    dlModel = h2o.upload_mojo(mojo_path=mojo_path)
    assert isinstance(dlModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(dlModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert dlModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
    assert dlModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][1] == 'validation_frame'
def test_helper(train_path, test_path, target, classification, blending, metalearner_transform):
    train = h2o.import_file(path=pu.locate(train_path))
    test = h2o.import_file(path=pu.locate(test_path))
    if classification:
        train[target] = train[target].asfactor()

    if blending:
        train, blend = train.split_frame(ratios=[.7], seed=seed)

    # Base models need cross-validation predictions unless a blending frame is used
    model_args = dict() if blending else dict(nfolds=3, fold_assignment="Modulo",
                                              keep_cross_validation_predictions=True)

    gbm = H2OGradientBoostingEstimator(ntrees=10, seed=seed, **model_args)
    gbm.train(y=target, training_frame=train)
    rf = H2ORandomForestEstimator(ntrees=10, seed=seed, **model_args)
    rf.train(y=target, training_frame=train)
    se = H2OStackedEnsembleEstimator(base_models=[rf, gbm],
                                     metalearner_transform=metalearner_transform)
    se.train(y=target, training_frame=train,
             **(dict(blending_frame=blend) if blending else dict()))

    se_predictions = se.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = se.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pu.compare_frames(se_predictions, mojo_predictions, 0)
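# A hypothetical invocation of test_helper above, mirroring the style of the other
# tests. The dataset paths, the target column, and the module-level `seed` value are
# assumptions for illustration; test_helper reads `seed` as a global.
seed = 1234

def test_se_mojo_logit_transform():
    test_helper("smalldata/testng/airlines_train.csv",
                "smalldata/testng/airlines_test.csv",
                target="IsDepDelayed",
                classification=True,
                blending=False,
                metalearner_transform="Logit")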
def xgb_mojo_reproducibility_info():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    df["Angaus"] = df["Angaus"].asfactor()
    df["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(df.nrow, 1)).tolist())[0]
    print(df.col_names)
    train, calib = df.split_frame(ratios=[.8],
                                  destination_frames=["eco_train", "eco_calib"],
                                  seed=42)

    model = H2OXGBoostEstimator(ntrees=100, distribution="bernoulli", min_rows=10,
                                max_depth=5, weights_column="Weights",
                                calibrate_model=True, calibration_frame=calib)
    model.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    xgbModel = h2o.upload_mojo(mojo_path=mojo_path)
    assert isinstance(xgbModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(xgbModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert xgbModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
    assert xgbModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][2] == 'calibration_frame'
def titanic():
    df = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"),
                         col_types={'pclass': "enum", 'survived': "enum"})
    x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)

    # A weaker lambda penalty should keep more rules in the model
    rfit2 = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                                seed=1234, model_type="rules", lambda_=1e-8)
    rfit2.train(training_frame=train, x=x, y="survived", validation_frame=test)
    assert len(rfit.rule_importance()['rule']) < len(rfit2.rule_importance()['rule'])

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"
    print(rfit.rule_importance())

    # Verify the reported support of the top rule by re-applying its conditions;
    # the thresholds below are taken from rfit.rule_importance()['rule'][0].
    # Convert the frame once up front instead of on every loop iteration.
    train_df = train.as_data_frame()
    count = 0
    for i in range(train.nrows):
        if (train_df['age'][i] >= 14.977890968322754 or math.isnan(train_df['age'][i])) and \
           (train_df['fare'][i] < 56.036006927490234 or math.isnan(train_df['fare'][i])) and \
           (train_df['sex'][i] == "male") and \
           (train_df['sibsp'][i] < 3.5 or math.isnan(train_df['sibsp'][i])):
            count = count + 1
    assert abs(rfit.rule_importance()['support'][0] - count / train.nrows) < 1e-6

    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)

    rfit = H2ORuleFitEstimator(min_rule_length=1, max_rule_length=1, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)
    print(rfit.rule_importance())

    # This condition is taken from the resulting rule rfit.rule_importance()['rule'][0]
    count = 0
    for i in range(train.nrows):
        if train_df['sex'][i] == 'female':
            count = count + 1
    assert abs(rfit.rule_importance()['support'][0] - count / train.nrows) < 1e-6
def __setstate__(self, state):
    # Rebuild the live H2O model handle from the persisted MOJO path on unpickling
    self._mojo_path = state.path
    self._mojo_model = h2o.upload_mojo(state.path)
    self._column_names = state.colnames
def __init__(self, mojo_path, column_names=None):
    self._mojo_path = mojo_path
    self._mojo_model = h2o.upload_mojo(mojo_path)
    self._column_names = column_names
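# __setstate__ above expects a state object exposing `path` and `colnames`
# attributes, so the class also needs a matching __getstate__. The sketch below is
# an assumption based on those attribute names, not the original implementation:
# the live H2O model handle is not picklable, so only the MOJO path and column
# names are stored, and the handle is re-uploaded on unpickling.
from collections import namedtuple

_MojoState = namedtuple("_MojoState", ["path", "colnames"])  # hypothetical state container

def __getstate__(self):
    # Persist just enough for __setstate__ to rebuild the model.
    return _MojoState(path=self._mojo_path, colnames=self._column_names)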
def generate_and_import_combined_pojo():
    if sys.version_info[0] < 3:  # Python 2
        print("This example needs Python 3.x+")
        return

    weather_orig = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/weather.csv"))
    weather = weather_orig  # working copy
    features = list(set(weather.names) - {"Date", "RainTomorrow", "Sunshine"})
    features.sort()
    response = "RISK_MM"

    glm_model = H2OGeneralizedLinearEstimator()
    glm_model.train(x=features, y=response, training_frame=weather)
    glm_preds = glm_model.predict(weather)

    gbm_model = H2OGradientBoostingEstimator(ntrees=5)
    gbm_model.train(x=features, y=response, training_frame=weather)
    gbm_preds = gbm_model.predict(weather)

    # Drop columns that we will calculate in POJO manually (we will recreate them in POJO to be the exact same)
    weather = weather.drop("ChangeTemp")
    weather = weather.drop("ChangeTempDir")

    (combined_pojo_name, combined_pojo_path) = generate_combined_pojo(glm_model, gbm_model)
    print("Combined POJO was stored in: " + combined_pojo_path)

    # Note: when using upload_mojo - always specify model_id=<POJO class name>
    pojo_model = h2o.upload_mojo(combined_pojo_path, model_id=combined_pojo_name)

    # Testing begins

    # Sanity test - test parameterization that delegates to GLM
    weather["Bias"] = 1  # behave like GLM
    pojo_glm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_glm_preds.as_data_frame(), glm_preds.as_data_frame())

    # Sanity test - test parameterization that delegates to GBM
    weather["Bias"] = 0  # behave like GBM
    pojo_gbm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_gbm_preds.as_data_frame(), gbm_preds.as_data_frame())

    # Test per-segment specific behavior, segments are defined by ChangeWindDirect
    weather["Bias"] = float("NaN")
    for change_wind_dir in weather["ChangeWindDirect"].levels()[0]:
        weather_cwd = weather[weather["ChangeWindDirect"] == change_wind_dir]
        weather_orig_cwd = weather_orig[weather_orig["ChangeWindDirect"] == change_wind_dir]
        pojo_weather_cwd_preds = pojo_model.predict(weather_cwd)
        if change_wind_dir == "c" or change_wind_dir == "l":
            expected = glm_model.predict(weather_orig_cwd) * 2
            assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
        elif change_wind_dir == "n":
            expected = (glm_model.predict(weather_orig_cwd) + gbm_model.predict(weather_orig_cwd)) / 2
            assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
        elif change_wind_dir == "s":
            expected = gbm_model.predict(weather_orig_cwd)
            assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
def gbm_mojo_reproducibility_info():
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    x = list(set(df.names) - {"response"})
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}
    gbmModel = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert gbmModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'

    ecology = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    ecology['Angaus'] = ecology['Angaus'].asfactor()
    train, calib = ecology.split_frame(seed=12354)
    predictors = ecology.columns[3:13]

    # Add a synthetic binary weight column to the training split
    w = h2o.create_frame(binary_fraction=1, binary_ones_fraction=0.5,
                         missing_fraction=0, rows=744, cols=1)
    w.set_names(["weight"])
    train = train.cbind(w)

    model = H2OGradientBoostingEstimator(ntrees=10, max_depth=5, min_rows=10,
                                         learn_rate=0.1, distribution="multinomial",
                                         weights_column="weight",
                                         calibrate_model=True, calibration_frame=calib)
    model.train(x=predictors, y="Angaus", training_frame=train)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    gbmModel = h2o.upload_mojo(mojo_path=mojo_path)
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert gbmModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
    assert gbmModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][2] == 'calibration_frame'
def predict_risk_scores(**kwargs):
    mongo_connect = kwargs["dag_run"].conf.get("mongo_connect")
    database = kwargs["dag_run"].conf.get("database")
    from_date = kwargs["dag_run"].conf.get("from_date")
    to_date = kwargs["dag_run"].conf.get("to_date")
    speciality_list = kwargs["dag_run"].conf.get("speciality_list")

    p_auth = authenticate.prediction_login(**kwargs)
    client = pymongo.MongoClient(mongo_connect)
    db = client[database]

    for i in speciality_list:
        speciality_name = i[0].replace("-", "").replace(" ", "").replace(",", "").replace("&", "").lower()

        # Generate the name for the filtered claim collection
        provider_collection = "provider_" + speciality_name + "_" + from_date + to_date
        provider_collection = provider_collection.replace("-", "").replace(" ", "").replace(",", "").replace("&", "").lower()

        pros = h2o.import_file("/data/" + provider_collection + ".csv")
        pros["provider_fraudulent"] = pros["provider_fraudulent"].asfactor()

        # Load the per-speciality MOJO and infer the algorithm from its parameters
        # (only GLMs expose an "alpha" parameter)
        path = "/data/models/" + speciality_name + ".zip"
        model_key = h2o.upload_mojo(path)
        model_type = "GLM" if "alpha" in model_key.params else "GBM"
        print(model_type)

        if model_type in ['GBM', 'XRT']:
            # Tree-based models support per-feature prediction contributions directly
            contrib = model_key.predict_contributions(pros)
            contribdrop = contrib.drop("_id")
            contributions = contribdrop.cbind(pros["_id"])
            export_path = "/data/" + speciality_name + "_contrib_oot.csv"
            h2o.export_file(contributions, path=export_path, force=True)
            db[speciality_name + "_contrib_oot"].drop()
            d.csv_import(p_auth, "medscheme_new", speciality_name + "_contrib_oot",
                         speciality_name + "_contrib_oot.csv")
        elif model_type == 'GLM':
            print("HERE")
            # For GLM, compute contributions manually as coefficient * value,
            # using the coefficients table from the MOJO's model details
            detail_path = ("/data/models/" + speciality_name[0].upper() + speciality_name[1:]
                           + "/experimental/modelDetails.json")
            with open(detail_path) as json_file:
                mojo_dict = json.load(json_file)
            coeff_dict = dict(zip(mojo_dict['output']['coefficients_table']['data'][0],
                                  mojo_dict['output']['coefficients_table']['data'][1]))

            contrib_table = speciality_name + "_contrib_oot"
            db[contrib_table].drop()
            for doc in db[provider_collection].find():
                insert_dict = {"_id": doc["_id"]}
                for j in coeff_dict:
                    if j in ["Intercept", "_id"]:
                        pass
                    elif "." in j:
                        # Categorical coefficient names look like "<column>.<level>"
                        var_value = j[j.index(".") + 1:]
                        var_name = j[:j.index(".")]
                        if doc[var_name] == var_value:
                            insert_dict[var_name] = coeff_dict[j]
                    elif j not in doc:
                        insert_dict[j] = 0
                    else:
                        insert_dict[j] = doc[j] * coeff_dict[j]
                db[contrib_table].insert_one(insert_dict)

        pred = model_key.predict(pros)
        colsCombine_df = pred.cbind(pros["_id"])
        export_path = "/data/" + speciality_name + "_prediction_oot.csv"
        h2o.export_file(colsCombine_df, path=export_path, force=True)
        db[speciality_name + "_prediction_oot"].drop()
        d.csv_import(p_auth, "medscheme_new", speciality_name + "_prediction_oot",
                     speciality_name + "_prediction_oot.csv")
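# Hypothetical wiring of predict_risk_scores into an Airflow DAG; the DAG id,
# start date, and schedule are assumptions for illustration. On Airflow 2.x the
# task context (including kwargs["dag_run"]) is passed to the callable automatically.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id="risk_scoring",
         start_date=datetime(2021, 1, 1),
         schedule_interval=None,  # triggered manually with a conf payload
         catchup=False) as dag:
    predict_task = PythonOperator(task_id="predict_risk_scores",
                                  python_callable=predict_risk_scores)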