def test_print_mojo():
    """Check ``h2o.print_mojo`` output for every estimator class in ``ALGOS``.

    Each estimator is trained with ``ntrees`` trees on the prostate dataset
    (``CAPSULE`` converted to a factor and used as the response).  Its MOJO
    is downloaded into ``RESULTS_DIR`` and printed twice:

    * with default arguments -- the result is parsed as JSON and must hold
      a ``"trees"`` list with exactly ``ntrees`` entries;
    * as a DOT dump of tree index 2 -- the text must contain ``"Level 0"``.

    Both dumps are echoed to stdout between ``==BEGIN==`` / ``==/END==``
    markers.
    """
    response = "CAPSULE"
    ntrees = 20

    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame[response] = frame[response].asfactor()

    for algo in ALGOS:
        algo_name = algo.__name__
        print("testing " + algo_name)

        model = algo(ntrees=ntrees)
        model.train(
            x=list(range(1, frame.ncol)),
            y=response,
            training_frame=frame,
        )
        mojo_path = model.download_mojo(RESULTS_DIR)

        # Dump every tree as JSON and verify the tree count.
        json_dump = h2o.print_mojo(mojo_path)
        print("dumping {} JSON trees".format(algo_name))
        print("==BEGIN==")
        print(json_dump)
        print("==/END==")
        parsed = json.loads(json_dump)
        assert "trees" in parsed
        assert len(parsed["trees"]) == ntrees

        # Dump a single tree in DOT format.
        dot_dump = h2o.print_mojo(mojo_path, tree_index=2, format="dot")
        print("dumping {} DOT tree".format(algo_name))
        print("==BEGIN==")
        print(dot_dump)
        print("==/END==")
        assert "Level 0" in dot_dump
def test_print_mojo():
    """Check ``h2o.print_mojo`` on a bernoulli GBM trained on prostate data.

    A 20-tree GBM is trained with ``CAPSULE`` (as a factor) as the response
    and its MOJO is downloaded into ``RESULTS_DIR``.  The default print
    output is parsed as JSON and must list exactly ``ntrees`` trees; the DOT
    output for tree index 2 must contain ``"Level 0"``.
    """
    response = "CAPSULE"
    ntrees = 20
    gbm_params = dict(
        ntrees=ntrees,
        learn_rate=0.1,
        max_depth=5,
        min_rows=10,
        distribution="bernoulli",
    )

    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame[response] = frame[response].asfactor()

    gbm = H2OGradientBoostingEstimator(**gbm_params)
    gbm.train(
        x=list(range(1, frame.ncol)),
        y=response,
        training_frame=frame,
    )
    mojo_path = gbm.download_mojo(RESULTS_DIR)

    # Print all trees and verify the count.
    parsed = json.loads(h2o.print_mojo(mojo_path))
    assert "trees" in parsed
    assert len(parsed["trees"]) == ntrees

    # Print one tree in DOT format.
    dot_dump = h2o.print_mojo(mojo_path, tree_index=2, format="dot")
    assert "Level 0" in dot_dump
def test_print_mojo():
    """Exercise the JSON, PNG and DOT outputs of ``h2o.print_mojo``.

    For every estimator class in ``ALGOS`` a 5-tree model is trained on the
    prostate dataset (``CAPSULE`` as a factor response) and its MOJO is
    downloaded into ``RESULTS_DIR``.  The test then checks that:

    * the full JSON dump holds exactly ``ntrees`` trees;
    * the JSON dump of tree index 2 equals entry 2 of the full dump once
      its ``"index"`` field is patched to 2;
    * the PNG dump of all trees yields one file per tree in the returned
      directory (``Tree<i>.png`` for isolation forest, ``Tree<i>_Class0.png``
      otherwise);
    * the PNG dump of a single tree yields an existing file;
    * the DOT dump of tree index 2 contains ``"Level 0"``.
    """
    response = "CAPSULE"
    ntrees = 5
    picked_tree = 2

    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame[response] = frame[response].asfactor()

    for algo in ALGOS:
        algo_name = algo.__name__
        print("testing " + algo_name)

        model = algo(ntrees=ntrees)
        model.train(
            x=list(range(1, frame.ncol)),
            y=response,
            training_frame=frame,
        )
        mojo_path = model.download_mojo(RESULTS_DIR)

        # All trees as JSON.
        json_dump = h2o.print_mojo(mojo_path)
        print("dumping {} JSON trees".format(algo_name))
        print("==BEGIN==")
        print(json_dump)
        print("==/END==")
        all_trees = json.loads(json_dump)
        assert "trees" in all_trees
        assert len(all_trees["trees"]) == ntrees

        # One tree as JSON; it must match the same tree from the full dump.
        one_tree = json.loads(h2o.print_mojo(mojo_path, tree_index=picked_tree))
        one_tree["trees"][0]["index"] = picked_tree  # patch the index number
        assert all_trees["trees"][picked_tree] == one_tree["trees"][0]

        # All trees as PNG: one image per tree inside the returned directory.
        png_dir = h2o.print_mojo(mojo_path, format="png")
        if algo == H2OIsolationForestEstimator:
            png_suffix = ".png"
        else:
            png_suffix = "_Class0.png"
        for tree_idx in range(ntrees):
            tree_file = os.path.join(
                png_dir, "Tree{}{}".format(tree_idx, png_suffix))
            print(tree_file)
            assert os.path.isfile(tree_file)

        # One tree as PNG.
        single_png = h2o.print_mojo(
            mojo_path, format="png", tree_index=picked_tree)
        assert os.path.isfile(single_png)

        # One tree as DOT.
        dot_dump = h2o.print_mojo(
            mojo_path, tree_index=picked_tree, format="dot")
        print("dumping {} DOT tree".format(algo_name))
        print("==BEGIN==")
        print(dot_dump)
        print("==/END==")
        assert "Level 0" in dot_dump
def xgboost_reweight_tree():
    """Verify that ``tree.update.weights`` rewrites XGBoost node weights.

    An XGBoost model is trained on the prostate dataset, then its tree
    weights are recomputed twice through the ``tree.update.weights`` rapids
    call:

    * on the full frame with a constant weight column -- contributions must
      stay equal to the original ones (compared with ``check_less_precise=3``);
    * on the first 10 rows only -- the minimum ``BiasTerm`` contribution must
      differ from the original one.

    Finally the MOJO printed before and after reweighting must differ, and
    the root weight of the first tree must equal ``rows * hess_coef`` in the
    original MOJO and ``subset_rows * hess_coef * weights_scale`` in the
    reweighted one.
    """
    weights_col = "weights"
    weights_scale = 2

    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for factor_col in ("RACE", "CAPSULE"):
        frame[factor_col] = frame[factor_col].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'

    model = H2OXGBoostEstimator()
    model.train(x=predictors, y=response, training_frame=frame)

    def update_tree_weights(data):
        # Recompute the model's tree weights from the rows of `data`.
        h2o.rapids('(tree.update.weights {} {} "{}")'.format(
            model.model_id, data.frame_id, weights_col))

    def first_root_weight(mojo_json):
        # Root-node weight of the first tree in a JSON MOJO dump.
        return json.loads(mojo_json)["trees"][0]["root"]["weight"]

    # 0. Save original MOJO
    original_mojo_json = h2o.print_mojo(model.download_mojo())

    # 1. Get original contributions
    contribs_original = model.predict_contributions(frame)
    expected_columns = [
        u'RACE.0', u'RACE.1', u'RACE.2', u'RACE.missing(NA)',
        u'AGE', u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON',
        u'BiasTerm',
    ]
    assert contribs_original.col_names == expected_columns

    # 2. Scale weights => contributions should stay the same
    frame[weights_col] = weights_scale
    update_tree_weights(frame)
    contribs_reweighted = model.predict_contributions(frame)
    reweighted_df = contribs_reweighted.as_data_frame()
    original_df = contribs_original.as_data_frame()
    assert_frame_equal(reweighted_df, original_df, check_less_precise=3)

    # 3. Reweight based on small subset of the data
    #    => contributions are expected to change
    subset = frame.head(10)
    update_tree_weights(subset)
    contribs_subset = model.predict_contributions(subset)
    assert contribs_subset["BiasTerm"].min() != contribs_original["BiasTerm"].min()

    # 4. Save modified mojo
    reweighted_mojo_json = h2o.print_mojo(model.download_mojo())

    # Sanity check
    assert original_mojo_json != reweighted_mojo_json

    # Check first tree weight: rows * p * (1 - p) with p = 1 / (1 + e^0)
    initial_p = 1 / (1 + math.exp(0))
    hess_coef = initial_p * (1 - initial_p)
    assert first_root_weight(original_mojo_json) == frame.nrow * hess_coef
    assert (first_root_weight(reweighted_mojo_json)
            == subset.nrow * hess_coef * weights_scale)
def convert(model, name=None, initial_types=None, doc_string='', target_opset=None,
            targeted_onnx=onnx.__version__, custom_conversion_functions=None,
            custom_shape_calculators=None):
    '''
    This function produces an equivalent ONNX model of the given H2O MOJO model.
    Supported model types:
    - GBM, with limitations:
        - poisson, gamma, tweedie distributions not supported
        - multinomial distribution supported with 3 or more classes (use binomial otherwise)
    Other limitations:
    - models with categorical splits not supported

    :param model: H2O MOJO model loaded into memory as bytes (see below for example)
    :param name: The name of the graph (type: GraphProto) in the produced ONNX model (type: ModelProto).
        A random UUID hex string is used when omitted.
    :param initial_types: a python list. Each element is a tuple of a variable name and a type defined in
        data_types.py. Defaults to a single float tensor input named 'input'.
    :param doc_string: A string attached onto the produced ONNX model
    :param target_opset: number, for example, 7 for ONNX 1.2, and 8 for ONNX 1.3. When not given (or falsy),
        the value returned by get_opset_number_from_onnx() is used.
    :param targeted_onnx: A string (for example, '1.1.2' and '1.2') used to specify the targeted ONNX version
        of the produced model. If ONNXMLTools cannot find a compatible ONNX python package, an error may be
        thrown.
    :param custom_conversion_functions: a dictionary for specifying the user customized conversion function
    :param custom_shape_calculators: a dictionary for specifying the user customized shape calculator
    :return: An ONNX model (type: ModelProto) which is equivalent to the input H2O MOJO model
    :raises ValueError: if the MOJO's algorithm is anything other than "gbm"

    :examples:
    >>> from onnxmltools.convert import convert_h2o
    >>> file = open("/path/to/h2o_mojo.zip", "rb")
    >>> mojo_content = file.read()
    >>> file.close()
    >>> h2o_onnx_model = convert_h2o(mojo_content)
    '''
    # Function-scope import keeps this block self-contained: `os` is only
    # needed for the temporary-file handling below.
    import os

    if name is None:
        name = str(uuid4().hex)
    if initial_types is None:
        initial_types = [('input', FloatTensorType(shape=['None', 'None']))]

    # h2o.print_mojo works on a file path, so the in-memory MOJO is spilled to
    # a temporary file.  mkstemp() returns an already-open descriptor that the
    # caller owns: write through it (so it is closed exactly once) and remove
    # the file afterwards instead of leaking both.
    fd, model_path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as mojo_file:
            mojo_file.write(model)
        mojo_str = h2o.print_mojo(model_path, format="json")
    finally:
        try:
            os.remove(model_path)
        except OSError:
            # Best-effort cleanup; never mask an error raised above.
            pass

    mojo_model = json.loads(mojo_str)
    algo = mojo_model["params"]["algo"]
    if algo != "gbm":
        raise ValueError(
            "Model type not supported (algo=%s). Only GBM Mojo supported for now." % algo)

    target_opset = target_opset if target_opset else get_opset_number_from_onnx()
    topology = parse_h2o(mojo_model, initial_types, target_opset,
                         custom_conversion_functions, custom_shape_calculators)
    topology.compile()
    onnx_model = convert_topology(topology, name, doc_string, target_opset, targeted_onnx)
    return onnx_model