def build_audit_h2o(classifier, name):
    """Fit an H2O classifier on the audit data and store its artifacts.

    The pipeline maps the raw columns through domain decorators, uploads the
    result as an H2O frame and trains ``classifier`` as the final step.

    :param classifier: the H2O estimator used as the final pipeline step
    :param name: the name passed to the ``store_*`` helpers
    """
    continuous_columns = ["Age", "Hours", "Income"]
    categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]

    features = [([col], ContinuousDomain()) for col in continuous_columns]
    features += [([col], CategoricalDomain()) for col in categorical_columns]
    mapper = DataFrameMapper(features)

    steps = [
        ("mapper", mapper),
        ("uploader", H2OFrameCreator()),
        ("classifier", classifier),
    ]
    pipeline = PMMLPipeline(steps)

    # The label column is uploaded as a categorical H2O column.
    labels = H2OFrame(audit_y.to_frame(), column_types=["categorical"])
    pipeline.fit(audit_X, labels)

    # Verification uses a reproducible 5% sample of the training features.
    verification_sample = audit_X.sample(frac=0.05, random_state=13)
    pipeline.verify(verification_sample)

    fitted_classifier = pipeline._final_estimator
    store_mojo(fitted_classifier, name)
    store_pkl(pipeline, name)

    adjusted = pipeline.predict(audit_X)
    adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(adjusted.as_data_frame(), name)
def build_auto_h2o(regressor, name):
    """Fit an H2O regressor on the auto data and store its artifacts.

    The three categorical columns come first and the four continuous columns
    second, both in the column transformer and in the H2O frame schema.

    :param regressor: the H2O estimator used as the final pipeline step
    :param name: the name passed to the ``store_*`` helpers
    """
    categorical_columns = ["cylinders", "model_year", "origin"]
    continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]

    decorators = [(col, CategoricalDomain(), [col]) for col in categorical_columns]
    decorators += [(col, ContinuousDomain(), [col]) for col in continuous_columns]
    transformer = ColumnTransformer(decorators)

    # The uploader restores column names and types in the transformer's order.
    uploader = H2OFrameCreator(
        column_names=categorical_columns + continuous_columns,
        column_types=["enum"] * len(categorical_columns) + ["numeric"] * len(continuous_columns),
    )

    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor),
    ])

    target = H2OFrame(auto_y.to_frame())
    pipeline.fit(auto_X, target)

    # Verification uses a reproducible 5% sample of the training features.
    verification_sample = auto_X.sample(frac=0.05, random_state=13)
    pipeline.verify(verification_sample)

    fitted_regressor = pipeline._final_estimator
    store_mojo(fitted_regressor, name)
    store_pkl(pipeline, name)

    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name)
def predict_row(row, node):
    """Walk the tree of binary classifiers and return the class for ``row``.

    :param row: the sample to classify; converted to an ``H2OFrame`` when the
        node holds an H2O estimator and ``row`` is not one already
    :param node: the current tree node; a node with a falsy ``classifier`` is
        treated as a leaf
    :return: the first element of ``node.classes`` of the leaf that is reached
    """
    model = node.classifier
    if not model:
        # Leaf: no classifier to consult, report the first stored class.
        return list(node.classes)[0]

    if isinstance(model, h2o.estimators.H2OEstimator):
        if not isinstance(row, H2OFrame):
            row = H2OFrame(row, column_types=get_h2o_column_types(row.columns))
        raw = model.predict(row)
        prediction = np.concatenate(raw['predict'].as_data_frame().values)
    else:
        prediction = model.predict(row)

    # A predicted 0 descends to the left child, anything else to the right.
    child = node.left_node if prediction[0] == 0 else node.right_node
    return predict_row(row, child)
def _prepare_one_hot(file, y, exclude_cols=None):
    """Import a data file and one-hot encode its enum columns.

    The file is resolved relative to this module's directory, split 95/5 with
    a fixed seed, and the encoder is fitted on the training part only.

    :param file: path of the data file, relative to this module's directory
    :param y: name of the response column; never encoded
    :param exclude_cols: optional column names left out of both outputs
    :return: ``(train, test)`` where ``train`` is an ``H2OFrame`` holding the
        non-enum columns, ``y`` and the encoded columns, and ``test`` is a
        pandas frame holding the non-enum columns and the encoded columns
        (``y`` is not included in ``test``)
    """
    skipped = [] if exclude_cols is None else exclude_cols

    module_dir = os.path.dirname(os.path.realpath(__file__))
    frame = h2o.import_file(module_dir + "/" + file)
    train, test = frame.split_frame([0.95], seed=42)

    # Partition the usable columns by their H2O type.
    cols_to_encode = []
    other_cols = []
    for name, ctype in test.types.items():
        if name == y or name in skipped:
            continue
        bucket = cols_to_encode if ctype == "enum" else other_cols
        bucket.append(name)

    train_pd = train.as_data_frame()
    train_enum_part = train_pd.loc[:, cols_to_encode]
    train_rest = train_pd.loc[:, other_cols + [y]]

    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(train_enum_part)

    # Encoded columns are named "<column>.<level>" in encoder output order.
    colnames = []
    for col, levels in zip(cols_to_encode, enc.categories_):
        for level in levels:
            colnames.append(col + "." + level)

    def encode(enum_part):
        # Dense one-hot matrix wrapped in a frame with the derived names.
        dense = pd.DataFrame(enc.transform(enum_part.values).toarray())
        dense.columns = colnames
        return dense

    train = H2OFrame(train_rest.join(encode(train_enum_part)))

    test_pd = test.as_data_frame()
    test_enum_part = test_pd.loc[:, cols_to_encode]
    test_rest = test_pd.loc[:, other_cols]
    test = test_rest.join(encode(test_enum_part))

    return train, test
def classify(x_train, y_train, estimator, x_test):
    """
    Train the given estimator and predict the classes of the test set.
    H2O estimators are fitted through ``fit_h2o`` and receive the test data
    as an H2OFrame; any other estimator is cloned and fitted directly.

    :param x_train: the dataset for training
    :param y_train: the classes for training
    :param estimator: the estimator to be considered
    :param x_test: the dataset for testing
    :return:
        - the prediction for the x_test
        - the trained estimator
    """
    if not isinstance(estimator, h2o.estimators.H2OEstimator):
        # Work on a clone so the estimator passed in is left unfitted.
        fitted = clone(estimator)
        fitted.fit(x_train, y_train)
        return fitted.predict(x_test), fitted

    fitted = fit_h2o(x_train, y_train, estimator)
    test_frame = H2OFrame(
        x_test, column_types=get_h2o_column_types(x_test.columns))
    scored = fitted.predict(test_frame)
    # Flatten the 'predict' column into a one-dimensional array of labels.
    labels = np.concatenate(scored['predict'].as_data_frame().values)
    return labels, fitted
def pubdev_6393():
    """Check that enum levels survive an ascharacter()/asfactor() round trip."""
    # The first data row starts with the Unicode replacement character.
    locations = [
        ['location'],
        ['�X県 A市 '],
        ['X県 B市'],
    ]
    expected_rows = len(locations) - 1  # the first entry is the header

    frame = H2OFrame(locations, header=True, column_types=['enum'])
    assert frame.ncols == 1
    assert frame.nrows == expected_rows

    frame_categories = frame['location'].categories()
    print(frame_categories)

    frame_converted = frame['location'].ascharacter().asfactor()
    assert frame_converted.ncols == 1
    assert frame_converted.nrows == expected_rows

    frame_converted_categories = frame_converted.categories()
    print(frame_converted_categories)

    # No particular representation is required; the levels parsed directly
    # as enum must simply equal the levels produced by asfactor().
    for idx, converted_level in enumerate(frame_converted_categories):
        assert frame_categories[idx] == converted_level
def pubdev_6439():
    """Check that non-ASCII enum levels are parsed without being altered."""
    data = [
        ['C1'],
        ['X県 A市 '],  # six characters, the last one is a space
        ['X県 B市'],  # five characters, no trailing space
    ]
    frame = H2OFrame(data, header=True, column_types=['enum'])

    frame_categories = frame['C1'].categories()
    print(frame_categories)

    # One level per observation, each with its original length.
    assert len(frame_categories) == 2
    assert len(frame_categories[0]) == 6
    assert len(frame_categories[1]) == 5

    # Python 2 holds the literals as byte strings, so they are decoded first.
    major = sys.version_info[0]
    if major == 3:
        assert ''.join(data[1]) == frame_categories[0]
        assert ''.join(data[2]) == frame_categories[1]
    elif major == 2:
        assert ''.join(data[1]).decode("utf-8") == frame_categories[0]
        assert ''.join(data[2]).decode("utf-8") == frame_categories[1]
    else:
        assert False
def train_w2v(df, epochs=None, save_dir=None):
    """
    Train an H2O word2vec model on the text columns of ``df``.

    :param df: frame whose text columns (as reported by ``get_text_cols``)
        are tokenized on single spaces and used for training
    :param epochs: optional number of epochs, converted with ``int``
    :param save_dir: optional directory; checkpoints are exported to its
        ``h2o_model/`` sub-directory
    :return: the trained estimator, with the list of text columns attached
        as its ``text_columns`` attribute
    """
    print("training word2vec model ...")

    # Only forward the options the caller actually supplied.
    estimator_kwargs = {}
    if epochs is not None:
        estimator_kwargs['epochs'] = int(epochs)
    if save_dir is not None:
        estimator_kwargs['export_checkpoints_dir'] = os.path.join(save_dir,"h2o_model/")

    df = df.copy()
    text_columns = get_text_cols(df)
    print("Text columns are: ", text_columns)

    text_frame = H2OFrame(df[text_columns])
    for column in text_columns:
        # Force string type on every text column before tokenizing.
        text_frame[column] = text_frame[column].ascharacter()
    words = text_frame.tokenize(" ")

    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, **estimator_kwargs)
    w2v_model.train(training_frame=words)
    w2v_model.text_columns = text_columns
    return w2v_model
def yhat_h2o_classification(m, d):
    """Score ``d`` with H2O model ``m`` and return the third prediction column.

    :param m: model exposing ``predict`` and a ``_column_types`` attribute
    :param d: data accepted by the ``H2OFrame`` constructor
    :return: column index 2 of the prediction frame as a numpy array
    """
    from h2o import H2OFrame

    frame = H2OFrame(d, column_types=m._column_types)
    scores = m.predict(frame).as_data_frame().to_numpy()
    return scores[:, 2]
def yhat_h2o_regression(m, d):
    """Score ``d`` with H2O model ``m`` and return the flattened predictions.

    :param m: model exposing ``predict`` and a ``_column_types`` attribute
    :param d: data accepted by the ``H2OFrame`` constructor
    :return: the prediction frame as a one-dimensional numpy array
    """
    from h2o import H2OFrame

    frame = H2OFrame(d, column_types=m._column_types)
    predictions = m.predict(frame).as_data_frame().to_numpy()
    return predictions.flatten()
# usage: python test_name.py --usecloud ipaddr:port # ip_port = sys.argv[2].split(":") print ip_port ip = ip_port[0] port = int(ip_port[1]) ###################################################### # # Sample Running GBM on prostate.csv # Connect to a pre-existing cluster cluster = h2o.init(ip=ip, port=port) df = H2OFrame(remote_fname="../../../smalldata/logreg/prostate.csv") print df.describe() # Remove ID from training frame del df['ID'] # For VOL & GLEASON, a zero really means "missing" vol = df['VOL'] vol[vol == 0] = None gle = df['GLEASON'] gle[gle == 0] = None # Convert CAPSULE to a logical factor df['CAPSULE'] = df['CAPSULE'].asfactor() # Test/train split
("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))]) classifier = H2ORandomForestEstimator(ntrees=17) predict_proba_transformer = Pipeline([ ("expression", ExpressionTransformer("X[1]")), ("cut", Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0], labels=["no", "maybe", "yes"]), "Decision", prefit=True)) ]) pipeline = PMMLPipeline([("local_mapper", mapper), ("uploader", H2OFrameCreator()), ("remote_classifier", classifier)], predict_proba_transformer=predict_proba_transformer) pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"])) pipeline.verify(audit_X.sample(100)) sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("RandomForestAudit", "pmml/RandomForestAudit.pmml")
async def serve(q: Q):
    """Handle one request of the example app: train, predict or render the form.

    ``q.args.train`` trains a WaveML model and reports its id and accuracy,
    ``q.args.predict`` shows example predictions and SHAP contributions, and
    any other request prepares the sample data and renders the form. The
    page is saved at the end in every case.
    """
    if q.args.train:
        # train WaveML Model using H2O-3 AutoML
        wave_model = build_model(
            train_df=q.client.train_df,
            target_column='target',
            model_type=ModelType.H2O3,
            _h2o3_max_runtime_secs=5,
            _h2o3_nfolds=2,
            _h2o3_include_algos=['DRF', 'XGBoost', 'GBM'])
        q.client.wave_model = wave_model

        h2o_model = wave_model.model
        model_id = h2o_model.model_id
        accuracy = round(h2o_model.accuracy()[0][1] * 100, 2)

        # show training details and enable the predict button
        card = q.page['example']
        card.items[1].buttons.items[1].button.disabled = False
        card.items[2].message_bar.type = 'success'
        card.items[2].message_bar.text = 'Training successfully completed!'
        card.items[3].text.content = f'''**H2O AutoML model id:** {model_id} <br /> **Accuracy:** {accuracy}%'''
        card.items[4].text.content = ''
        card.items[5].text.content = ''
    elif q.args.predict:
        # predict on test data and compute SHAP contributions for it
        wave_model = q.client.wave_model
        preds = wave_model.predict(test_df=q.client.test_df)
        test_frame = H2OFrame(q.client.test_df)
        shaps = wave_model.model.predict_contributions(test_frame).as_data_frame()

        # show the first three predictions and contributions
        card = q.page['example']
        card.items[2].message_bar.text = 'Prediction successfully completed!'
        card.items[4].text.content = f'''**Example predictions:** <br /> {preds[0]} <br /> {preds[1]} <br /> {preds[2]}'''
        card.items[5].text.content = f'''**Example SHAP contributions:** <br /> {shaps.head(3).to_html()}'''
    else:
        # prepare sample train and test dataframes (80% of rows for training)
        data = load_breast_cancer(as_frame=True)['frame']
        q.client.train_df, q.client.test_df = train_test_split(data, train_size=0.8)

        # display ui; the predict button stays disabled until a model exists
        intro = ui.text(content='''The sample dataset used is the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer" target="_blank">breast cancer dataset</a>.''')
        actions = ui.buttons(items=[
            ui.button(name='train', label='Train', primary=True),
            ui.button(name='predict', label='Predict', primary=True, disabled=True),
        ])
        status = ui.message_bar(type='warning', text='Training will take a few seconds')
        q.page['example'] = ui.form_card(
            box='1 1 -1 -1',
            items=[
                intro,
                actions,
                status,
                ui.text(content=''),
                ui.text(content=''),
                ui.text(content=''),
            ])

    await q.page.save()
def pav(y, X, w):
    """Evaluate the ``isotonic.pav`` expression on the stacked inputs.

    :param y: first block of columns of the uploaded frame
    :param X: second block of columns of the uploaded frame
    :param w: last block of columns of the uploaded frame
    :return: columns ``C1`` and ``C2`` of the resulting H2O frame
    """
    # Stack the inputs column-wise, in (y, X, w) order, into one H2O frame.
    stacked = np.column_stack((y, X, w))
    frame = H2OFrame(stacked)
    expression = ExprNode("isotonic.pav", frame)
    result = H2OFrame._expr(expr=expression)
    return result[["C1", "C2"]]
def get_simple_preprocessed_input_test_frame():
    """Return a fixed H2O frame with one row per (document id, word) pair."""
    columns = OrderedDict()
    columns['DocID'] = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    columns['Words'] = ['A', 'B', 'C', 'A', 'a', 'a', 'Z', 'C', 'c', 'B', 'C']
    # Column order is fixed: numeric ids first, string words second.
    return H2OFrame(columns, column_types=['numeric', 'string'])
def get_simple_input_test_frame():
    """Return a fixed H2O frame with one row per (document id, document)."""
    columns = OrderedDict()
    columns['DocID'] = [0, 1, 2]
    columns['Document'] = ['A B C', 'A a a Z', 'C c B C']
    # Column order is fixed: numeric ids first, string documents second.
    return H2OFrame(columns, column_types=['numeric', 'string'])
# Set dummy response var in test data test[response_name_fact] = None test[response_name_fact] = test[response_name_fact].asfactor() # Combine data into one set for simpler processing all_data = train.rbind(test) train = None test = None # Get train and test indexes test_idx = all_data[response_name_fact].isna() train_idx = test_idx.logical_negation() # Process the data to create additional features all_data = H2OFrame(helpers.pre_pipeline_process(all_data.as_data_frame())) # Predict Age missing_ages_idx = all_data['Age'].isna() unknown_ages_df = all_data[missing_ages_idx] unknown_ages_df.pop('Age') known_ages_df = all_data[missing_ages_idx.logical_negation()] age_model = H2ORandomForestEstimator(seed=42) age_model.train( ['Title', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Fare'], 'Age', training_frame=known_ages_df) age_prediction = age_model.predict(unknown_ages_df) age_join_frame = age_prediction.cbind(unknown_ages_df['PassengerId']) all_data = helpers.merge_ages(all_data, age_join_frame)
def test_asserts():
    """Test type-checking functionality.

    Exercises ``assert_is_type`` with values that must pass and values that
    must fail, then ``assert_matches`` and ``assert_satisfies``.

    NOTE(review): the expected error messages below contain the source text
    of the checked expressions, so the typechecks module appears to read the
    calling source line. Keep each checked call on one physical line unless
    that is confirmed to be unnecessary.
    """
    def assert_error(*args, **kwargs):
        """Check that assert_is_type() with given arguments throws an error."""
        try:
            assert_is_type(*args, **kwargs)
            # Not an H2OTypeError, so it propagates and fails the test.
            raise RuntimeError("Failed to throw an exception")
        except H2OTypeError as exc:
            # Check whether the message can stringify properly
            message = str(exc)
            assert len(message) < 1000
            return

    class A(object):
        """Dummy A."""

    class B(A):
        """Dummy B."""

    class C(A):
        """Dummy C."""

    class D(B, C):
        """Dummy D."""

    # Values that must satisfy the given type specifications.
    assert_is_type(3, int)
    assert_is_type(2**100, int)
    assert_is_type("3", str)
    assert_is_type(u"3", str)
    assert_is_type("foo", u"foo")
    assert_is_type(u"foo", "foo")
    assert_is_type("I", *list("ABCDEFGHIJKL"))
    assert_is_type(False, bool)
    assert_is_type(43, str, bool, int)
    assert_is_type(4 / 3, int, float)
    assert_is_type(None, None)
    assert_is_type(None, A, str, None)
    assert_is_type([], [float])
    assert_is_type([1, 4, 5], [int])
    assert_is_type([1.0, 2, 5], [int, float])
    assert_is_type([[2.0, 3.1, 0], [2, 4.4, 1.1], [-1, 0]], [[int, float]])
    assert_is_type([1, None, 2], [int, float, None])
    assert_is_type({1, 5, 1, 1, 3}, {int})
    assert_is_type({1, "hello", 3}, {int, str})
    assert_is_type({"foo": 1, "bar": 2}, {str: int})
    assert_is_type({"foo": 3, "bar": [5], "baz": None}, {str: U(int, None, [int])})
    assert_is_type({"foo": 1, "bar": 2}, {"foo": int, "bar": U(int, float, None), "baz": bool})
    assert_is_type({}, {"spam": int, "egg": int})
    assert_is_type({"spam": 10}, {"spam": int, "egg": int})
    assert_is_type({"egg": 1}, {"spam": int, "egg": int})
    assert_is_type({"egg": 1, "spam": 10}, {"spam": int, "egg": int})
    assert_is_type({"egg": 1, "spam": 10}, Dict(egg=int, spam=int))
    assert_is_type({"egg": 1, "spam": 10}, Dict(egg=int, spam=int, ham=U(int, None)))
    assert_is_type((1, 3), (int, int))
    assert_is_type(("a", "b", "c"), (int, int, int), (str, str, str))
    assert_is_type((1, 3, 4, 7, 11, 18), Tuple(int))
    assert_is_type((1, 3, "spam", 3, "egg"), Tuple(int, str))
    assert_is_type([1, [2], [{3}]], [int, [int], [{3}]])
    assert_is_type(A(), None, A)
    assert_is_type(B(), None, A)
    assert_is_type(C(), A, B)
    assert_is_type(D(), I(A, B, C))
    assert_is_type(A, type)
    assert_is_type(B, lambda aa: issubclass(aa, A))
    for a in range(-2, 5):
        assert_is_type(a, -2, -1, 0, 1, 2, 3, 4)
    assert_is_type(1, numeric)
    assert_is_type(2.2, numeric)
    assert_is_type(1, I(numeric, object))
    assert_is_type(34, I(int, NOT(0)))
    assert_is_type(["foo", "egg", "spaam"], [I(str, NOT("spam"))])
    assert_is_type(H2OFrame(), h2oframe)
    assert_is_type([[2.0, 3.1, 0], [2, 4.4, 1.1], [-1, 0, 0]], I([[numeric]], lambda v: all(len(vi) == len(v[0]) for vi in v)))
    assert_is_type([None, None, float('nan'), None, "N/A"], [None, "N/A", I(float, math.isnan)])

    # Values that must be rejected by the given type specifications.
    assert_error(3, str)
    assert_error(0, float)
    assert_error("Z", *list("ABCDEFGHIJKL"))
    assert_error(u"Z", "a", "...", "z")
    assert_error("X", u"x")
    assert_error(0, bool)
    assert_error(0, float, str, bool, None)
    assert_error([1, 5], [float])
    assert_error((1, 3), (int, str), (str, int), (float, float))
    assert_error(A(), None, B)
    assert_error(A, A)
    assert_error(A, lambda aa: issubclass(aa, B))
    assert_error(135, I(int, lambda x: 0 <= x <= 100))
    assert_error({"foo": 1, "bar": "2"}, {"foo": int, "bar": U(int, float, None)})
    assert_error(3, 0, 2, 4)
    assert_error(None, numeric)
    assert_error("sss", numeric)
    assert_error(B(), I(A, B, C))
    assert_error(2, I(int, str))
    assert_error(0, I(int, NOT(0)))
    assert_error(None, NOT(None))
    assert_error((1, 3, "2", 3), Tuple(int))
    assert_error({"spam": 10}, Dict(spam=int, egg=int))
    assert_error({"egg": 5}, Dict(spam=int, egg=int))
    assert_error(False, h2oframe, pandas_dataframe, numpy_ndarray)
    assert_error([[2.0, 3.1, 0], [2, 4.4, 1.1], [-1, 0]], I([[numeric]], lambda v: all(len(vi) == len(v[0]) for vi in v)))

    try:
        # Cannot use `assert_error` here because typechecks module cannot detect args in (*args, *kwargs)
        assert_is_type(10000000, I(int, lambda port: 1 <= port <= 65535))
        assert False, "Failed to throw an exception"
    except H2OTypeError as e:
        assert "integer & 1 <= port <= 65535" in str(e), "Bad error message: '%s'" % e

    # assert_matches returns the match object, so groups can be inspected.
    url_regex = r"^(https?)://((?:[\w-]+\.)*[\w-]+):(\d+)/?$"
    assert_matches("Hello, world!", r"^(\w+), (\w*)!$")
    assert_matches("http://127.0.0.1:3233/", url_regex)
    m = assert_matches("https://localhost:54321", url_regex)
    assert m.group(1) == "https"
    assert m.group(2) == "localhost"
    assert m.group(3) == "54321"

    x = 5
    assert_satisfies(x, x < 1000)
    assert_satisfies(x, x ** x > 1000)
    assert_satisfies(url_regex, url_regex.lower() == url_regex)
    try:
        assert_satisfies(url_regex, url_regex.upper() == url_regex)
    except H2OValueError as e:
        assert "url_regex.upper() == url_regex" in str(e), "Error message is bad: " + str(e)
    else:
        # The condition is false, so a silent pass means the check is broken.
        raise AssertionError("Failed to throw an exception")

    # The pandas / numpy checks only run when both packages are installed.
    try:
        import pandas
        import numpy
        assert_is_type(pandas.DataFrame(), pandas_dataframe)
        assert_is_type(numpy.ndarray(shape=(5,)), numpy_ndarray)
    except ImportError:
        pass
def bernoulli_synthetic_data_mediumGBM(ip, port):
    """Compare the test AUC of an H2O GBM with a scikit-learn GBM.

    Both models are trained on the same synthetic data (adaptation of
    http://www.stat.missouri.edu/~speckman/stat461/boost.R) and the test
    fails when the two AUC values differ by 5e-3 or more.

    :param ip: address of the H2O cluster
    :param port: port of the H2O cluster
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Median of the chi-square distribution with 10 degrees of freedom.
    chisq_median = scipy.stats.chi2.ppf(0.5, 10)

    def label_rows(features):
        # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df, else -1
        row_sums = [sum(row) for row in np.multiply(features, features).tolist()]
        return np.asarray([1 if total > chisq_median else -1 for total in row_sums])

    # Training data: variables V1, ... V10 drawn from a standard normal.
    train_rows, train_cols = 10000, 10
    X_train = np.random.randn(train_rows, train_cols)
    y_train = label_rows(X_train)

    # Hyper-parameters shared by both models.
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20

    # Train scikit gbm
    gbm_sci = ensemble.GradientBoostingClassifier(
        learning_rate=learn_rate,
        n_estimators=ntrees,
        max_depth=max_depth,
        min_samples_leaf=min_rows,
        max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Testing data, generated the same way as the training data.
    test_rows, test_cols = 2000, 10
    X_test = np.random.randn(test_rows, test_cols)
    y_test = label_rows(X_test)

    # Score (AUC) the scikit gbm model on the test data
    positive_scores = gbm_sci.predict_proba(X_test)[:, 1]
    auc_sci = roc_auc_score(y_test, positive_scores)

    # Train H2O on the same data; the response is the first column (C1).
    train_h2o = H2OFrame(np.column_stack((y_train, X_train)).tolist())
    test_h2o = H2OFrame(np.column_stack((y_test, X_test)).tolist())
    gbm_h2o = h2o.gbm(
        x=train_h2o[1:],
        y=train_h2o["C1"].asfactor(),
        distribution=distribution,
        ntrees=ntrees,
        min_rows=min_rows,
        max_depth=max_depth,
        learn_rate=learn_rate,
        nbins=nbins)
    auc_h2o = gbm_h2o.model_performance(test_h2o).auc()

    assert abs(auc_h2o - auc_sci) < 5e-3, (
        "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} "
        "scickit auc: {1}".format(auc_h2o, auc_sci))
def _data_transform(data: InputData) -> H2OFrame:
    """Build an H2O frame from the features with the target as last column.

    :param data: input whose ``features`` and ``target`` arrays are joined;
        ``target`` is reshaped into a single column first
    :return: the frame created from the joined array
    """
    features = data.features
    target_column = data.target.reshape(-1, 1)
    stacked = np.concatenate((features, target_column), axis=1)
    return H2OFrame(python_obj=stacked)