def h2omake_metrics():
    """
    Python API test: h2o.make_metrics(predicted, actual, domain=None, distribution=None)

    Copied from pyunit_make_metrics.py
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr["CAPSULE"] = fr["CAPSULE"].asfactor()
    fr["RACE"] = fr["RACE"].asfactor()

    response = "RACE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=2, max_depth=3,
                                         min_rows=1, learn_rate=0.01, nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    predicted = h2o.assign(model.predict(fr)[1:], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = fr[response].levels()[0]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)

    assert_is_type(m1, H2OMultinomialModelMetrics)
    assert_is_type(m2, H2OMultinomialModelMetrics)
    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')
    df = h2o.get_frame(frame_id)
    column_header = params.get('column_header')
    df_head = None  # initialized so the cbind check below cannot raise a NameError
    if len(column_header) > 0:
        df_head = df[:int(column_header)]
        df = df[int(column_header):]
    pred_model = h2o.get_model(model_id)
    df_pred = pred_model.predict(df)
    df_pred.columns = [x[len('reconstr_'):] for x in df_pred.columns]
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    if to_bool(params.get('topn_output')):
        df_topn = get_topN(df_pred, int(params.get('topn_percent')))
        if df_head is not None:
            df_topn = df_head.cbind(df_topn)
        h2o.assign(df_topn, dest_frame_id)
        h2o.remove(str(df_pred.frame_id))
    else:
        h2o.assign(df_pred, dest_frame_id)
    return {'frame_id': dest_frame_id}
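# Hypothetical invocation sketch for executors of this shape: each one takes the
# h2o module plus two string-keyed dicts, and the keys below simply mirror the
# .get() lookups above (the frame/model ids and the suffix are illustrative,
# not from the source):
result = execute(h2o,
                 params={'column_header': '', 'suffix': 'pred',
                         'topn_output': 'false', 'topn_percent': '10'},
                 config={'frame_id': 'input.hex', 'model_id': 'autoencoder_model'})
print(result['frame_id'])  # e.g. 'input.hex_pred', depending on append_frame_id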
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    from h2o.estimators import H2OWord2vecEstimator
    w2v_model = H2OWord2vecEstimator(
        epochs=int(params.get('epochs')),
        init_learning_rate=float(params.get('init_learning_rate')),
        max_runtime_secs=float(params.get('max_runtime_secs')),
        min_word_freq=int(params.get('min_word_freq')),
        sent_sample_rate=float(params.get('sent_sample_rate')),
        vec_size=int(params.get('vec_size')),
        window_size=int(params.get('window_size')))
    w2v_model.train(training_frame=df)
    save_model(params, w2v_model.model_id)
    is_transform = params.get("is_transform")
    if is_transform is not None and to_bool(is_transform):
        df_vecs = w2v_model.transform(
            df, aggregate_method=params.get('aggregate_method'))
        dest_frame_id = append_frame_id(frame_id, params.get('transform_suffix'))
        h2o.assign(df_vecs, dest_frame_id)
    else:
        dest_frame_id = frame_id
    return {'frame_id': dest_frame_id, 'model_id': w2v_model.model_id}
def _fetch_state(aml_id, properties=None):
    state_json = h2o.api("GET /99/AutoML/%s" % aml_id)
    project_name = state_json["project_name"]
    if project_name is None:
        raise H2OValueError("No AutoML instance with id {}.".format(aml_id))

    leaderboard_list = [key["name"] for key in state_json['leaderboard']['models']]
    leader_id = leaderboard_list[0] if (leaderboard_list is not None and len(leaderboard_list) > 0) else None

    should_fetch = lambda prop: properties is None or prop in properties

    leader = None
    if should_fetch('leader'):
        leader = h2o.get_model(leader_id) if leader_id is not None else None

    leaderboard = None
    if should_fetch('leaderboard'):
        leaderboard = H2OAutoML._fetch_table(state_json['leaderboard_table'],
                                             key=project_name+"_leaderboard",
                                             progress_bar=False)
        # remove the index column and reassign the id to ensure persistence on the backend
        leaderboard = h2o.assign(leaderboard[1:], project_name+"_leaderboard")

    event_log = None
    if should_fetch('event_log'):
        event_log = H2OAutoML._fetch_table(state_json['event_log_table'],
                                           key=project_name+"_eventlog",
                                           progress_bar=False)
        # remove the index column and reassign the id to ensure persistence on the backend
        event_log = h2o.assign(event_log[1:], project_name+"_eventlog")

    return dict(
        project_name=project_name,
        json=state_json,
        leader_id=leader_id,
        leader=leader,
        leaderboard=leaderboard,
        event_log=event_log,
    )
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    target_column = params.get("target_column")
    analyzer = params.get("analyzer")
    if len(analyzer) > 0:
        url = params.get("url")
        df_token = df[target_column].tokenize(
            f'tokenize:elasticsearch:{url}?analyzer={analyzer}_analyzer')
    else:
        df_token = df[target_column].tokenize(params.get('regex'))
    if to_bool(params.get('lower_case')):
        df_token = df_token.tolower()
    min_word_len = int(params.get('min_word_len'))
    if min_word_len > 0:
        df_token = df_token[(df_token.nchar() >= min_word_len) | (df_token.isna()), :]
    if to_bool(params.get('use_stop_words')):
        df_token = df_token[(df_token.isna()) | (~df_token.isin(STOP_WORDS)), :]
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_token, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')
    df = h2o.get_frame(frame_id)
    input_columns = params.get("input_columns")
    if input_columns is None or len(input_columns) <= 2:
        input_columns = df.col_names
    else:
        input_columns = json.loads(input_columns)
    output_columns = params.get("output_columns")
    if output_columns is None or len(output_columns) <= 2:
        output_columns = []
    else:
        output_columns = json.loads(output_columns)
    pred_model = h2o.get_model(model_id)
    df_pred = pred_model.predict(df[input_columns])
    for col_name in output_columns:
        df_pred[col_name] = df[col_name]
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_pred, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    column = params.get('column')
    value = params.get('value')
    c_type = df.types[column]
    if c_type == 'real':
        value = float(value)
    elif c_type == 'int':
        value = int(value)
    elif c_type == 'enum':
        for c in df[column].categories():
            if value == c:
                value = c
                break
    row_conditions = params.get('row_conditions')
    if row_conditions is not None and len(row_conditions) > 0:
        mask = parse_row_condition(df, row_conditions)
        df[mask, column] = value
    else:
        df[column] = value
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df, dest_frame_id)
    return {'frame_id': dest_frame_id}
def frame_id(self, value):
    oldname = self.frame_id
    keep = self._ast is None
    if keep:
        # no pending computation: assign the frame's data under the new key
        h2o.assign(self, value)
    else:
        # lazy frame: update the local id and rename the key on the backend
        self._id = value
        h2o.rapids("(rename \"{}\" \"{}\")".format(oldname, value))
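# Minimal usage sketch for the setter above, assuming a running H2O cluster
# (the frame contents and the "my_frame" key are illustrative, not from the source):
import h2o

h2o.init()
fr = h2o.H2OFrame({"a": [1, 2, 3]})  # uploaded frame, so no pending AST
fr.frame_id = "my_frame"             # takes the h2o.assign() branch above
print(h2o.ls())                      # "my_frame" should now appear among the cluster keys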
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    df_pivot = df.pivot(index=params.get('index'),
                        column=params.get('column'),
                        value=params.get('value'))
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_pivot, dest_frame_id)
    return {'frame_id': dest_frame_id}
def prepare_data(seed=1):
    name = 'amldataset'
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"),
                         destination_frame=name)
    target = "CAPSULE"
    df[target] = df[target].asfactor()
    h2o.assign(df, name)
    fr = df.split_frame(ratios=[.8, .1],
                        destination_frames=[name+'_'+f for f in ['training', 'validation', 'leaderboard']],
                        seed=seed)
    train, valid, test = fr[0], fr[1], fr[2]
    return target, train, valid, test
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    df_fillna = df.fillna(method=params.get('method'),
                          axis=int(params.get('axis')),
                          maxlen=int(params.get('maxlen')))
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_fillna, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    bind_frame_id = params.get('bind_frame_id')
    df_2 = h2o.get_frame(bind_frame_id)
    df_bind = df.cbind(df_2)
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_bind, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    row_conditions = params.get('row_conditions')
    if row_conditions is not None and len(row_conditions) > 0:
        mask = parse_row_condition(df, row_conditions)
        df = df[mask, :]
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    column = params.get('column')
    ascending = to_bool(params.get('ascending'))
    df_sort = df.sort(by=[column], ascending=[ascending])
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_sort, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    columns = params.get('columns')
    # `and` (not `or`) here, otherwise len(None) raises a TypeError when no columns are given
    if columns is not None and len(columns) > 2:
        columns = json.loads(columns)
        df = df[columns]
    df_floor = df.floor()
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_floor, dest_frame_id)
    return {'frame_id': dest_frame_id}
def test_use_kfold_strategy_to_train_a_model_with_cv():
    # XXX: The TE KFold strategy allows TE to be trained only once in the context of model building with CV,
    # but it can't be applied just once on the training data. Otherwise, here is what happens when training
    # CV1, for example (fold1 = cv_holdout, f2-n = cv_train):
    # - column `cat_te` for cv_holdout is obtained using fold_1, so only with information collected
    #   from folds_2-n, which is what we want.
    # - column `cat_te` for cv_train, however, is obtained using fold_i, and each of those contains
    #   information about fold_1: this is a data leakage from cv_holdout into cv_train.
    # On top of this, the current version of transform uses a global priorMean for NAs, creating an
    # additional data leakage in the CV context.
    # The priorMean issue can be fixed internally in the implementation of the KFold strategy.
    # However, for proper CCV, we need a deep integration with the CV logic in ModelBuilder
    # (translated to Java, of course):
    # - train TE using the KFold strategy on the entire train set.
    # - then during CV, for each fold:
    #       train_cv_i = te.transform(train_cv, fold=fold_i)
    #       # so that train_cv_i is not encoded at all with encodings from other folds
    #       # (they include info about the current fold)
    #       test_cv_i = te.transform(test_cv, fold=fold_i)  # same
    # - finally, the final model is trained with TE applied on the whole training frame, either:
    #       train = te.transform(train, as_training=True)
    #       # still using the fold column; this ensures that the final feature is equivalent
    #       # to the one used in all the test_cv_i
    #   or:
    #       train = te.transform(train)
    #       # ignoring the fold column; this way the final te feature uses the entire train set.
    ds = load_dataset(incl_test=True, incl_foldc=True)
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")
    train_enc_cv = te.transform(ds.train, as_training=True)
    cols_to_remove = [n[:-3] for n in train_enc_cv.names if n.endswith("_te")]
    train_enc_cv = h2o.assign(train_enc_cv.drop(cols_to_remove), "train_enc_cv")
    train_enc_no_cv = te.transform(ds.train)
    train_enc_no_cv = h2o.assign(train_enc_no_cv.drop(cols_to_remove), "train_enc_no_cv")
    test_enc = te.transform(ds.test)
    test_enc = h2o.assign(test_enc.drop(cols_to_remove), "test_enc")
    print(train_enc_cv)
    print(train_enc_no_cv)

    gbm = H2OGradientBoostingEstimator(seed=seed)
    gbm.train(y=ds.target, training_frame=train_enc_cv, fold_column="foldc")
    auc_with_ccv = gbm.model_performance(test_enc).auc()
    print("AUC with CCV : %s" % auc_with_ccv)

    gbm.train(y=ds.target, training_frame=train_enc_no_cv, fold_column="foldc")
    auc_no_ccv = gbm.model_performance(test_enc).auc()
    print("AUC without CCV : %s" % auc_no_ccv)

    assert auc_with_ccv > auc_no_ccv
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    frames = params.get('frames')
    if frames is None or len(frames) <= 2:
        print("frames are empty.")
        sys.exit(1)
    frames = json.loads(frames)
    df_concat = df.concat([h2o.get_frame(x) for x in frames],
                          axis=int(params.get('axis')))
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_concat, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    columns = params.get('columns')
    if columns is None or len(columns) <= 2:
        columns = df.columns
    else:
        columns = json.loads(columns)
    df_filtered = df[columns]
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_filtered, dest_frame_id)
    return {'frame_id': dest_frame_id}
def _fetch_leaderboard(aml_id, extensions=None):
    assert_is_type(extensions, None, str, [str])
    extensions = ([] if extensions is None
                  else [extensions] if is_type(extensions, str)
                  else extensions)
    resp = h2o.api("GET /99/Leaderboards/%s" % aml_id, data=dict(extensions=extensions))
    dest_key = resp['project_name'].split('@', 1)[0] + "_custom_leaderboard"
    lb = H2OAutoML._fetch_table(resp['table'], key=dest_key, progress_bar=False)
    return h2o.assign(lb[1:], dest_key)
def h2omake_metrics_multinomial():
    """
    Python API test: h2o.make_metrics(predicted, actual, domain=None, distribution=None)

    Copied from pyunit_make_metrics.py
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr["CAPSULE"] = fr["CAPSULE"].asfactor()
    fr["RACE"] = fr["RACE"].asfactor()

    response = "RACE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=2, max_depth=3,
                                         min_rows=1, learn_rate=0.01, nbins=20,
                                         auc_type="MACRO_OVR")
    model.train(x=predictors, y=response, training_frame=fr)
    predicted = h2o.assign(model.predict(fr)[1:], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = fr[response].levels()[0]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain, auc_type="MACRO_OVR")
    m2 = h2o.make_metrics(predicted, actual, auc_type="MACRO_OVR")

    assert_is_type(m1, H2OMultinomialModelMetrics)
    assert_is_type(m2, H2OMultinomialModelMetrics)
    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
    assert abs(m0.auc() - m1.auc()) < 1e-5
    assert abs(m0.aucpr() - m1.aucpr()) < 1e-5
    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
    assert abs(m2.auc() - m1.auc()) < 1e-5
    assert abs(m2.aucpr() - m1.aucpr()) < 1e-5
def pyunit_assign():
    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()
    # `or` does not apply element-wise to H2OFrames; use `|` with parentheses instead
    PSA_outliers = pros[(pros["PSA"] <= pq[1, 1]) | (pros["PSA"] >= pq[1, 9])]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")

    pros.head(show=True)
    PSA_outliers.head(show=True)

    assert PSA_outliers._id == "PSA.outliers", \
        "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers._id)
def pyunit_assign(ip, port):
    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()
    # `or` does not apply element-wise to H2OFrames; use `|` with parentheses instead
    PSA_outliers = pros[(pros["PSA"] <= pq[1, 1]) | (pros["PSA"] >= pq[1, 9])]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")

    pros.head(show=True)
    PSA_outliers.head(show=True)

    assert PSA_outliers._id == "PSA.outliers", \
        "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers._id)
def h2oassign():
    """
    Python API test: h2o.assign(data, xid)
    """
    old_name = "benign.csv"
    new_name = "newBenign.csv"
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                                    destination_frame=old_name)
    assert training_data.frame_id == old_name, \
        "h2o.import_file() is not working. Wrong frame_id is assigned."
    temp = h2o.assign(training_data, new_name)
    assert_is_type(temp, H2OFrame)
    assert training_data.frame_id == new_name, \
        "h2o.assign() is not working. New frame_id is not assigned."
def pyunit_assign():
    pros = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()
    print('1st percentile for PSA:', pq[0, 7])
    print('99th percentile for PSA:', pq[8, 7])

    PSA_outliers = pros[((pros["PSA"] <= pq[0, 7]) | (pros["PSA"] >= pq[8, 7]))]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")

    print(pros)
    print(PSA_outliers)

    assert PSA_outliers.frame_id == "PSA.outliers", \
        "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers.frame_id)
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    columns = params.get('columns')
    if columns is not None and len(columns) > 2:
        columns = json.loads(columns)
        df = df[columns]
    use_value = params.get('use')
    if use_value is not None and len(use_value) == 0:
        use_value = None
    df_cor = df.cor(na_rm=to_bool(params.get('na_rm')),
                    use=use_value,
                    method=params.get('method'))
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_cor, dest_frame_id)
    return {'frame_id': dest_frame_id}
def javamunge_assembly(): h2o.remove_all() train = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv") test = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv") # lending-club munging assembly print("Import and Parse data") # Add "earliest_cr_line" and "issue_d" and cast as strings to aide Cliff's PR on 7/13 types = {"int_rate": "string", "revol_util": "string", "emp_length": "string", "earliest_cr_line": "string", "issue_d": "string", "last_credit_pull_d": "factor"} data = h2o.import_file(path=train, col_types=types) test = h2o.import_file(path=test, col_types=data.types) ## use the same data types as the training set for the test set test = test[[1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123],:] test = h2o.assign(test,"test") assembly = H2OAssembly( steps=[ # munge int_rate column in place # strip %, trim ws, convert to double ("intrate_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="int_rate", inplace=True, pattern="%", replacement="")), # strip % ("intrate_trim_ws", H2OColOp(op=H2OFrame.trim, col="int_rate", inplace=True)), # trim ws ("intrate_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="int_rate", inplace=True)), # string -> double # munge the revol_util in the same way as the int_rate column ("revol_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="revol_util", inplace=True, pattern="%", replacement="")), # strip % ("revol_trim_ws", H2OColOp(op=H2OFrame.trim, col="revol_util", inplace=True)), # trim ws ("revol_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="revol_util", inplace=True)), # string -> double # munge earliest_cr_line column (mm-YYYY format) # split into Month and Year columns ("earliest_cr_line_split", H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False, new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")), # split on '-' ("earliest_cr_line_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="earliest_cr_line_Year", inplace=True)), # string -> double # munge issue_d column in same way as earliest_cr_line column ("issue_date_split", H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False, new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")), # split on '-' ("issue_d_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="issue_d_Year", inplace=True)), # string -> double # do some munging of the emp_length column ("emp_length_rm_years", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")), # remove " year" and " years", also translate n/a to "" ("emp_length_trim", H2OColOp(op=H2OFrame.trim, col="emp_length", inplace=True)), # trim all the WS off ("emp_length_lt1_point5",H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="< 1", replacement="0.5")), # translate < 1 => 0.5 ("emp_length_10plus", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="10\\+", replacement="10")), # translate 10+ to 10 ("emp_length_as_numeric",H2OColOp(op=H2OFrame.asnumeric, col="emp_length", inplace=True)), # string -> double # compute credit length ("credit_length", H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False, new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year"))) ]) res = assembly.fit(data) pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
def test_loo_requires_target_to_encode_training_frame():
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out")
    te.train(y=ds.target, training_frame=ds.train)
    train_no_target = h2o.assign(ds.train.drop(ds.target), "train_no_target")
    assert train_no_target is not None
    try:
        te.transform(train_no_target, as_training=True)
        assert False, "should have raised"
    except Exception as e:
        assert "LeaveOneOut strategy requires a response column" in str(e)
    assert te.predict(train_no_target) is not None
def pyunit_assign():
    pros = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()
    print('1st percentile for PSA:', pq[0, 7])
    print('99th percentile for PSA:', pq[8, 7])

    PSA_outliers = pros[((pros["PSA"] <= pq[0, 7]) | (pros["PSA"] >= pq[8, 7]))]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")

    print(pros)
    print(PSA_outliers)

    assert PSA_outliers.frame_id == "PSA.outliers", \
        "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers.frame_id)
def javamunge_assembly(): h2o.remove_all() train = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv") test = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv") # lending-club munging assembly print("Import and Parse data") types = {"int_rate":"String", "revol_util":"String", "emp_length":"String"} data = h2o.import_file(path=train, col_types=types) test = h2o.import_file(path=test, col_types=types) test = test[[1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123],:] test = h2o.assign(test,"test") assembly = H2OAssembly( steps=[ # munge int_rate column in place # strip %, trim ws, convert to double ("intrate_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="int_rate", inplace=True, pattern="%", replacement="")), # strip % ("intrate_trim_ws", H2OColOp(op=H2OFrame.trim, col="int_rate", inplace=True)), # trim ws ("intrate_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="int_rate", inplace=True)), # string -> double # munge the revol_util in the same way as the int_rate column ("revol_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="revol_util", inplace=True, pattern="%", replacement="")), # strip % ("revol_trim_ws", H2OColOp(op=H2OFrame.trim, col="revol_util", inplace=True)), # trim ws ("revol_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="revol_util", inplace=True)), # string -> double # munge earliest_cr_line column (mm-YYYY format) # split into Month and Year columns ("earliest_cr_line_split", H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False, new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")), # split on '-' ("earliest_cr_line_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="earliest_cr_line_Year", inplace=True)), # string -> double # munge issue_d column in same way as earliest_cr_line column ("issue_date_split", H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False, new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")), # split on '-' ("issue_d_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="issue_d_Year", inplace=True)), # string -> double # do some munging of the emp_length column ("emp_length_rm_years", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")), # remove " year" and " years", also translate n/a to "" ("emp_length_trim", H2OColOp(op=H2OFrame.trim, col="emp_length", inplace=True)), # trim all the WS off ("emp_length_lt1_point5",H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="< 1", replacement="0.5")), # translate < 1 => 0.5 ("emp_length_10plus", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="10\\+", replacement="10")), # translate 10+ to 10 ("emp_length_as_numeric",H2OColOp(op=H2OFrame.asnumeric, col="emp_length", inplace=True)), # string -> double # compute credit length ("credit_length", H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False, new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year"))) ]) res = assembly.fit(data) pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
def _fetch_table(table, key=None, progress_bar=True):
    try:
        # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
        # If any failure happens, revert back to the user's original setting for progress and display the error message.
        ori_progress_state = H2OJob.__PROGRESS_BAR__
        H2OJob.__PROGRESS_BAR__ = progress_bar
        # Parse the leaderboard H2OTwoDimTable & return as an H2OFrame
        fr = h2o.H2OFrame(table.cell_values,
                          destination_frame=key,
                          column_names=table.col_header,
                          column_types=table.col_types)
        # remove the index column and reassign the id to ensure persistence on the backend
        return h2o.assign(fr[1:], key)
    finally:
        H2OJob.__PROGRESS_BAR__ = ori_progress_state
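# A minimal sketch of the "drop the index column, keep a stable key" pattern used
# in _fetch_table above (the frame contents and key names are illustrative and
# assume a running H2O cluster):
fr = h2o.H2OFrame([[0, "a"], [1, "b"], [2, "c"]],
                  column_names=["idx", "val"],
                  destination_frame="tmp_key")
# fr[1:] is a lazy column slice dropping "idx"; h2o.assign() materializes it
# under "stable_key" so the result survives on the backend
fr = h2o.assign(fr[1:], "stable_key")
print(fr.frame_id)  # "stable_key"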
import h2o

h2o.init()
datasets = "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
data = h2o.import_file(datasets + "iris_wheader.csv")
print(data.frame_id)  # e.g. "iris_wheader.hex", derived from the file name
data = data[:, 1:]
print(data.frame_id)  # an auto-generated key for the sliced frame
data = h2o.assign(data, "iris")
print(data.frame_id)  # "iris"
print(h2o.ls())
h2o.remove("iris_wheader.hex")
print(h2o.ls())
def split_frame(self, ratios=None, destination_frames=None, seed=None):
    """
    Split a frame into distinct subsets of size determined by the given ratios.

    The number of subsets is always 1 more than the number of ratios given.

    :param ratios: The fraction of rows for each split.
    :param destination_frames: names of the split frames
    :param seed: Random seed
    :return: a list of frames
    """
    if ratios is None:
        ratios = [0.75]
    if len(ratios) < 1:
        raise ValueError("Ratios must have length of at least 1")
    if destination_frames is not None:
        if (len(ratios) + 1) != len(destination_frames):
            raise ValueError("The number of provided destination_frames must be one more "
                             "than the number of provided ratios")

    num_slices = len(ratios) + 1
    boundaries = []
    last_boundary = 0
    i = 0
    while i < num_slices - 1:
        ratio = ratios[i]
        if ratio < 0:
            raise ValueError("Ratio must be greater than 0")
        boundary = last_boundary + ratio
        if boundary >= 1.0:
            raise ValueError("Ratios must add up to less than 1.0")
        boundaries.append(boundary)
        last_boundary = boundary
        i += 1

    splits = []
    tmp_runif = self.runif(seed)
    i = 0
    while i < num_slices:
        if i == 0:
            # lower_boundary is 0.0
            upper_boundary = boundaries[i]
            tmp_slice = self[(tmp_runif <= upper_boundary), :]
        elif i == num_slices - 1:
            lower_boundary = boundaries[i - 1]
            # upper_boundary is 1.0
            tmp_slice = self[(tmp_runif > lower_boundary), :]
        else:
            lower_boundary = boundaries[i - 1]
            upper_boundary = boundaries[i]
            tmp_slice = self[((tmp_runif > lower_boundary) & (tmp_runif <= upper_boundary)), :]
        if destination_frames is None:
            splits.append(tmp_slice)
        else:
            destination_frame_id = destination_frames[i]
            tmp_slice2 = h2o.assign(tmp_slice, destination_frame_id)
            splits.append(tmp_slice2)
        i += 1
    return splits
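# Usage sketch for split_frame above (assumes a running H2O cluster; the frame
# contents and destination keys are illustrative):
fr = h2o.H2OFrame({"x": list(range(100))})
train, valid, test = fr.split_frame(ratios=[0.7, 0.15],
                                    destination_frames=["tr", "va", "te"],
                                    seed=42)
# Split sizes are approximate: runif assigns each row independently, so expect
# roughly a 70/15/15 split rather than exact row counts.
print(train.nrow, valid.nrow, test.nrow)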
def pyunit_make_metrics(): fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv")) fr["CAPSULE"] = fr["CAPSULE"].asfactor() fr["RACE"] = fr["RACE"].asfactor() fr.describe() response = "AGE" predictors = list(set(fr.names) - {"ID", response}) print("\n\n======= REGRESSION ========\n") for distr in ["gaussian", "poisson", "laplace", "gamma"]: print("distribution: %s" % distr) model = H2OGradientBoostingEstimator(distribution=distr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.1, nbins=20) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr), "pred") actual = fr[response] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, distribution=distr) m2 = h2o.make_metrics(predicted, actual) print("model performance:") print(m0) print("make_metrics (distribution=%s):" % distr) print(m1) print("make_metrics (distribution=None):") print(m2) assert abs(m0.mae() - m1.mae()) < 1e-5 assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.mean_residual_deviance() - m1.mean_residual_deviance()) < 1e-5 assert abs(m2.mae() - m1.mae()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance()) < 1e-7) == (distr == "gaussian") print("\n\n======= BINOMIAL ========\n") response = "CAPSULE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator() model.train(x=predictors, y=response, distribution="bernoulli", training_frame=fr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20) predicted = h2o.assign(model.predict(fr)[2], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = ["0", "1"] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) print("m0:") print(m0) print("m1:") print(m1) print("m2:") print(m2) assert abs(m0.auc() - m1.auc()) < 1e-5 assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.logloss() - m1.logloss()) < 1e-5 assert abs(m0.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 assert abs(m2.auc() - m1.auc()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert abs(m2.logloss() - m1.logloss()) < 1e-5 assert abs(m2.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 print("\n\n======= MULTINOMIAL ========\n") response = "RACE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator() model.train(x=predictors, y=response, distribution="multinomial", training_frame=fr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20) predicted = h2o.assign(model.predict(fr)[1:], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = fr[response].levels()[0] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.logloss() - m1.logloss()) < 1e-5 assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert abs(m2.logloss() - m1.logloss()) < 1e-5 assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
data["C3"] = data["C3"].asnumeric() data["C4"] = data["C4"].asfactor() data["C5"] = data["C5"].asnumeric() data["C6"] = data["C6"].asfactor() data["C7"] = data["C7"].asfactor() data["C8"] = data["C8"].asfactor() data["C9"] = data["C9"].asfactor() data["C10"] = data["C10"].asfactor() data["C11"] = data["C11"].asnumeric() data["C12"] = data["C12"].asnumeric() data["C13"] = data["C13"].asnumeric() data["C14"] = data["C14"].asfactor() data["C15"] = data["C15"].asfactor() train, test = data.split_frame([0.8]) h2o.assign(train, "train_rf") h2o.assign(test, "test_rf") # Declare model m = H2ORandomForestEstimator(model_id="income_rf", ignore_const_cols=True, ntrees=100, stopping_metric="logloss", stopping_rounds=3, stopping_tolerance=0.02, max_runtime_secs=60, nfolds=10) m.train(x, y, train) performance = m.model_performance(test)
def pyunit_make_metrics(): fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv")) fr["CAPSULE"] = fr["CAPSULE"].asfactor() fr["RACE"] = fr["RACE"].asfactor() fr.describe() response = "AGE" predictors = list(set(fr.names) - {"ID", response}) print("\n\n======= REGRESSION ========\n") for distr in ["gaussian", "poisson", "laplace", "gamma"]: print("distribution: %s" % distr) model = H2OGradientBoostingEstimator(distribution=distr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.1, nbins=20) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr), "pred") actual = fr[response] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, distribution=distr) m2 = h2o.make_metrics(predicted, actual) print("model performance:") print(m0) print("make_metrics (distribution=%s):" % distr) print(m1) print("make_metrics (distribution=None):") print(m2) assert abs(m0.mae() - m1.mae()) < 1e-5 assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.mean_residual_deviance() - m1.mean_residual_deviance()) < 1e-5 assert abs(m0.rmsle() - m1.rmsle()) < 1e-5 assert abs(m2.mae() - m1.mae()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance()) < 1e-7) == (distr == "gaussian") assert abs(m2.rmsle() - m1.rmsle()) < 1e-5 print("\n\n======= BINOMIAL ========\n") response = "CAPSULE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20, seed=1) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr)[2], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = ["0", "1"] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) print("m0:") print(m0) print("m1:") print(m1) print("m2:") print(m2) # Testing base metric methods # FIXME: check the same failures for other ModelMetrics impl. and then fix'emall or move them out of base class... 
base_metrics_methods_failing_on_H2OBinomialModelMetrics = ['aic', 'mae', 'mean_per_class_error', 'mean_residual_deviance', 'rmsle'] for metric_method in (m for m in base_metric_methods if m not in base_metrics_methods_failing_on_H2OBinomialModelMetrics): m0mm = getattr(m0, metric_method)() m1mm = getattr(m1, metric_method)() m2mm = getattr(m2, metric_method)() assert m0mm == m1mm or abs(m0mm - m1mm) < 1e-5, \ "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method) assert m1mm == m2mm or abs(m1mm - m2mm) < 1e-5, \ "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method) # FIXME: for binomial mean_per_class_error is strangely accessible as an array assert abs(m0.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 assert abs(m2.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 failures = 0 for metric_method in base_metrics_methods_failing_on_H2OBinomialModelMetrics: for m in [m0, m1, m2]: try: assert isinstance(getattr(m, metric_method)(), float) except: failures += 1 assert failures == 3 * len(base_metrics_methods_failing_on_H2OBinomialModelMetrics) # Testing binomial-only metric methods binomial_only_metric_methods = ['accuracy', 'F0point5', 'F1', 'F2', 'mcc', 'max_per_class_error', 'mean_per_class_error', 'precision', 'recall', 'specificity', 'fallout', 'missrate', 'sensitivity', 'fpr', 'fnr', 'tpr', 'tnr'] failing_binomial_metrics = ['max_per_class_error', 'recall', 'specificity', 'fallout', 'missrate', 'sensitivity', 'fpr', 'fnr', 'tpr', 'tnr'] for metric_method in (m for m in binomial_only_metric_methods if m not in failing_binomial_metrics): # FIXME: not sure that returning a 2d-array is justified when not passing any threshold m0mm = getattr(m0, metric_method)()[0] m1mm = getattr(m1, metric_method)()[0] m2mm = getattr(m2, metric_method)()[0] assert m0mm == m1mm or abs(m0mm[1] - m1mm[1]) < 1e-5, \ "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method) assert m1mm == m2mm or abs(m1mm[1] - m2mm[1]) < 1e-5, \ "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method) failures = 0 for metric_method in failing_binomial_metrics: for m in [m0, m1, m2]: try: assert isinstance(getattr(m, metric_method)()[0][1], float) except: failures += 1 assert failures == 3 * len(failing_binomial_metrics) # Testing confusion matrix cm0 = m0.confusion_matrix(metrics=max_metrics) assert len(cm0) == len(max_metrics) assert all([any(m in header for header in map(lambda cm: cm.table._table_header, cm0) for m in max_metrics)]), \ "got duplicate CM headers, although all metrics are different" cm0t = m0.confusion_matrix(metrics=max_metrics, thresholds=[.3, .6]) assert len(cm0t) == 2 + len(max_metrics) assert 2 == sum([not any(m in header for m in max_metrics) for header in map(lambda cm: cm.table._table_header, cm0t)]), \ "missing or duplicate headers without metric (thresholds only CMs)" assert all([any(m in header for header in map(lambda cm: cm.table._table_header, cm0t) for m in max_metrics)]), \ "got duplicate CM headers, although all metrics are different" print("\n\n======= MULTINOMIAL ========\n") response = "RACE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr)[1:], 
"pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = fr[response].levels()[0] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.logloss() - m1.logloss()) < 1e-5 assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert abs(m2.logloss() - m1.logloss()) < 1e-5 assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
def pyunit_make_metrics(): fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv")) fr["CAPSULE"] = fr["CAPSULE"].asfactor() fr["RACE"] = fr["RACE"].asfactor() fr.describe() response = "AGE" predictors = list(set(fr.names) - {"ID", response}) print("\n\n======= REGRESSION ========\n") for distr in ["gaussian", "poisson", "laplace", "gamma"]: print("distribution: %s" % distr) model = H2OGradientBoostingEstimator(distribution=distr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.1, nbins=20) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr), "pred") actual = fr[response] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, distribution=distr) m2 = h2o.make_metrics(predicted, actual) print("model performance:") print(m0) print("make_metrics (distribution=%s):" % distr) print(m1) print("make_metrics (distribution=None):") print(m2) assert abs(m0.mae() - m1.mae()) < 1e-5 assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.mean_residual_deviance() - m1.mean_residual_deviance()) < 1e-5 assert abs(m0.rmsle() - m1.rmsle()) < 1e-5 assert abs(m2.mae() - m1.mae()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance()) < 1e-7) == (distr == "gaussian") assert abs(m2.rmsle() - m1.rmsle()) < 1e-5 print("\n\n======= BINOMIAL ========\n") response = "CAPSULE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20, seed=1) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr)[2], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = ["0", "1"] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) print("m0:") print(m0) print("m1:") print(m1) print("m2:") print(m2) assert m0.accuracy()[0][1] + m0.error()[0][1] == 1 assert len(m0.accuracy(thresholds='all')) == len(m0.fprs) assert m0.accuracy().value == m1.accuracy().value == m0.accuracy()[0][1] assert m0.accuracy().value + m0.error().value == 1 assert isinstance(m0.accuracy(thresholds=0.4).value, float) assert m0.accuracy(thresholds=0.4).value == m1.accuracy( thresholds=0.4).value == m0.accuracy(thresholds=0.4)[0][1] assert m0.accuracy(thresholds=0.4).value + m0.error( thresholds=0.4).value == 1 assert isinstance(m0.accuracy(thresholds=[0.4]).value, list) assert len(m0.accuracy(thresholds=[0.4]).value) == 1 assert m0.accuracy(thresholds=[0.4]).value[0] == m0.accuracy( thresholds=0.4).value assert isinstance(m0.accuracy(thresholds=[0.4, 0.5]).value, list) assert len(m0.accuracy(thresholds=[0.4, 0.5]).value) == 2 assert m0.accuracy(thresholds=[0.4, 0.5]).value == [ m0.accuracy(thresholds=0.4).value, m0.accuracy(thresholds=0.5).value ] # Testing base metric methods # FIXME: check the same failures for other ModelMetrics impl. and then fix'emall or move them out of base class... 
base_metrics_methods_failing_on_H2OBinomialModelMetrics = [ 'aic', 'mae', 'mean_per_class_error', 'mean_residual_deviance', 'rmsle' ] for metric_method in ( m for m in base_metric_methods if m not in base_metrics_methods_failing_on_H2OBinomialModelMetrics ): m0mm = getattr(m0, metric_method)() m1mm = getattr(m1, metric_method)() m2mm = getattr(m2, metric_method)() assert m0mm == m1mm or abs(m0mm - m1mm) < 1e-5, \ "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method) assert m1mm == m2mm or abs(m1mm - m2mm) < 1e-5, \ "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method) # FIXME: for binomial mean_per_class_error is strangely accessible as an array assert abs(m0.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 assert abs(m2.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 failures = 0 for metric_method in base_metrics_methods_failing_on_H2OBinomialModelMetrics: for m in [m0, m1, m2]: try: assert isinstance(getattr(m, metric_method)(), float) except: failures += 1 assert failures == 3 * len( base_metrics_methods_failing_on_H2OBinomialModelMetrics) # Testing binomial-only metric methods binomial_only_metric_methods = [ 'accuracy', 'F0point5', 'F1', 'F2', 'mcc', 'max_per_class_error', 'mean_per_class_error', 'precision', 'recall', 'specificity', 'fallout', 'missrate', 'sensitivity', 'fpr', 'fnr', 'tpr', 'tnr' ] for metric_method in (m for m in binomial_only_metric_methods): # FIXME: not sure that returning a 2d-array is justified when not passing any threshold m0mm = getattr(m0, metric_method)()[0] m1mm = getattr(m1, metric_method)()[0] m2mm = getattr(m2, metric_method)()[0] assert m0mm == m1mm or abs(m0mm[1] - m1mm[1]) < 1e-5, \ "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method) assert m1mm == m2mm or abs(m1mm[1] - m2mm[1]) < 1e-5, \ "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method) # Testing confusion matrix cm0 = m0.confusion_matrix(metrics=max_metrics) assert len(cm0) == len(max_metrics) assert all([any(m in header for header in map(lambda cm: cm.table._table_header, cm0) for m in max_metrics)]), \ "got duplicate CM headers, although all metrics are different" cm0t = m0.confusion_matrix(metrics=max_metrics, thresholds=[.3, .6]) assert len(cm0t) == 2 + len(max_metrics) assert 2 == sum([not any(m in header for m in max_metrics) for header in map(lambda cm: cm.table._table_header, cm0t)]), \ "missing or duplicate headers without metric (thresholds only CMs)" assert all([any(m in header for header in map(lambda cm: cm.table._table_header, cm0t) for m in max_metrics)]), \ "got duplicate CM headers, although all metrics are different" print("\n\n======= MULTINOMIAL ========\n") response = "RACE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr)[1:], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = fr[response].levels()[0] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.logloss() - m1.logloss()) < 1e-5 assert abs(m0.mean_per_class_error() - 
m1.mean_per_class_error()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert abs(m2.logloss() - m1.logloss()) < 1e-5 assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
def pyunit_make_metrics(): fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv")) fr["CAPSULE"] = fr["CAPSULE"].asfactor() fr["RACE"] = fr["RACE"].asfactor() fr.describe() response = "AGE" predictors = list(set(fr.names) - {"ID", response}) print("\n\n======= REGRESSION ========\n") for distr in ["gaussian", "poisson", "laplace", "gamma"]: print("distribution: %s" % distr) model = H2OGradientBoostingEstimator(distribution=distr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.1, nbins=20) model.train(x=predictors, y=response, training_frame=fr) predicted = h2o.assign(model.predict(fr), "pred") actual = fr[response] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, distribution=distr) m2 = h2o.make_metrics(predicted, actual) print("model performance:") print(m0) print("make_metrics (distribution=%s):" % distr) print(m1) print("make_metrics (distribution=None):") print(m2) assert abs(m0.mae() - m1.mae()) < 1e-5 assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.mean_residual_deviance() - m1.mean_residual_deviance()) < 1e-5 assert abs(m0.rmsle() - m1.rmsle()) < 1e-5 assert abs(m2.mae() - m1.mae()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance()) < 1e-7) == (distr == "gaussian") assert abs(m2.rmsle() - m1.rmsle()) < 1e-5 print("\n\n======= BINOMIAL ========\n") response = "CAPSULE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator() model.train(x=predictors, y=response, distribution="bernoulli", training_frame=fr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20) predicted = h2o.assign(model.predict(fr)[2], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = ["0", "1"] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) print("m0:") print(m0) print("m1:") print(m1) print("m2:") print(m2) assert abs(m0.auc() - m1.auc()) < 1e-5 assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.logloss() - m1.logloss()) < 1e-5 assert abs(m0.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 assert abs(m2.auc() - m1.auc()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert abs(m2.logloss() - m1.logloss()) < 1e-5 assert abs(m2.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5 print("\n\n======= MULTINOMIAL ========\n") response = "RACE" predictors = list(set(fr.names) - {"ID", response}) model = H2OGradientBoostingEstimator() model.train(x=predictors, y=response, distribution="multinomial", training_frame=fr, ntrees=2, max_depth=3, min_rows=1, learn_rate=0.01, nbins=20) predicted = h2o.assign(model.predict(fr)[1:], "pred") actual = h2o.assign(fr[response].asfactor(), "act") domain = fr[response].levels()[0] m0 = model.model_performance(train=True) m1 = h2o.make_metrics(predicted, actual, domain=domain) m2 = h2o.make_metrics(predicted, actual) assert abs(m0.mse() - m1.mse()) < 1e-5 assert abs(m0.rmse() - m1.rmse()) < 1e-5 assert abs(m0.logloss() - m1.logloss()) < 1e-5 assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5 assert abs(m2.mse() - m1.mse()) < 1e-5 assert abs(m2.rmse() - m1.rmse()) < 1e-5 assert abs(m2.logloss() - m1.logloss()) < 1e-5 assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    train = int(params.get('train_ratio'))
    test = params.get('test_ratio')
    if test is None or len(test) == 0:
        test = 0
    else:
        test = int(test)
    valid = params.get('valid_ratio')
    if valid is None or len(valid) == 0:
        valid = 0
    else:
        valid = int(valid)
    seed = params.get('seed')
    if seed is None or len(seed) == 0:
        seed = None
    else:
        seed = int(seed)

    train_ratio = train / (train + test + valid)
    test_ratio = test / (train + test + valid)
    valid_ratio = valid / (train + test + valid)

    if valid == 0 and test == 0:
        return {'frame_id': frame_id}
    elif valid == 0:
        df_train, df_test = df.split_frame(ratios=[train_ratio], seed=seed)
        df_valid = None
    elif test == 0:
        df_train, df_valid = df.split_frame(ratios=[train_ratio], seed=seed)
        df_test = None
    else:
        df_train, df_test, df_valid = df.split_frame(
            ratios=[train_ratio, test_ratio], seed=seed)

    train_frame_id = append_frame_id(frame_id, params.get('train_suffix'))
    h2o.assign(df_train, train_frame_id)
    if df_test is None:
        test_frame_id = None
    else:
        test_frame_id = append_frame_id(frame_id, params.get('test_suffix'))
        h2o.assign(df_test, test_frame_id)
    if df_valid is None:
        valid_frame_id = None
    else:
        valid_frame_id = append_frame_id(frame_id, params.get('valid_suffix'))
        h2o.assign(df_valid, valid_frame_id)
    return {
        'frame_id': train_frame_id,
        'train_frame_id': train_frame_id,
        'test_frame_id': test_frame_id,
        'valid_frame_id': valid_frame_id,
    }
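# Hypothetical invocation sketch for the split executor above. The params keys
# mirror the .get() lookups in the code; the frame id, weights, and suffixes are
# illustrative. Ratios are given as integer weights (here 8:1:1) and normalized
# internally before calling split_frame:
result = execute(h2o,
                 params={'train_ratio': '8', 'test_ratio': '1', 'valid_ratio': '1',
                         'seed': '42', 'train_suffix': 'train',
                         'test_suffix': 'test', 'valid_suffix': 'valid'},
                 config={'frame_id': 'input.hex'})
print(result['train_frame_id'], result['test_frame_id'], result['valid_frame_id'])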
def pyunit_apply_assign():
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    bool_fr = fr.apply(lambda x: x['PSA'] > x['VOL'], axis=1)
    h2o.assign(fr.cbind(bool_fr), 'supp_fr')
    print(h2o.get_frame('supp_fr'))