def import_svmlight(path, headers=""):
    """
    Import an SVMLight file into H2O and return the frame without its first column.

    The file is lazily imported, parsed with ``parse_type`` forced to ``'SVMLight'``,
    renamed with ``headers`` and finally stripped of its first ('pseudotarget')
    column.

    :param path: location of the SVMLight file; handed to ``h2o.lazy_import`` as-is.
    :param headers: column names for the parsed frame, including a name for the
        leading pseudotarget column. NOTE: when more names than parsed columns
        are supplied, the surplus trailing entries are deleted from this object
        IN PLACE, so the caller's list is shortened too (and it has to support
        slice deletion, e.g. a list).
        NOTE(review): the default ``""`` is forwarded to ``set_names`` unchanged;
        presumably callers always pass a list -- confirm.
    :return: the parsed frame after removal of its first column.
    """
    raw = h2o.lazy_import(path)
    if settings.debug and len(headers) < 100:
        print(utils.time() + "import with headers: " + str(headers))

    # The names are deliberately not passed via parse_setup(column_names=...):
    # per the original author's note, H2O 3.8 tests the header length against
    # the column count but still imports the "pseudotarget" additionally.
    parsesetup = h2o.parse_setup(raw)
    parsesetup['parse_type'] = 'SVMLight'
    loaded_frame = h2o.parse_raw(parsesetup)

    if settings.debug:
        print("......HEader length: " + str(len(headers)))
        print("......Frame imported: " + str(loaded_frame.ncol))

    # Drop header entries for which no column was parsed.
    surplus = len(headers) - loaded_frame.ncol
    if surplus > 0:
        print("Remove last " + str(surplus) + " header entries")
        del headers[-surplus:]

    loaded_frame.set_names(headers)  # workaround: set the names after parsing
    # Reading a name back is needed because of lazy name setting.
    print("First column: " + loaded_frame.names[0])

    show_preview = settings.debug and len(headers) < 100
    if show_preview:
        loaded_frame.head(show=True)

    loaded_frame.pop(0)  # remove first ('pseudotarget') column

    # A former workaround that removed every parsed column not covered by
    # `headers` was disabled by the original author and is not applied here.
    if show_preview:
        loaded_frame.head(show=True)
    return loaded_frame
def h2olazy_import():
    """
    Python API test: h2o.lazy_import(path)

    Lazily imports the prostate_cat.csv sample file and checks that the value
    handed back is a list.
    """
    csv_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    imported = h2o.lazy_import(csv_path)
    assert_is_type(imported, list)
def __init__(self, mojo_path=None):
    """
    Create a new H2OMojoPipeline object.

    The MOJO file is lazily imported into H2O and the result of that import is
    kept on the instance as ``pipeline_id``.

    :param mojo_path: path to a MOJO file.
        NOTE(review): the default is ``None`` while the check below requires a
        ``str`` -- presumably a path must always be supplied; confirm.
    """
    assert_is_type(mojo_path, str)
    imported = h2o.lazy_import(mojo_path)
    self.pipeline_id = imported
def h2olazy_import():
    """
    Python API test: h2o.lazy_import(path)

    Lazily imports the prostate_cat.csv sample file and fails with an
    AssertionError that names the underlying error if the import raises.
    """
    # Resolved outside the try block so that a missing data file is not
    # misreported as a lazy_import failure.
    csv_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    try:
        h2o.lazy_import(csv_path)
    except Exception as e:
        # Raised explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let the failure pass silently.
        # The cause is included so the real error is not lost.
        raise AssertionError(
            "h2o.lazy_import() command is not working. Cause: {0!r}".format(e))
def continuous_or_categorical():
    """
    Parse hexdev_29.csv with its first three columns forced to ENUM and check
    that columns 'h1', 'h2' and 'h3' of the parsed frame report as factors.
    """
    raw_key = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
    setup = h2o.parse_setup(raw_key)

    # Override the guessed type of columns 0..2 before parsing.
    for col_idx in range(3):
        setup["column_types"][col_idx] = "ENUM"

    parsed = h2o.parse_raw(setup)
    parsed.summary()

    for col_name in ("h1", "h2", "h3"):
        assert (parsed[col_name].isfactor())
def from_file(file=str):
    """
    Creates new Generic model by loading existing embedded model into library, e.g. from H2O MOJO.
    The imported model must be supported by H2O.

    The file is lazily imported, the first imported key is fetched as a frame,
    and a generic estimator built on that frame is trained and returned.

    :param file: A string containing path to the file to create the model from.
        NOTE(review): the default value is the ``str`` type itself, not a path --
        presumably meant as a type hint; a path has to be passed explicitly.
    :return: H2OGenericEstimator instance representing the generic model
    """
    from h2o import get_frame, lazy_import

    imported_keys = lazy_import(file)
    mojo_bytes = get_frame(imported_keys[0])
    generic_model = H2OGenericEstimator(model_key=mojo_bytes)
    generic_model.train()
    return generic_model
def hexdev_394():
    """
    Train a multinomial GBM on two independent splits of the covtype 20k data.

    Columns 10-12 are forced to ENUM at parse time, every column except "C55"
    is used as a predictor and column index 54 (converted to a factor) is the
    response. The train/validation parts are obtained first by indexing the
    result of ``split_frame()`` and then by unpacking it; finally the frame's
    ``_keep`` attribute is printed.
    """
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    tsetup["column_types"][10] = "ENUM"
    tsetup["column_types"][11] = "ENUM"
    tsetup["column_types"][12] = "ENUM"
    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # This returned space for first column name
    x_cols = [colname for colname in cols if colname != "C55"]

    # First pass: index into the list returned by split_frame().
    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain_x = newtrain[x_cols]
    newtrain_y = newtrain[54].asfactor()
    newvalid_x = newvalid[x_cols]
    newvalid_y = newvalid[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    # Second pass: unpack a fresh split directly. The statements are kept
    # sequential (not factored into a helper) so that the order in which the
    # intermediate frames are created and rebound stays as it was.
    split1, split2 = train.split_frame()
    newtrain_x = split1[x_cols]
    newtrain_y = split1[54].asfactor()
    newvalid_x = split2[x_cols]
    newvalid_y = split2[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("KEEPING FRAME???")
    print(train._keep)
def continuous_or_categorical():
    """
    Import hexdev_29.csv, override the parse setup so that the first three
    columns are typed ENUM, and assert that 'h1', 'h2' and 'h3' are factors
    in the resulting frame.
    """
    data_path = h2o.locate("smalldata/jira/hexdev_29.csv")
    parse_config = h2o.parse_setup(h2o.lazy_import(data_path))

    for position in (0, 1, 2):
        parse_config["column_types"][position] = "ENUM"

    frame = h2o.parse_raw(parse_config)
    frame.summary()

    assert (frame['h1'].isfactor())
    assert (frame['h2'].isfactor())
    assert (frame['h3'].isfactor())
def read_csv(file_path, destination_frame, header=(-1, 0, 1), separator="", column_names=None, column_types=None, na_strings=None):
    """
    Build an H2OFrame from parsing a CSV at file_path.  This path is relative to the
    H2O cluster, NOT the local Python process

    After parsing, a short summary is reported: a printed line for a single
    ``str`` path, or an ``h2o.H2ODisplay`` table with one "FileN" row per
    entry when ``file_path`` is any other iterable.

    :param file_path: A remote path to a data source.  Data is cluster-local.
    :param destination_frame: The result *Key* name in the H2O cluster
    :param header: forwarded unchanged to ``H2OFrame._parse``
    :param separator: forwarded unchanged to ``H2OFrame._parse``
    :param column_names: forwarded unchanged to ``H2OFrame._parse``
    :param column_types: forwarded unchanged to ``H2OFrame._parse``
    :param na_strings: forwarded unchanged to ``H2OFrame._parse``
    :return: the object returned by ``H2OFrame._parse``
    """
    rawkey = h2o.lazy_import(file_path)
    res = H2OFrame._parse(rawkey, destination_frame, header, separator,
                          column_names, column_types, na_strings)

    # Row/column counts with thousands separators, shared by both reports.
    parsed_msg = "Parsed {:,} rows and {:,} cols".format(res.nrow, res.ncol)

    if isinstance(file_path, str):
        # Single-argument parenthesised print: same output on Python 2 and 3.
        print("Imported {}. ".format(file_path) + parsed_msg)
    else:
        file_rows = [["File" + str(i + 1), f] for i, f in enumerate(file_path)]
        h2o.H2ODisplay(file_rows, None, parsed_msg)
    return res
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    """
    Round-trip a Deep Learning model through its MOJO and a generic model.

    Trains a one-epoch H2ODeepLearningEstimator on airlines_train.csv,
    downloads its MOJO, loads it back as an H2OGenericEstimator (once via
    lazy_import/get_frame and once via ``from_file``) and checks parameters,
    ``show()`` output, predictions and the model summary. Finally the MOJO
    downloaded from the generic model must have the same size as the original.

    :param x: predictor columns passed to ``train``.
    :param y: response column passed to ``train``.
    :param output_test: callback comparing the two captured ``show()`` outputs.
    :param strip_part: forwarded to ``output_test``.
    :param algo_name: forwarded to ``output_test``.
    :param generic_algo_name: forwarded to ``output_test``.
    """

    def check_scoring_and_summary(model, frame):
        # Predictions must exist with the expected row count, and the model
        # output must carry a non-empty model summary.
        scored = model.predict(frame)
        assert scored is not None
        assert scored.nrows == 24421
        assert model._model_json["output"]["model_summary"] is not None
        assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))

    dl_model = H2ODeepLearningEstimator(epochs=1)
    dl_model.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(dl_model)
    with Capturing() as original_output:
        dl_model.show()

    mojo_dir = tempfile.mkdtemp()
    original_model_filename = dl_model.download_mojo(mojo_dir)

    imported_keys = h2o.lazy_import(original_model_filename)
    mojo_frame = h2o.get_frame(imported_keys[0])
    generic_mojo_model = H2OGenericEstimator(model_key=mojo_frame)
    generic_mojo_model.train()
    compare_params(dl_model, generic_mojo_model)
    print(generic_mojo_model)
    with Capturing() as generic_output:
        generic_mojo_model.show()

    output_test(str(original_output), str(generic_output), strip_part,
                algo_name, generic_algo_name)
    check_scoring_and_summary(generic_mojo_model, airlines)

    # Test constructor generating the model from existing MOJO file
    generic_mojo_model_from_file = H2OGenericEstimator.from_file(
        original_model_filename)
    assert generic_mojo_model_from_file is not None
    check_scoring_and_summary(generic_mojo_model_from_file, airlines)

    generic_mojo_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(
        path=generic_mojo_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def stackedensemble_mojo_model_test():
    """
    Round-trip a Stacked Ensemble (GBM + random forest base models, trained on
    the iris data) through its MOJO and an H2OGenericEstimator.

    The generic model is built once via lazy_import/get_frame and once via
    ``from_file``; both must produce predictions on the test frame, and the
    MOJO downloaded from the generic model must match the original in size.
    """
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    nfolds = 2

    # Both base models share the same cross-validation configuration.
    cv_settings = dict(nfolds=nfolds,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True)

    gbm = H2OGradientBoostingEstimator(**cv_settings)
    gbm.train(x=x, y=y, training_frame=train)

    rf = H2ORandomForestEstimator(**cv_settings)
    rf.train(x=x, y=y, training_frame=train)

    base_model_ids = [gbm.model_id, rf.model_id]
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=base_model_ids)
    se.train(x=x, y=y, training_frame=train)
    print(se)
    with Capturing() as original_output:
        se.show()

    mojo_dir = tempfile.mkdtemp()
    original_model_filename = se.download_mojo(mojo_dir)

    imported_keys = h2o.lazy_import(original_model_filename)
    mojo_frame = h2o.get_frame(imported_keys[0])
    generic_mojo_model = H2OGenericEstimator(model_key=mojo_frame)
    generic_mojo_model.train()
    compare_params(se, generic_mojo_model)

    predictions = generic_mojo_model.predict(test)
    assert predictions is not None

    # Test constructor generating the model from existing MOJO file
    generic_mojo_model_from_file = H2OGenericEstimator.from_file(
        original_model_filename)
    assert generic_mojo_model_from_file is not None
    predictions = generic_mojo_model_from_file.predict(test)
    assert predictions is not None

    generic_mojo_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(
        path=generic_mojo_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    """
    Round-trip an XGBoost model through its MOJO and a generic model.

    Trains a one-tree, 3-fold H2OXGBoostEstimator on airlines_train.csv,
    downloads its MOJO, loads it back as an H2OGenericEstimator (via
    lazy_import/get_frame and via ``from_file``) and checks the ``show()``
    output, predictions, variable importances and model summary. Finally the
    MOJO downloaded from the generic model must match the original in size.

    :param x: predictor columns passed to ``train``.
    :param y: response column passed to ``train``.
    :param output_test: callback comparing the two captured ``show()`` outputs.
    :param strip_part: forwarded to ``output_test``.
    :param algo_name: forwarded to ``output_test``.
    :param generic_algo_name: forwarded to ``output_test``.
    """

    def verify_generic(generic, frame):
        # Same checks for both ways of constructing the generic model.
        scored = generic.predict(frame)
        assert scored is not None
        assert scored.nrows == 24421
        assert generic._model_json["output"]["variable_importances"] is not None
        assert len(
            generic._model_json["output"]["variable_importances"]._cell_values) > 0
        assert generic._model_json["output"]["model_summary"] is not None
        assert len(generic._model_json["output"]["model_summary"]._cell_values) > 0

    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))

    xgb = H2OXGBoostEstimator(ntrees=1, nfolds=3)
    xgb.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(xgb)
    with Capturing() as original_output:
        xgb.show()

    download_dir = tempfile.mkdtemp()
    original_model_filename = xgb.download_mojo(download_dir)

    imported_keys = h2o.lazy_import(original_model_filename)
    mojo_frame = h2o.get_frame(imported_keys[0])
    model = H2OGenericEstimator(model_key=mojo_frame)
    model.train()
    print(model)
    with Capturing() as generic_output:
        model.show()

    output_test(str(original_output), str(generic_output), strip_part,
                algo_name, generic_algo_name)
    verify_generic(model, airlines)

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    verify_generic(model, airlines)

    generic_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def hexdev_394():
    """
    Fit a multinomial GBM twice on train/validation splits of covtype.20k.data.

    At parse time columns 10, 11 and 12 are typed ENUM. Predictors are all
    columns other than "C55"; the response is column index 54 as a factor.
    The first fit takes its splits by indexing ``split_frame()``'s result, the
    second by unpacking it. The frame's ``_keep`` attribute is printed last.
    """
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    tsetup["column_types"][10] = "ENUM"
    tsetup["column_types"][11] = "ENUM"
    tsetup["column_types"][12] = "ENUM"
    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # This returned space for first column name
    x_cols = [colname for colname in cols if colname != "C55"]

    # Splits accessed by index.
    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain_x = newtrain[x_cols]
    newtrain_y = newtrain[54].asfactor()
    newvalid_x = newvalid[x_cols]
    newvalid_y = newvalid[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    # Splits accessed by tuple unpacking. Kept as straight-line code so the
    # creation/rebinding order of the intermediate frames is unchanged.
    split1, split2 = train.split_frame()
    newtrain_x = split1[x_cols]
    newtrain_y = split1[54].asfactor()
    newvalid_x = split2[x_cols]
    newvalid_y = split2[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    # Parenthesised single-argument prints work on both Python 2 and 3.
    print("KEEPING FRAME???")
    print(train._keep)
def mojo_model_test():
    """
    Round-trip a one-tree GBM (Origin/Dest -> IsDepDelayed on the airlines
    training data) through its MOJO and an H2OGenericEstimator.

    The generic model is created via lazy_import/get_frame and again via
    ``from_file``; each must predict 24421 rows and expose non-empty variable
    importances and model summary. The MOJO re-downloaded from the generic
    model must have the same file size as the original.
    """

    def assert_generic_model_ok(generic, frame):
        # Checks shared by both construction paths.
        scored = generic.predict(frame)
        assert scored is not None
        assert scored.nrows == 24421
        assert generic._model_json["output"]["variable_importances"] is not None
        assert len(
            generic._model_json["output"]["variable_importances"]._cell_values) > 0
        assert generic._model_json["output"]["model_summary"] is not None
        assert len(generic._model_json["output"]["model_summary"]._cell_values) > 0

    # GBM
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    target_dir = tempfile.mkdtemp()
    original_model_filename = gbm.download_mojo(target_dir)

    imported_keys = h2o.lazy_import(original_model_filename)
    mojo_frame = h2o.get_frame(imported_keys[0])
    model = H2OGenericEstimator(model_key=mojo_frame)
    model.train()
    assert_generic_model_ok(model, airlines)

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    assert_generic_model_ok(model, airlines)

    generic_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def mojo_model_test():
    """
    GBM MOJO round-trip test.

    A single-tree GBM is trained on airlines_train.csv, exported as a MOJO and
    re-loaded as an H2OGenericEstimator, first from the lazily imported key
    and then through ``H2OGenericEstimator.from_file``. Each generic model has
    to score all 24421 rows and report non-empty variable importances and
    model summary; the MOJO exported from the generic model must be the same
    size as the one exported from the GBM.
    """

    def check_predictions_and_tables(generic, frame):
        scored = generic.predict(frame)
        assert scored is not None
        assert scored.nrows == 24421
        # Same order as before: variable importances first, then the summary.
        for section in ("variable_importances", "model_summary"):
            assert generic._model_json["output"][section] is not None
            assert len(generic._model_json["output"][section]._cell_values) > 0

    # GBM
    data_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=data_path)
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    original_model_filename = gbm.download_mojo(tempfile.mkdtemp())

    first_key = h2o.lazy_import(original_model_filename)[0]
    model = H2OGenericEstimator(model_key=h2o.get_frame(first_key))
    model.train()
    check_predictions_and_tables(model, airlines)

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    check_predictions_and_tables(model, airlines)

    generic_mojo_filename = model.download_mojo(
        path=tempfile.mkdtemp("zip", "genericMojo"))
    generic_size = os.path.getsize(generic_mojo_filename)
    assert generic_size == os.path.getsize(original_model_filename)
def _import_parse(path, destination_frame, header, sep, column_names, column_types, na_strings):
    """
    Lazily import ``path`` and parse the imported key via ``H2OFrame._parse``.

    :param path: location handed to ``h2o.lazy_import``.
    :param destination_frame: forwarded unchanged to ``H2OFrame._parse``.
    :param header: forwarded unchanged to ``H2OFrame._parse``.
    :param sep: forwarded unchanged to ``H2OFrame._parse``.
    :param column_names: forwarded unchanged to ``H2OFrame._parse``.
    :param column_types: forwarded unchanged to ``H2OFrame._parse``.
    :param na_strings: forwarded unchanged to ``H2OFrame._parse``.
    :return: whatever ``H2OFrame._parse`` returns.
    """
    imported_key = h2o.lazy_import(path)
    parse_args = (destination_frame, header, sep,
                  column_names, column_types, na_strings)
    return H2OFrame._parse(imported_key, *parse_args)
def h2olazy_import():
    """
    Python API test: h2o.lazy_import(path)

    The result of lazily importing prostate_cat.csv must be of type list.
    """
    relative_path = "smalldata/prostate/prostate_cat.csv"
    located = pyunit_utils.locate(relative_path)
    training_data = h2o.lazy_import(located)
    assert_is_type(training_data, list)