# Shared imports for the test functions below (the tests assume a running H2O cluster;
# `pyunit_utils` comes from the h2o-3 test suite and is importable inside the repo checkout).
import os
import random
import re
import sys
import time
from collections import defaultdict

import pandas as pd
from pandas.testing import assert_frame_equal

import h2o
from h2o.estimators.aggregator import H2OAggregatorEstimator
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.isolation_forest import H2OIsolationForestEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.exceptions import H2OResponseError, H2OValueError
from h2o.expr import ExprNode
from h2o.frame import H2OFrame
from h2o.tree import H2OTree
from h2o.utils.compatibility import viewvalues
from h2o.utils.typechecks import assert_is_type
from tests import pyunit_utils

NTESTROWS = 200  # assumed module-level constant used by random_dataset(response_type) below


def concat():
    df1 = h2o.create_frame(integer_fraction=1, binary_fraction=0, categorical_fraction=0, seed=1)
    df2 = h2o.create_frame(integer_fraction=1, binary_fraction=0, categorical_fraction=0, seed=2)
    df3 = h2o.create_frame(integer_fraction=1, binary_fraction=0, categorical_fraction=0, seed=3)
    print(df1)
    print(df2)
    print(df3)

    # Frame to Frame concat (column-wise)
    df123 = df1.concat([df2, df3])
    rows, cols = df123.dim
    print(rows, cols)
    print(df123)
    assert rows == 10000 and cols == 30, "unexpected dimensions in column concatenation for a Frame"

    # Frame to Frame concat (row-wise)
    df123_row = df1.concat([df2, df3], axis=0)
    rows2, cols2 = df123_row.dim
    print(rows2, cols2)
    print(df123_row)
    assert rows2 == 30000 and cols2 == 10, "unexpected dimensions in row concatenation for a Frame"

    # Frame to Vec concat (column-wise)
    yy = df2[0]
    zz = df3[0]
    hdf = df1.concat([yy, zz])
    rows3, cols3 = hdf.dim
    print(rows3, cols3)
    print(hdf)
    assert rows3 == 10000 and cols3 == 12, "unexpected dimensions in Frame to Vec concatenation"

    # Vec to Vec concat (column-wise)
    xx = df1[0]
    yy = df2[0]
    zz = df3[0]
    hdf2 = xx.concat([yy, zz])
    rows4, cols4 = hdf2.dim
    print(rows4, cols4)
    print(hdf2)
    assert rows4 == 10000 and cols4 == 3, "unexpected dimensions in Vec to Vec concatenation"

    # Frame to Vec concat (row-wise)
    yy = df2[0, :]
    zz = df3[0, :]
    hdf3 = df1.concat([yy, zz], axis=0)
    rows5, cols5 = hdf3.dim
    print(rows5, cols5)
    print(hdf3)
    assert rows5 == 10002 and cols5 == 10, "unexpected dimensions in Frame to Vec concatenation"

    # Vec to Vec concat (row-wise)
    xx = df1[0, :]
    yy = df2[0, :]
    zz = df3[0, :]
    hdf4 = xx.concat([yy, zz], axis=0)
    rows6, cols6 = hdf4.dim
    print(rows6, cols6)
    print(hdf4)
    assert rows6 == 3 and cols6 == 10, "unexpected dimensions in Vec to Vec concatenation"
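
# In the h2o-3 pyunit suite each test file ends with a small harness trailer that runs the
# test either standalone or inside the test runner. A sketch of that convention for the
# `concat` test above (keeping only the standalone branch so this combined listing stays
# import-safe):
if __name__ == "__main__":
    pyunit_utils.standalone_test(concat)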
def check_big_merge():
    h2o.remove_all()
    nrow = 1000000
    ncol = 2
    iRange = 100000
    frame1 = h2o.create_frame(rows=nrow, cols=ncol, integer_fraction=1, seed=12345,
                              integer_range=iRange, missing_fraction=0.0)
    frame2 = h2o.create_frame(rows=nrow, cols=ncol, integer_fraction=1, seed=54321,
                              integer_range=iRange, missing_fraction=0.0)
    frame1.set_names(["C1", "C2"])
    frame2.set_names(["C1", "C3"])

    mergedExact = frame1.merge(frame2, by_x=["C1"], by_y=["C1"], all_x=False, all_y=False)
    mergedLeft = frame1.merge(frame2, by_x=["C1"], by_y=["C1"], all_x=True)
    assert mergedExact.nrow < mergedLeft.nrow, "expected the exact merge to have fewer rows than the left merge"
def pubdev_6304():
    fractions = dict()
    fractions["real_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["categorical_fraction"] = 1
    fractions["integer_fraction"] = 0
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0

    # this used to get an error message but should succeed: factors is within the limit
    try:
        traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False,
                                     factors=9999999, seed=12345, **fractions)
    except Exception as ex:
        sys.exit(1)

    # this should get an error message: factors exceeds the 10,000,000 limit
    try:
        traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False,
                                     factors=19999999, seed=12345, **fractions)
        sys.exit(1)  # should have thrown an error
    except Exception as ex:  # expect an error here
        print(ex)
        if 'Number of factors must be <= 10,000,000' in ex.args[0].dev_msg:
            sys.exit(0)  # correct error message
        else:
            sys.exit(1)  # something else is wrong.
def test_parser_svmlight_column_skip():
    # generate a big frame with all datatypes and save it to svmlight
    nrow = 10000
    ncol = 50
    seed = 12345
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=0.5, integer_fraction=0.5,
                          missing_fraction=0.2, has_response=False, seed=seed)
    f2 = h2o.create_frame(rows=nrow, cols=1, real_fraction=1, integer_fraction=0,
                          missing_fraction=0, has_response=False, seed=seed)
    f2.set_name(0, "target")
    f1 = f2.cbind(f1)

    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'out.svm')
    pyunit_utils.write_H2OFrame_2_SVMLight(savefilenamewithpath, f1)  # write h2o frame to svm format

    skip_all = list(range(ncol))
    skip_even = list(range(0, ncol, 2))

    try:
        loadFileSkipAll = h2o.upload_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    try:
        importFileSkipSome = h2o.import_file(savefilenamewithpath, skipped_columns=skip_even)
        sys.exit(1)  # should have failed here
    except:
        pass

    # check for correct parsing only
    checkCorrectSkips(savefilenamewithpath, f1)
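
# The svmlight test above calls a local `checkCorrectSkips` helper that is defined elsewhere
# in its test file. A minimal sketch of such a helper is below; the name and signature come
# from the call site, while the body is an assumption (re-parse the file without skips and
# verify the shape survives the svmlight round trip), not the actual h2o-3 implementation.
def checkCorrectSkips(svm_file_path, original_frame):
    parsed = h2o.import_file(svm_file_path)
    assert parsed.nrow == original_frame.nrow, \
        "expected {0} rows but parsed {1}".format(original_frame.nrow, parsed.nrow)
    assert parsed.ncol == original_frame.ncol, \
        "expected {0} cols but parsed {1}".format(original_frame.ncol, parsed.ncol)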
def word2vec_export():
    print("###### WORD2VEC ######")
    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    frame = words.cbind(embeddings)
    model = H2OWord2vecEstimator(pre_trained=frame)
    model.train(training_frame=frame)
    expect_error(model.download_pojo, model="Word2Vec", format="POJO")
    model.download_mojo(path=RESULT_DIR)
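
# `expect_error` and `RESULT_DIR` come from the surrounding test module. A minimal sketch
# based on the call site is below; the directory value and the helper body are assumptions,
# not the actual h2o-3 code. The flag-based structure avoids catching the final assert itself.
RESULT_DIR = "results"  # assumed output directory for downloaded MOJOs

def expect_error(action, model, format):
    # `action` is a bound method (e.g. download_pojo) that should raise, because the given
    # model type does not support the given export format
    raised = False
    try:
        action()
    except Exception as e:
        raised = True
        print("Expected failure: %s" % e)
    assert raised, "%s export to %s should have failed" % (model, format)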
def random_dataset(nrow, ncol, realFrac=0.4, intFrac=0.3, enumFrac=0.3, factorR=10, integerR=100,
                   responseFactor=1, misFrac=0.01, randSeed=None):
    fractions = dict()
    if (ncol == 1) and (realFrac >= 1.0):
        fractions["real_fraction"] = 1  # Right now we are dropping string columns, so no point in having them.
        fractions["categorical_fraction"] = 0
        fractions["integer_fraction"] = 0
        fractions["time_fraction"] = 0
        fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
        fractions["binary_fraction"] = 0
        return h2o.create_frame(rows=nrow, cols=ncol, missing_fraction=misFrac, has_response=True,
                                response_factors=responseFactor, integer_range=integerR,
                                seed=randSeed, **fractions)

    real_part = pyunit_utils.random_dataset_real_only(nrow, int(realFrac * ncol), misFrac=misFrac, randSeed=randSeed)
    cnames = ['c_' + str(ind) for ind in range(real_part.ncol)]
    real_part.set_names(cnames)
    enumFrac = enumFrac + (1 - realFrac) / 2
    intFrac = 1 - enumFrac
    fractions["real_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["categorical_fraction"] = enumFrac
    fractions["integer_fraction"] = intFrac
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0
    df = h2o.create_frame(rows=nrow, cols=(ncol - real_part.ncol), missing_fraction=misFrac, has_response=True,
                          response_factors=responseFactor, integer_range=integerR, seed=randSeed, **fractions)
    return real_part.cbind(df)
def word2vec_get_model():
    print("Test retrieving a word2vec model by a key")

    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)
    assert model, "Model was not retrieved"
def word2vec_to_frame():
    print("Test converting a word2vec model to a Frame")

    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    w2v_frame = w2v_model.to_frame()
    word_embeddings.names = w2v_frame.names
    assert word_embeddings.as_data_frame().equals(w2v_frame.as_data_frame()), \
        "Source and generated embeddings do not match"
def create_frame_test():
    """Test `h2o.create_frame()`."""
    for _ in range(10):
        r = random.randint(1, 1000)
        c = random.randint(1, 1000)
        frame = h2o.create_frame(rows=r, cols=c)
        assert frame.nrow == r and frame.ncol == c, \
            "Expected {0} rows and {1} cols, but got {2} rows and {3} cols.".format(r, c, frame.nrow, frame.ncol)

    def assert_coltypes(frame, freal, fenum, fint, fbin, ftime, fstring):
        # The server does not report columns as binary -- instead they are integer.
        fint += fbin
        fbin = 0
        type_counts = defaultdict(int)
        for ft in viewvalues(frame.types):
            type_counts[ft] += 1
        print("Created table with column counts: {%s}" % ", ".join("%s: %d" % t for t in type_counts.items()))
        for ct in ["real", "enum", "int", "time", "string"]:
            assert abs(type_counts[ct] - locals()["f" + ct] * frame.ncol) < 1, \
                "Wrong column count of type %s: %d" % (ct, type_counts[ct])

    f1 = h2o.create_frame(rows=10, cols=1000, real_fraction=1)
    assert_coltypes(f1, 1, 0, 0, 0, 0, 0)

    f2 = h2o.create_frame(rows=10, cols=1000, binary_fraction=0.5, time_fraction=0.5)
    assert_coltypes(f2, 0, 0, 0, 0.5, 0.5, 0)

    f3 = h2o.create_frame(rows=10, cols=1000, string_fraction=0.2, time_fraction=0.8)
    assert_coltypes(f3, 0, 0, 0, 0, 0.8, 0.2)

    f4 = h2o.create_frame(rows=10, cols=1000, real_fraction=0.9)
    assert_coltypes(f4, 0.9, 0.04, 0.04, 0.02, 0, 0)

    f5 = h2o.create_frame(rows=2, cols=1000, integer_fraction=0.75000000000001, string_fraction=0.25000000000001)
    assert_coltypes(f5, 0, 0, 0.75, 0, 0, 0.25)

    try:
        h2o.create_frame(rows=10, cols=1000, real_fraction=0.1, categorical_fraction=0.1, integer_fraction=0.1,
                         binary_fraction=0.1, time_fraction=0.1, string_fraction=0.1)
        assert False, "The data frame should not have been created!"
    except H2OValueError:
        pass

    try:
        h2o.create_frame(rows=10, cols=1000, real_fraction=0.5, categorical_fraction=0.5, integer_fraction=0.1)
        assert False, "The data frame should not have been created!"
    except H2OValueError:
        pass
def test_transform():
    valid_values = ["none", "standardize", "normalize", "demean", "descale"]
    df = h2o.create_frame(rows=100, cols=4, categorical_fraction=0.4, integer_fraction=0, binary_fraction=0,
                          real_range=100, integer_range=100, missing_fraction=0, seed=1234)
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    try:
        for val in valid_values:
            model.transform = val
            model.train(training_frame=df)
    except Exception:
        assert False, "Aggregator model should be able to process all valid transform values"

    # Try with an invalid value; use a flag so the final assert is not swallowed by the except
    raised = False
    try:
        model = H2OAggregatorEstimator(target_num_exemplars=5, transform="some_invalid_value")
    except Exception:
        raised = True
    assert raised, "Passing invalid value of transform should throw an error"
def test_parser_svmlight_column_skip_not_supported():
    print("Test that calls fail if skipped_columns is passed with an svm file.")
    # generate a frame
    nrow = 10
    ncol = 10
    seed = 12345
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=0.5, integer_fraction=0.5,
                          missing_fraction=0, has_response=False, seed=seed)
    results_path = pyunit_utils.locate("results")
    savefilenamewithpath = os.path.join(results_path, 'out.svm')
    pyunit_utils.write_H2OFrame_2_SVMLight(savefilenamewithpath, f1)  # write h2o frame to svm format

    try:
        print("Test upload SVM file. "
              "Expected result is Java exception error: skipped_columns not supported for AVRO and SVMlight")
        h2o.upload_file(savefilenamewithpath, skipped_columns=[5])
        assert False, "Test should have thrown an exception because the skipped_columns parameter is present"
    except H2OResponseError as e:
        assert "skipped_columns are not supported" in str(e.args[0].exception_msg), "Exception message is different"
        print("Test OK, finished with H2OResponseError")

    try:
        print("Test import SVM file. "
              "Expected result is Java exception error: skipped_columns not supported for AVRO and SVMlight")
        h2o.import_file(savefilenamewithpath, skipped_columns=[5])
        assert False, "Test should have thrown an exception because the skipped_columns parameter is present"
    except H2OResponseError as e:
        assert "skipped_columns are not supported" in e.args[0].exception_msg, "Exception message is different"
        print("Test OK, finished with H2OResponseError")
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000, 15001), 1)[0]
    dataset_params['cols'] = random.sample(range(10, 21), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    distribution = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame.fromPython([random.randint(1, 1000) for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(range(1, 21), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(range(1, 11), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(range(1, 11), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(range(2, 21), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(range(2, 1025), 1)[0]
    if random.randint(0, 1):
        params['learn_rate'] = random.random()
    params['distribution'] = distribution
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="gbm", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"

    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)

    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"

    allCols = list(myX)
    allCols.append(myY)

    airlines = airlines[allCols]

    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows, cols=num_new_features, categorical_fraction=0,
                                   seed=1234, seed_for_column_types=1234)

    new_rows = nrows - airlines.nrow
    if (nrows > 0):
        extra_rows = airlines[0:nrows, :]
        airlines = airlines.rbind(extra_rows)

    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)

    return full_data
def random_dataset(response_type, verbose=True, NTESTROWS=200, missing_fraction=0.0, seed=None):
    """Create and return a random dataset."""
    if verbose:
        print("\nCreating a dataset for a %s problem:" % response_type)
    fractions = {
        'real_fraction': 0.925363793458878,
        'categorical_fraction': 0.9625390964218535,
        'integer_fraction': 0.5693588274554572,
        'time_fraction': 0.19987260017514685,
        'string_fraction': 0.893090913162827,
        'binary_fraction': 0.12909731789008272
    }
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] /= 3
    fractions["time_fraction"] /= 2

    sum_fractions = sum(fractions.values())
    for k in fractions:
        fractions[k] /= sum_fractions
    response_factors = 1
    df = h2o.create_frame(rows=25000 + NTESTROWS, cols=20, missing_fraction=missing_fraction,
                          has_response=True, response_factors=response_factors, positive_response=True,
                          factors=10, seed=seed, **fractions)
    return df
def test_binary():
    df = h2o.create_frame(
        rows=1000,
        cols=10,
        categorical_fraction=0.6,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0.1,
        factors=5,
        seed=1234
    )
    params = {
        "target_num_exemplars": 100,
        "rel_tol_num_exemplars": 0.5,
        "categorical_encoding": "binary",
        "transform": "normalize"
    }
    agg = H2OAggregatorEstimator(**params)
    agg.train(training_frame=df)
    assert agg.aggregated_frame is not None, "Trained model should produce a non-empty aggregated frame"
    assert is_consistent(df.nrows, agg.aggregated_frame), \
        "Exemplar counts should sum up to number of training rows"
    assert correct_num_exemplars(agg.aggregated_frame, **params), \
        "Generated number of exemplars should match target value"
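
# `is_consistent` and `correct_num_exemplars` are helpers defined elsewhere in the aggregator
# test module. Minimal sketches under two assumptions: the aggregated frame carries a `counts`
# column with the number of training rows each exemplar represents, and the exemplar count
# should land within `rel_tol_num_exemplars` of `target_num_exemplars`. The real helpers in
# h2o-3 may check more than this.
def is_consistent(n_training_rows, aggregated_frame):
    # the per-exemplar counts should account for every training row exactly once
    return aggregated_frame["counts"].sum() == n_training_rows

def correct_num_exemplars(aggregated_frame, target_num_exemplars, rel_tol_num_exemplars, **_):
    # the number of exemplars should be within the relative tolerance of the target
    return abs(aggregated_frame.nrow - target_num_exemplars) <= rel_tol_num_exemplars * target_num_exemplars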
def test_high_cardinality_eigen():
    df = h2o.create_frame(
        rows=10000,
        cols=10,
        categorical_fraction=0.6,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0,
        factors=10,
        seed=1234
    )
    autoencoder = H2OAutoEncoderEstimator(
        categorical_encoding="eigen",
        reproducible=True,
        hidden=[50, 30],
        epochs=5,
        seed=42
    )
    autoencoder.train(training_frame=df)

    mojo = pyunit_utils.download_mojo(autoencoder)
    autoencoder_mojo = h2o.import_mojo(mojo["mojo_zip_path"])

    preds_ae_h2o = autoencoder.predict(df)
    preds_ae_mojo = autoencoder_mojo.predict(df)
    assert_frame_equal(preds_ae_mojo.as_data_frame(), preds_ae_h2o.as_data_frame())
def test_show_time():
    df = h2o.H2OFrame.from_python(
        {"A": [1, 2, 3],
         "B": ["a", "a", "b"],
         "C": ["hello", "all", "world"],
         "D": ["12MAR2015:11:00:00", "13MAR2015:12:00:00", "14MAR2015:13:00:00"]},
        column_types={"A": "numeric", "B": "enum", "C": "string", "D": "time"}
    )
    out = df.__unicode__()
    print(out)
    assert "2015-03-12 11:00:00" in out
    assert "2015-03-13 12:00:00" in out
    assert "2015-03-14 13:00:00" in out

    df2 = h2o.create_frame(cols=6, rows=10, time_fraction=1, missing_fraction=0.1)
    out2 = df2.__unicode__()
    print(out2)
    assert "e+" not in out2
    assert "E+" not in out2
    lines = out2.splitlines()[2:-2]  # skip header (first 2 lines) + footer (last 2 lines)
    regex = re.compile(r"(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)")
    for l in lines:
        # columns are separated by (at least) two spaces; splitting on a single space
        # would break "YYYY-MM-DD HH:MM:SS" entries in half
        for entry in l.split("  "):
            entry = entry.strip()
            if entry == "":
                continue  # skip missing entries
            m = re.match(regex, entry)
            assert m is not None, "Failed to recognize time expression '%s'" % entry
            year = int(m.group(1))
            month = int(m.group(2))
            day = int(m.group(3))
            assert 1970 <= year <= 2020
            assert 1 <= month <= 12
            assert 1 <= day <= 31
def random_dataset(nrow, ncol, realFrac=0.4, intFrac=0.3, enumFrac=0.3, factorR=10, integerR=100,
                   responseFactor=1, misFrac=0.01, randSeed=None):
    fractions = dict()
    fractions["real_fraction"] = realFrac  # Right now we are dropping string columns, so no point in having them.
    fractions["categorical_fraction"] = enumFrac
    fractions["integer_fraction"] = intFrac
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0

    df = h2o.create_frame(rows=nrow, cols=ncol, missing_fraction=misFrac, has_response=True,
                          response_factors=responseFactor, factors=factorR, integer_range=integerR,
                          real_range=integerR, seed=randSeed, **fractions)
    print(df.types)
    return df
def random_dataset(response_type, verbose=True):
    """Create and return a random dataset."""
    if verbose:
        print("\nCreating a dataset for a %s problem:" % response_type)
    fractions = {k + "_fraction": random.random() for k in "real categorical integer time string binary".split()}
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] /= 3
    fractions["time_fraction"] /= 2

    sum_fractions = sum(fractions.values())
    for k in fractions:
        fractions[k] /= sum_fractions
    response_factors = (1 if response_type == "regression" else
                        2 if response_type == "binomial" else
                        random.randint(3, 10))
    df = h2o.create_frame(rows=random.randint(15000, 25000) + NTESTROWS, cols=random.randint(3, 20),
                          missing_fraction=random.uniform(0, 0.05),
                          has_response=True, response_factors=response_factors, positive_response=True,
                          factors=10, **fractions)
    if verbose:
        print()
        df.show()
    return df
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    realNcol = train.ncol - 1
    params['k'] = random.sample(list(range(1, min(realNcol, train.nrow))), 1)[0]
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def test_cat_encoding():
    valid_values = [
        "auto",
        "enum",
        "one_hot_internal",
        "one_hot_explicit",
        "binary",
        "eigen",
        "label_encoder",
        "enum_limited",
        # "sort_by_response" TODO: This is invalid parameter, remove it
    ]
    df = h2o.create_frame(
        rows=100,
        cols=4,
        categorical_fraction=0.4,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0,
        seed=1234
    )
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    try:
        for val in valid_values:
            model.categorical_encoding = val
            model.train(training_frame=df)
    except Exception:
        assert False, "Aggregator model should be able to process all valid categorical_encoding values"

    # Try with an invalid value; use a flag so the final assert is not swallowed by the except
    raised = False
    try:
        model = H2OAggregatorEstimator(target_num_exemplars=5, categorical_encoding="some_invalid_value")
    except Exception:
        raised = True
    assert raised, "Passing invalid value of categorical_encoding should throw an error"
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if family == 'binomial':
        dataset_params['response_factors'] = 2
    elif family == 'gaussian':
        dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if family == 'binomial':
        train['response'] = train['response'].asfactor()

    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0, 1):
            params['tweedie_variance_power'] = round(random.random() + 1, 6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv.
    # Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1, categorical_fraction=frac1,
                          integer_fraction=frac1, binary_fraction=frac1, time_fraction=frac1,
                          string_fraction=frac2, missing_fraction=0.1, has_response=False, seed=seed)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # load in whole dataset
    skip_all = list(range(f1.ncol))
    skip_even = list(range(0, f1.ncol, 2))
    skip_odd = list(range(1, f1.ncol, 2))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        loadFileSkipAll = h2o.upload_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
def genMergedSeparaData(MergedRows, intUpper, intLow, doubleUpper, doubleLow, bProb):
    # first generate the single column that will be the merge key
    merged = h2o.create_frame(rows=MergedRows, cols=3, integer_fraction=1, integer_range=intUpper - intLow)
    print("Done, save with Flow")
def create_frame_test(ip, port):
    # REALLY basic test TODO: add more checks
    r = random.randint(1, 1000)
    c = random.randint(1, 1000)

    frame = h2o.create_frame(rows=r, cols=c)
    assert frame.nrow == r and frame.ncol == c, \
        "Expected {0} rows and {1} cols, but got {2} rows and {3} cols.".format(r, c, frame.nrow, frame.ncol)
def pubdev_5112():
    words = h2o.create_frame(rows=10, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=10, cols=100, real_fraction=1.0, missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator.from_external(external=word_embeddings)
    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)
    assert model, "Word2Vec model without a training frame was not retrieved"

    # Only the leading column should be of type String
    leading_column_string_error = False
    try:
        string_frame = h2o.create_frame(rows=10, cols=10, real_fraction=1.0, missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        leading_column_string_error = True
    assert leading_column_string_error, "Word2Vec pre-trained model should be checked for the leading column" \
                                        " to be string"

    # Other columns should be non-string type
    multiple_string_columns_error = False
    try:
        string_frame = h2o.create_frame(rows=10, cols=10, string_fraction=1.0, missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        multiple_string_columns_error = True
    assert multiple_string_columns_error, "Word2Vec pre-trained model should be checked for columns not to have a" \
                                          " String type except for the leading column"
def pubdev_5180():
    frame = h2o.create_frame(binary_fraction=1, binary_ones_fraction=0.5, missing_fraction=0, rows=1, cols=1)
    exp_str = ExprNode("assign", 123456789123456789123456789, frame)._get_ast_str()
    assert exp_str.find('123456789123456789L') == -1
def isax():
    df = h2o.create_frame(rows=1, cols=256, real_fraction=1.0, missing_fraction=0.0, seed=123)
    df2 = df.cumsum(axis=1)
    res = df2.isax(num_words=10, max_cardinality=10)
    res.show()
    answer = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10"
    assert answer == res[0, 0], "expected isax index to be " + answer + " but got " + res[0, 0] + " instead."
    h2o.remove(df)
    h2o.remove(df2)
    h2o.remove(res)
def javapredict_dynamic_data():
    dataset_params = {}
    dataset_params['rows'] = 13183
    dataset_params['cols'] = 13
    dataset_params['categorical_fraction'] = 0.4
    dataset_params['integer_fraction'] = 0.3
    dataset_params['missing_fraction'] = 0.27539154084819495
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = 819
    print("Dataset parameters: {0}".format(dataset_params))

    problem = 2
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if problem == 1:  # binomial
        dataset_params['response_factors'] = 2
    elif problem == 0:  # regression
        dataset_params['response_factors'] = 1
    else:  # multinomial
        dataset_params['response_factors'] = 16

    train = h2o.create_frame(**dataset_params)
    if problem in (1, 2):  # binomial or multinomial need a categorical response
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    params = {}
    params['nbins'] = 5
    params['min_rows'] = 7
    params['mtries'] = 4
    params['sample_rate'] = 0.7867986759373544
    params['seed'] = 1304644573760597606
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def irf_tree_Test():
    cat_frame = h2o.create_frame(cols=10, categorical_fraction=1, seed=42)
    # check all columns are categorical
    assert set(cat_frame.types.values()) == set(['enum'])

    iso_model = H2OIsolationForestEstimator(seed=42)
    iso_model.train(training_frame=cat_frame)

    tree = H2OTree(iso_model, 5)
    check_tree(tree, 5, None)
    print(tree)
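
# `check_tree` is a helper defined in the tree-API test module. A minimal sketch under
# assumptions: it verifies the tree is non-empty and that the requested tree number and
# tree class round-trip through the H2OTree API; the real helper may be stricter.
def check_tree(tree, tree_number, tree_class=None):
    assert tree is not None
    assert len(tree) > 0, "The tree should have at least one node"
    assert tree.tree_number == tree_number, "Wrong tree number"
    assert tree.tree_class == tree_class, "Wrong tree class"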
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    problem = random.sample(list(range(0, 3)), 1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if problem == 1:  # binomial
        dataset_params['response_factors'] = 2
    elif problem == 0:  # regression
        dataset_params['response_factors'] = 1
    else:  # multinomial
        dataset_params['response_factors'] = random.randint(3, 100)

    train = h2o.create_frame(**dataset_params)
    if problem in (1, 2):  # binomial or multinomial need a categorical response
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(list(range(1, 21)), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(list(range(2, 21)), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(list(range(2, 1025)), 1)[0]
    if random.randint(0, 1):
        params['mtries'] = random.sample(list(range(1, dataset_params['cols'] + 1)), 1)[0]
    if random.randint(0, 1):
        params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def h2o_H2OFrame_concat():
    """
    Python API test: h2o.frame.H2OFrame.concat(frames, axis=1)

    Copied from pyunit_concat.py
    """
    df1 = h2o.create_frame(integer_fraction=1, binary_fraction=0, categorical_fraction=0, seed=1)
    df2 = h2o.create_frame(integer_fraction=1, binary_fraction=0, categorical_fraction=0, seed=2)
    df3 = h2o.create_frame(integer_fraction=1, binary_fraction=0, categorical_fraction=0, seed=3)

    # Frame to Frame concat (column-wise)
    df123 = df1.concat([df2, df3])
    assert_is_type(df123, H2OFrame)  # check return type
    assert df123.shape == (df1.nrows, df1.ncols + df2.ncols + df3.ncols), \
        "h2o.H2OFrame.concat command is not working."

    # Frame to Frame concat (row-wise)
    df123_row = df1.concat([df2, df3], axis=0)
    assert_is_type(df123_row, H2OFrame)  # check return type
    assert df123_row.shape == (df1.nrows + df2.nrows + df3.nrows, df1.ncols), \
        "h2o.H2OFrame.concat command is not working."
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    dataset_params['response_factors'] = random.randint(3, 100)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "nb_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0, 1):
        params['laplace'] = random.uniform(0, 11)
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def h2ocreate_frame():
    """
    Python API test: h2o.create_frame(frame_id=None, rows=10000, cols=10, randomize=True, real_fraction=None,
                                      categorical_fraction=None, integer_fraction=None, binary_fraction=None,
                                      time_fraction=None, string_fraction=None, value=0, real_range=100,
                                      factors=100, integer_range=100, binary_ones_fraction=0.02,
                                      missing_fraction=0.01, has_response=False, response_factors=2,
                                      positive_response=False, seed=None, seed_for_column_types=None)

    Copied from pyunit_NOPASS_javapredict_dynamic_data_paramsDL.py
    """
    try:
        # Generate random dataset
        dataset_params = {}
        dataset_params['rows'] = random.sample(list(range(50, 150)), 1)[0]
        dataset_params['cols'] = random.sample(list(range(3, 6)), 1)[0]
        dataset_params['categorical_fraction'] = round(random.random(), 1)
        left_over = (1 - dataset_params['categorical_fraction'])
        dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
        if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
            if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
                dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
            else:
                dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
        dataset_params['missing_fraction'] = random.uniform(0, 0.5)
        dataset_params['has_response'] = False
        dataset_params['randomize'] = True
        dataset_params['factors'] = random.randint(2, 5)
        print("Dataset parameters: {0}".format(dataset_params))

        distribution = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma'], 1)[0]
        if distribution == 'bernoulli':
            dataset_params['response_factors'] = 2
        elif distribution == 'gaussian':
            dataset_params['response_factors'] = 1
        elif distribution == 'multinomial':
            dataset_params['response_factors'] = random.randint(3, 5)
        else:
            dataset_params['has_response'] = False
        print("Distribution: {0}".format(distribution))

        train = h2o.create_frame(**dataset_params)
        assert_is_type(train, H2OFrame)
        assert train.ncol == dataset_params['cols'], "h2o.create_frame() created a frame with the wrong column number."
        assert train.nrow == dataset_params['rows'], "h2o.create_frame() created a frame with the wrong row number."
    except Exception as e:
        assert False, "h2o.create_frame() command is not working."
def whichmaxmin():
    # Make H2O frame
    f1 = h2o.create_frame(rows=10000, cols=100, categorical_fraction=0, missing_fraction=0, seed=1234)

    # Make comparable pandas frame
    f2 = f1.as_data_frame(use_pandas=True)

    #############################################################
    # Col wise max
    which_max_col = f1.idxmax()
    which_max_col = which_max_col.transpose()
    which_max_col_pd = f2.idxmax(axis=0)
    which_max_col_pd = h2o.H2OFrame(pd.DataFrame(which_max_col_pd, columns=["C1"]))
    diff_max_col_idx = which_max_col - which_max_col_pd
    assert diff_max_col_idx.sum() == 0

    # Col wise min
    which_min_col = f1.idxmin()
    which_min_col = which_min_col.transpose()
    which_min_col_pd = f2.idxmin(axis=0)
    which_min_col_pd = h2o.H2OFrame(pd.DataFrame(which_min_col_pd, columns=["C1"]))
    diff_min_col_idx = which_min_col - which_min_col_pd
    assert diff_min_col_idx.sum() == 0

    #############################################################
    # Row wise max
    which_max_row = f1.idxmax(axis=1)
    which_max_row_pd = f2.idxmax(axis=1)
    which_max_row_pd = h2o.H2OFrame(pd.DataFrame(which_max_row_pd, columns=["C1"]))
    # Had to clean up before comparison (indexing was +1)
    which_max_row_pd = which_max_row_pd.ascharacter().lstrip("C").asnumeric() - 1
    diff_max_row_idx = which_max_row - which_max_row_pd
    assert diff_max_row_idx.sum() == 0

    # Row wise min
    which_min_row = f1.idxmin(axis=1)
    which_min_row_pd = f2.idxmin(axis=1)
    which_min_row_pd = h2o.H2OFrame(pd.DataFrame(which_min_row_pd, columns=["C1"]))
    # Had to clean up before comparison (indexing was +1)
    which_min_row_pd = which_min_row_pd.ascharacter().lstrip("C").asnumeric() - 1
    diff_min_row_idx = which_min_row - which_min_row_pd
    assert diff_min_row_idx.sum() == 0
def generate_models(n_models, n_rows, n_cols, n_rows_per_model, n_trees, max_depth, target_dir):
    target_dir = os.path.abspath(target_dir)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    assert n_rows_per_model <= n_rows, "Not enough rows to train any model"
    assert n_rows <= n_rows_per_model * n_models, "Too many rows"
    assert os.path.isdir(target_dir), "%s is not a directory" % target_dir
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    # Step 1: generate the dataset.
    df = h2o.create_frame(rows=n_rows, cols=n_cols, missing_fraction=0, integer_fraction=1,
                          has_response=True, response_factors=1, positive_response=True)
    assert df.names == ["response"] + ["C%d" % n for n in range(1, n_cols + 1)]
    assert df.types["response"] == "real"
    assert all(v == "int" for k, v in df.types.items() if k != "response")
    print("Dataset created (%d x %d).\n" % (df.nrow, df.ncol))

    # Step 2: train and save the models
    for i in range(n_models):
        estimator = random.choice([H2ORandomForestEstimator, H2OGradientBoostingEstimator])
        start_row = random.randint(0, n_rows - n_rows_per_model)
        end_row = start_row + n_rows_per_model

        # Step 2.a: train a model on a random subset of the frame `df`
        time0 = time.time()
        print("%-4d %-30s" % (i + 1, estimator.__name__), end="")
        model = estimator(ntrees=n_trees, max_depth=max_depth)
        model.train(training_frame=df[start_row:end_row, :])
        print(" %.3fs" % (time.time() - time0), end="")

        # Step 2.b: save the model to a file
        model_file = h2o.api("GET /3/Models/%s/data" % model.model_id, save_to=target_dir)
        assert os.path.exists(model_file)
        simple_file = model_file[len(target_dir) + 1:] if model_file.startswith(target_dir + "/") else model_file
        print(" => %s (%d bytes)" % (simple_file, os.stat(model_file).st_size))

        # Step 2.c: remove the model from the cluster
        h2o.remove(model)
def h2o_H2OFrame_isax():
    """
    Python API test: h2o.frame.H2OFrame.isax(num_words, max_cardinality, optimize_card=False)

    Copied from pyunit_isax.py
    """
    df = h2o.create_frame(rows=1, cols=256, real_fraction=1.0, missing_fraction=0.0, seed=123)
    df2 = df.cumsum(axis=1)
    res = df2.isax(num_words=10, max_cardinality=10, optimize_card=False)
    res.show()
    answer = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10"
    assert_is_type(res, H2OFrame)  # check return type
    assert answer == res[0, 0], "expected isax index to be " + answer + " but got " + res[0, 0] + " instead."
def fillna():
    NUM_COLS = 3
    df = h2o.create_frame(rows=1000000,
                          cols=NUM_COLS,
                          real_fraction=1.0,
                          real_range=100,
                          missing_fraction=0.2,
                          seed=123)
    # Pandas comparison
    pdf = df.as_data_frame()
    filledpdf = pdf.fillna(method="ffill", axis=0, limit=3)
    filledpdfh2o = h2o.H2OFrame(filledpdf, column_types=["float"] * NUM_COLS)

    filled = df.fillna(method="forward", axis=0, maxlen=3)

    assert abs((filled - filledpdfh2o).sum(return_frame=False)) < 1e-11, \
        "Difference between Pandas and H2O fillna too high"
def pyunit_unique():
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    uniques = iris[4].unique()
    rows, cols = uniques.dim
    assert rows == 3 and cols == 1, "Expected 3 rows and 1 column, but got {0} rows and {1} column".format(rows, cols)
    assert "Iris-setosa" in uniques[0], "Expected Iris-setosa to be in the set of unique species, but it wasn't"
    assert "Iris-virginica" in uniques[0], "Expected Iris-virginica to be in the set of unique species, but it wasn't"
    assert "Iris-versicolor" in uniques[0], "Expected Iris-versicolor to be in the set of unique species, but it wasn't"

    fr = h2o.create_frame(rows=5, cols=1, time_fraction=1)
    assert fr.type(0) == "time"
    uf = fr.unique()
    assert uf.type(0) == "time"
    uf.refresh()
    assert uf.type(0) == "time"
def sort():
    df = h2o.create_frame(rows=10,
                          cols=3,
                          factors=10,
                          categorical_fraction=1.0 / 3,
                          time_fraction=1.0 / 3,
                          real_fraction=1.0 / 3,
                          real_range=100,
                          missing_fraction=0.0,
                          seed=123)
    df1 = df.sort("C1")
    assert df1[0, 0] == 433225652950   # 1983-09-24 04:27:32
    assert df1[9, 0] == 1532907020199  # 2018-07-29 23:30:20
    df2 = df.sort("C2")
    assert df2[0, 1] == "c1.l1"
    assert df2[9, 1] == "c1.l9"
    h2o.remove_all()
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000, 15001), 1)[0]
    dataset_params['cols'] = random.sample(range(10, 21), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "kmeans_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['k'] = random.sample(range(1, 10), 1)[0]
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(range(1, 1000), 1)[0]
    if random.randint(0, 1):
        params['standardize'] = random.sample([True, False], 1)[0]
    if random.randint(0, 1):
        params['seed'] = random.sample(range(1, 1000), 1)[0]
    if random.randint(0, 1):
        params['init'] = random.sample(['Random', 'PlusPlus', 'Furthest'], 1)[0]
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="kmeans", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)