def group_by():
    """Smoke-test the chained group-by aggregations on the iris data set.

    For each NA-handling mode ("ignore", "rm", "all") the iris frame is
    grouped by its "class" column, the aggregation chain count/min/max/
    mean/var/sd/ss/sum is applied, and the resulting frame is printed.
    Nothing is asserted; the test passes when no call raises.
    """
    iris_frame = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    # The pandas copy and the column-name slice are built but not used below.
    iris_pandas = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))
    leading_columns = iris_frame.col_names[0:4]

    print("Running smoke test")
    for na_mode in ("ignore", "rm", "all"):
        by_class = iris_frame.group_by("class")
        (by_class
            .count(na=na_mode)
            .min(na=na_mode)
            .max(na=na_mode)
            .mean(na=na_mode)
            .var(na=na_mode)
            .sd(na=na_mode)
            .ss(na=na_mode)
            .sum(na=na_mode))
        print(by_class.get_frame())
def getModelKmeans(ip,port):
    """Fit k-means for k = 2..6 with H2O and scikit-learn on benign.csv.

    For every k the H2O model is shown, fetched again through
    ``h2o.getModel`` using its key and shown once more, then a scikit-learn
    ``KMeans`` is fitted on the mean-imputed data and its centers printed.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    benign_frame = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    raw_matrix = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")

    # Replace missing values by the column mean before handing data to scikit.
    benign_matrix = Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(raw_matrix)

    for cluster_count in range(2, 7):
        h2o_kmeans = h2o.kmeans(x=benign_frame, k=cluster_count)
        h2o_kmeans.show()

        # TODO: implement h2o.getModel()
        fetched_model = h2o.getModel(h2o_kmeans._key)
        fetched_model.show()

        sci_kmeans = KMeans(n_clusters=cluster_count, init='k-means++', n_init=1)
        sci_kmeans.fit(benign_matrix)
        print("sckit centers")
        print(sci_kmeans.cluster_centers_)
def frame_slicing(ip,port):
    """Check the supported H2OFrame indexing forms against known cell values.

    Covers ``frame[int]``, ``frame[int, int]``, ``frame[int, slice]``,
    ``frame[slice, int]`` and ``frame[slice, slice]`` on the iris, prostate
    and airlines data sets. ``ip`` and ``port`` are not used in this body.
    """
    tolerance = 1e-10

    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    for frame in (iris, prostate, airlines):
        frame.show()

    # H2OFrame[int] (column slice)
    first_column = iris[0]
    assert abs(first_column[8,:] - 4.4) < tolerance, "incorrect values"

    # H2OFrame[int,int]
    single_cell = prostate[13, 3]
    assert abs(single_cell - 1) < tolerance, "incorrect values"

    # H2OFrame[int, slice]
    row_slice = airlines[12, 0:3]
    for col, expected in enumerate((1987, 10, 29)):
        assert abs(row_slice[0, col] - expected) < tolerance, \
            "incorrect values"

    # H2OFrame[slice, int]
    column_slice = iris[5:8, 1]
    for row, expected in enumerate((3.9, 3.4, 3.4)):
        assert abs(column_slice[row,:] - expected) < tolerance, "incorrect values"

    # H2OFrame[slice, slice]
    block = prostate[5:8, 0:3]
    for diag, expected in enumerate((6, 0, 61)):
        assert abs(block[diag, diag] - expected) < tolerance, "incorrect values"
def plot_test():
    """Train a bernoulli GBM on the airlines data and plot its ROC curves.

    ROC plots are requested for the training split, the validation split and
    a separately imported test set. Every plot call receives ``server=True``.
    """
    plot_options = {'server': True}

    airlines = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Random 80/20 split into training and validation rows.
    draw = airlines[0].runif()
    training = airlines[draw <= 0.8]
    validation = airlines[draw > 0.8]

    predictors = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    response = "IsDepDelayed"

    gbm_model = h2o.gbm(x=training[predictors],
                        y=training[response],
                        validation_x=validation[predictors],
                        validation_y=validation[response],
                        distribution="bernoulli",
                        ntrees=100,
                        max_depth=3,
                        learn_rate=0.01)

    # ROC for the training and validation sets
    gbm_model.plot(type="roc", train=True, **plot_options)
    gbm_model.plot(type="roc", valid=True, **plot_options)

    # ROC for the held-out test set
    held_out = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    test_performance = gbm_model.model_performance(held_out)
    test_performance.plot(type="roc", **plot_options)
def fiftycatGBM(ip,port):
    """Train a bernoulli GBM on 50_cattest_train.csv and score 50_cattest_test.csv.

    Per the original comments the training file carries only categories
    cat1..cat45 while the test file carries cat1..cat50. The test passes
    when training, prediction and the metric calls complete without raising.
    ``ip`` and ``port`` are not used in this body.
    """
    # Training data: response "y" is converted to a factor.
    training = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    training["y"] = training["y"].asfactor()

    gbm_model = h2o.gbm(x=training[["x1","x2"]],
                        y=training["y"],
                        distribution="bernoulli",
                        ntrees=10,
                        max_depth=5,
                        nbins=20)
    gbm_model.show()

    # Score the test data.
    scoring = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
    scored = gbm_model.predict(scoring)
    scored.show()

    # Confusion matrix and AUC are computed but not checked.
    metrics = gbm_model.model_performance(scoring)
    confusion = metrics.confusion_matrix()
    auc_value = metrics.auc()
def anomaly(ip, port):
    """Run an MNIST deep-learning autoencoder and score the test set with it.

    The response column (index 784) is dropped from both frames, a one-epoch
    Tanh autoencoder with a single 50-unit hidden layer is trained, and both
    ``anomaly`` and ``predict`` are invoked on the test frame. No assertions.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)
    print("Deep Learning Anomaly Detection MNIST")

    mnist_train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_test = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    response_index = 784
    pixel_columns = range(response_index)

    # Unsupervised: keep only the pixel columns (drops the digit label).
    mnist_train = mnist_train[pixel_columns]
    mnist_test = mnist_test[pixel_columns]

    # 1) Learn what is normal: autoencoder on the training pixels.
    autoencoder = h2o.deeplearning(x=mnist_train[pixel_columns],
                                   training_frame=mnist_train,
                                   activation="Tanh",
                                   autoencoder=True,
                                   hidden=[50],
                                   l1=1e-5,
                                   ignore_const_cols=False,
                                   epochs=1)

    # 2) Detect outliers: per-row reconstruction error on the test data.
    reconstruction_error = autoencoder.anomaly(mnist_test)

    # 3) Reconstruct the test data by passing it through the autoencoder.
    reconstruction = autoencoder.predict(mnist_test)
def iris_h2o_vs_sciKmeans(ip,port):
    """Compare H2O and scikit-learn k-means centers on iris from fixed seeds.

    Both implementations start from the same three user-supplied centers
    (H2O with ``standardize=False``), so the fitted centers are expected to
    agree coordinate by coordinate.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    :raises AssertionError: if any coordinate differs by 1e-10 or more.
    """
    h2o.init(ip, port)

    iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # Identical starting centers for both libraries.
    seeds = [[4.9, 3.0, 1.4, 0.2],
             [5.6, 2.5, 3.9, 1.1],
             [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(seeds)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(seeds), n_init=1)
    sci_km.fit(iris_sci)

    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # abs() makes the check two-sided: the signed difference alone
            # let any H2O value smaller than scikit's pass unconditionally.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def offsets_and_distributions(ip,port):
    """Train deep-learning models with an offset column for several distributions.

    Gamma, gaussian, poisson and tweedie models are fitted on the insurance
    data with ``offset_column="offset"`` and each is used to predict on the
    same frame. Nothing is asserted. ``ip`` and ``port`` are not used here.
    """
    # cars: prepared with a constant 0.5 offset column "x1"; only the
    # disabled bernoulli case (offset not supported) would consume it.
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    constant_offset = h2o.H2OFrame(python_obj=[[.5] for _ in range(398)])
    constant_offset.setNames(["x1"])
    cars = cars.cbind(constant_offset)

    # insurance: offset is the log of the "Holders" column
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    # dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    # predictions = dl.predict(cars)

    # gamma, gaussian and poisson take frame-valued x / y
    for family in ("gamma", "gaussian", "poisson"):
        dl = h2o.deeplearning(x=insurance[0:3],
                              y=insurance["Claims"],
                              distribution=family,
                              offset_column="offset",
                              training_frame=insurance)
        predictions = dl.predict(insurance)

    # tweedie is specified through column names instead of frames
    dl = h2o.deeplearning(x=insurance.names[0:3],
                          y="Claims",
                          distribution="tweedie",
                          offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)
def fiftycatRF(ip, port):
    """Train a random forest on 50_cattest_train.csv and score 50_cattest_test.csv.

    Per the original comments the training file carries only categories
    cat1..cat45 while the test file carries cat1..cat50. The performance
    object is shown and its confusion matrix printed; nothing is asserted.
    ``ip`` and ``port`` are not used in this body.
    """
    # Training data: response "y" is converted to a factor.
    training = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    training["y"] = training["y"].asfactor()

    forest = h2o.random_forest(x=training[["x1", "x2"]],
                               y=training["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)

    # Score the test data.
    scoring = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
    scored = forest.predict(scoring)
    scored.head()

    # Performance summary and confusion matrix.
    metrics = forest.model_performance(scoring)
    metrics.show()
    confusion = metrics.confusion_matrix()
    print(confusion)
def link_functions_binomial(ip,port):
    """Compare H2O and statsmodels binomial GLMs (logit link) on prostate data.

    The residual/null deviance ratio of the H2O model may exceed the
    statsmodels ratio by less than 0.01; the check is deliberately
    one-sided, so H2O is allowed to be better.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    print("Read in prostate data.")
    prostate_frame = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    prostate_frame.head()

    # Same data as a plain matrix for statsmodels.
    archive = zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    prostate_matrix = pd.read_csv(archive.open("prostate_complete.csv")).as_matrix()
    sm_response = prostate_matrix[:,2]
    sm_features = prostate_matrix[:,[1,3,4,5,6,7,8,9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    response_name = "CAPSULE"
    predictor_names = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_glm = h2o.glm(x=prostate_frame[predictor_names],
                      y=prostate_frame[response_name].asfactor(),
                      family="binomial",
                      link="logit",
                      alpha=[0.5],
                      Lambda=[0])
    sm_glm = sm.GLM(endog=sm_response,
                    exog=sm_features,
                    family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_ratio = h2o_glm._model_json['output']['residual_deviance'] / h2o_glm._model_json['output']['null_deviance']
    sm_ratio = sm_glm.deviance / sm_glm.null_deviance
    assert h2o_ratio - sm_ratio < 0.01, "expected h2o to have an equivalent or better deviance measures"
def wide_dataset_large(ip,port):
    """Fit a binomial GLM on the wide Arcene data and require AUC above 0.5.

    Labels equal to -1 are mapped to 0 and all others to 1; the label column
    is placed first, followed by the feature columns.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    def _load_arcene(labels_path, data_path):
        # Build an H2OFrame whose column 0 is the 0/1 response.
        labels = np.genfromtxt(h2o.locate(labels_path), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(h2o.locate(data_path), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    training = _load_arcene("smalldata/arcene/arcene_train_labels.labels",
                            "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    glm_model = h2o.glm(x=training[1:3250],
                        y=training[0].asfactor(),
                        family="binomial",
                        lambda_search=False,
                        alpha=[1])

    print("Test model on validation set.")
    validation = _load_arcene("smalldata/arcene/arcene_valid_labels.labels",
                              "smalldata/arcene/arcene_valid.data")
    scored = glm_model.predict(validation)

    print("Check performance of predictions.")
    metrics = glm_model.model_performance(validation)

    print("Check that prediction AUC better than guessing (0.5).")
    assert metrics.auc() > 0.5, "predictions should be better then pure chance"
def link_functions_gamma(ip,port):
    """Compare H2O and statsmodels gamma GLMs for the inverse and log links.

    For each link the residual/null deviance ratio of the H2O model may
    exceed the statsmodels ratio by less than 0.01 (one-sided check).

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    def _h2o_deviance_ratio(model):
        # residual deviance divided by null deviance, read from the model JSON
        output = model._model_json['output']
        return output['residual_deviance'] / output['null_deviance']

    h2o.init(ip, port)

    print("Read in prostate data.")
    prostate_frame = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    prostate_frame.head()

    # Same data as a plain matrix for statsmodels.
    archive = zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    prostate_matrix = pd.read_csv(archive.open("prostate_complete.csv")).as_matrix()
    sm_response = prostate_matrix[:,5]
    sm_features = prostate_matrix[:,[1,2,3,4,6,7,8,9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    response_name = "DPROS"
    predictor_names = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","CAPSULE"]

    # --- inverse link ---
    print("Create models with canonical link: INVERSE")
    h2o_inverse = h2o.glm(x=prostate_frame[predictor_names],
                          y=prostate_frame[response_name],
                          family="gamma",
                          link="inverse",
                          alpha=[0.5],
                          Lambda=[0])
    sm_inverse = sm.GLM(endog=sm_response,
                        exog=sm_features,
                        family=sm.families.Gamma(sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_ratio_inverse = _h2o_deviance_ratio(h2o_inverse)
    sm_ratio_inverse = sm_inverse.deviance / sm_inverse.null_deviance
    assert h2o_ratio_inverse - sm_ratio_inverse < 0.01, "expected h2o to have an equivalent or better deviance measures"

    # --- log link ---
    print("Create models with canonical link: LOG")
    h2o_log = h2o.glm(x=prostate_frame[predictor_names],
                      y=prostate_frame[response_name],
                      family="gamma",
                      link="log",
                      alpha=[0.5],
                      Lambda=[0])
    sm_log = sm.GLM(endog=sm_response,
                    exog=sm_features,
                    family=sm.families.Gamma(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_ratio_log = _h2o_deviance_ratio(h2o_log)
    sm_ratio_log = sm_log.deviance / sm_log.null_deviance
    assert h2o_ratio_log - sm_ratio_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_gaussian(ip,port):
    """Compare H2O and statsmodels gaussian GLMs (identity link) on prostate data.

    The residual/null deviance ratio of the H2O model may exceed the
    statsmodels ratio by less than 0.01 (one-sided check). ``ip`` and
    ``port`` are not used in this body.
    """
    print("Read in prostate data.")
    prostate_frame = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    prostate_frame.head()

    # Same data as a plain matrix for statsmodels.
    archive = zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    prostate_matrix = pd.read_csv(archive.open("prostate_complete.csv")).as_matrix()
    sm_response = prostate_matrix[:,9]
    sm_features = prostate_matrix[:,1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    response_name = "GLEASON"
    predictor_names = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_glm = h2o.glm(x=prostate_frame[predictor_names],
                      y=prostate_frame[response_name],
                      family="gaussian",
                      link="identity",
                      alpha=[0.5],
                      Lambda=[0])
    sm_glm = sm.GLM(endog=sm_response,
                    exog=sm_features,
                    family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_ratio = h2o_glm.residual_deviance() / h2o_glm.null_deviance()
    sm_ratio = sm_glm.deviance / sm_glm.null_deviance
    assert h2o_ratio - sm_ratio < 0.01, "expected h2o to have an equivalent or better deviance measures"
def pubdev_1953():
    """Train a poisson GLM with a validation set on the small citibike files.

    The test passes when ``h2o.glm`` completes without raising.
    """
    # NOTE: the original carried a commented-out pipeline here that built a
    # bikes-per-day frame from bigdata/laptop/citibike-nyc/2013-10.csv, merged
    # it with the 2013/2014 hourly New York weather files (noon readings) and
    # split it by runif(seed=356964763) into train (r < 0.6) and test
    # (0.6 <= r < 0.9). Only the prebuilt small CSVs are loaded below.
    feature_names = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days', 'Month',
                     'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    training = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_train.csv"))
    validation = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_test.csv"))

    poisson_glm = h2o.glm(x=training[feature_names],
                          y=training["bikes"],
                          validation_x=validation[feature_names],
                          validation_y=validation["bikes"],
                          family="poisson")
def smallcatGBM(ip,port):
    """Fit single-stump GBMs with H2O and scikit-learn on alphabet_cattest.csv.

    Per the original comments the file has 26 categories A..Z, where
    A, C, E, ... perfectly predict y = 1 and B, D, F, ... perfectly predict
    y = 0. Both models are only trained; nothing is asserted. ``ip`` and
    ``port`` are not used in this body.
    """
    def _letter_code(field):
        # The category is quoted in the CSV; take the text after the first
        # quote and convert it to its character code.
        return ord(field.split("\"")[1])

    alphabet_frame = h2o.import_file(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet_frame["y"] = alphabet_frame["y"].asfactor()

    # Numeric copy of the data for scikit-learn.
    alphabet_matrix = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                                 delimiter=',',
                                 skiprows=1,
                                 converters={0: _letter_code})
    sci_response = alphabet_matrix[:,1]
    sci_feature = alphabet_matrix[:,0]

    # H2O GBM (naive split): ntrees = 1, max_depth = 1, nbins = 100
    h2o_gbm = h2o.gbm(x=alphabet_frame[['X']],
                      y=alphabet_frame["y"],
                      distribution="bernoulli",
                      ntrees=1,
                      max_depth=1,
                      nbins=100)
    h2o_gbm.show()

    # scikit-learn GBM with the same tree count and depth
    sci_gbm = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    sci_gbm.fit(sci_feature[:,np.newaxis], sci_response)
def dim_checks():
    """Check that H2OFrame dimensions agree with numpy on prostate.csv.

    Verifies the full frame, a single-column slice, and that applying the
    ``&`` operator to a column leaves the row count unchanged.
    """
    # Load logreg/prostate.csv into both H2O and numpy.
    frame = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))
    matrix = np.loadtxt(h2o.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

    frame_rows, frame_cols = frame.dim
    matrix_rows, matrix_cols = matrix.shape
    print('The dimensions of h2o frame is: {0} x {1}'.format(frame_rows, frame_cols))
    print('The dimensions of numpy array is: {0} x {1}'.format(matrix_rows, matrix_cols))
    assert (frame_rows, frame_cols) == (matrix_rows, matrix_cols), "expected equal number of columns and rows"

    # Slice out one column and check its dimensions as well.
    frame_column = frame[4]
    matrix_column = matrix[:,4]
    frame_rows, frame_cols = frame_column.dim
    matrix_rows = matrix_column.shape[0]
    print('The dimensions of h2o column slice is: {0} x {1}'.format(frame_rows, frame_cols))
    print('The dimensions of numpy array column slice is: {0} x 1'.format(matrix_rows))
    assert (frame_rows, frame_cols) == (matrix_rows, 1), "expected equal number of columns and rows"

    # Apply an operator ('&') and make sure the row count is untouched.
    anded_column = frame_column & 5
    assert anded_column.nrow == frame_rows, "expected the number of rows to remain unchanged"
def sdev(ip,port):
    """Check H2O's column standard deviation against numpy on iris.

    The four numeric columns must match ``np.std(..., ddof=1)`` within
    1e-10. Calling ``sd()`` on the fifth (categorical) column or on a
    two-column slice must raise ``EnvironmentError``.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    iris_frame = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_matrix = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                                delimiter=',',
                                skip_header=1,
                                usecols=(0, 1, 2, 3))

    expected_sd = np.std(iris_matrix, axis=0, ddof=1)
    for column in range(4):
        observed_sd = iris_frame[column].sd()
        assert abs(expected_sd[column] - observed_sd) < 1e-10, "expected standard deviations to be the same"

    # Categorical column: sd() has to fail.
    try:
        iris_frame[4].sd()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    # Multi-column slice: sd() has to fail.
    try:
        iris_frame[0:2].sd()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. more than one column."
def asnumeric(ip,port):
    """Exercise ``h2o.asnumeric`` on a whole frame and on a single column.

    In both cases "cylinders" is first converted with ``ascharacter`` and is
    asserted to report as a factor. After the numeric conversion the column
    minus itself must be 0 on every row, so filtering on ``== 0`` has to
    keep the original row count.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    # --- H2OFrame case: convert the whole frame ---
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    original_rows = cars.nrow()

    cars['cylinders'] = cars['cylinders'].ascharacter()
    assert cars["cylinders"].isfactor(), "expected the column to be a factor"

    cars = h2o.asnumeric(cars)
    cars['cylinders'] = cars['cylinders'] - cars['cylinders']
    cars = cars[cars['cylinders'] == 0]
    assert cars.nrow() == original_rows, "expected the same number of rows as before {0}, but got {1}".format(original_rows, cars.nrow())

    # --- H2OVec case: convert only the column ---
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))

    cars['cylinders'] = cars['cylinders'].ascharacter()
    assert cars["cylinders"].isfactor(), "expected the column to be a factor"

    cars['cylinders'] = h2o.asnumeric(cars['cylinders'])
    cars['cylinders'] = cars['cylinders'] - cars['cylinders']
    cars = cars[cars['cylinders'] == 0]
    assert cars.nrow() == original_rows, "expected the same number of rows as before {0}, but got {1}".format(original_rows, cars.nrow())
def group_by(ip,port):
    """Smoke-test ``h2o.group_by`` and compare four aggregates with pandas.

    First every combination of aggregate function, NA-handling mode and
    numeric iris column is run through ``h2o.group_by``. Then min, max,
    mean and sum per class are compared with pandas' ``groupby`` results
    within a tolerance of 1e-06.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    iris_frame = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_pandas = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    aggregate_names = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_modes = ["ignore","rm","all"]
    numeric_columns = iris_frame.col_names()[0:4]

    print("Running smoke test")
    for aggregate in aggregate_names:
        for na_mode in na_modes:
            for column in numeric_columns:
                print("group by : " + str(aggregate) + "; " + str(na_mode) + "; " + str(column))
                h2o.group_by(iris_frame, ["class"], {"foo":[aggregate,column,na_mode]})

    # h2o/pandas/numpy comparison test
    numpy_aggregates = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for aggregate in numpy_aggregates:
        for column in numeric_columns:
            print("group by comparison: " + str(aggregate) + "; " + str(column))
            h2o_result = h2o.group_by(iris_frame, ["class"], {"foo":[aggregate,column,"all"]})
            pandas_result = iris_pandas.groupby("class")[column].aggregate(numpy_aggregates[aggregate])
            # Three iris classes -> three result rows; column 0 holds the
            # class label used to look up the pandas value.
            for row in range(3):
                h2o_value = h2o_result[row,1]
                pandas_value = pandas_result[h2o_result[row,0]]
                assert abs(h2o_value - pandas_value) < 1e-06, \
                    ("check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} "
                     "values between h2o and pandas on column {3}").format(h2o_value,pandas_value,aggregate,column)
def milsong_checkpoint(ip,port):
    """Exercise GBM checkpoint restart on the million-songs data.

    A first model with randomly drawn ntrees/max_depth/min_rows is trained,
    saved, reloaded and the saved directory removed. A second model then
    continues from the restored checkpoint with 50 more trees, and a third
    is built in one shot with the same final settings. Nothing is asserted.
    ``ip`` and ``port`` are not used in this body.
    """
    songs_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    songs_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    family = "gaussian"

    # --- first model: randomly chosen hyper-parameters ---
    first_ntrees = random.sample(range(50,100),1)[0]
    first_depth = random.sample(range(2,6),1)[0]
    first_min_rows = random.sample(range(10,16),1)[0]
    print("ntrees model 1: {0}".format(first_ntrees))
    print("max_depth model 1: {0}".format(first_depth))
    print("min_rows model 1: {0}".format(first_min_rows))
    first_model = h2o.gbm(x=songs_train[1:],
                          y=songs_train[0],
                          ntrees=first_ntrees,
                          max_depth=first_depth,
                          min_rows=first_min_rows,
                          distribution=family,
                          validation_x=songs_valid[1:],
                          validation_y=songs_valid[0])

    # --- save, reload, then delete the on-disk copy ---
    saved_path = h2o.save_model(first_model, name="delete_model", force=True)
    reloaded_model = h2o.load_model(saved_path)
    shutil.rmtree("delete_model")

    # --- second model: continue from the checkpoint with 50 more trees ---
    second_ntrees = first_ntrees + 50
    second_depth = first_depth
    second_min_rows = first_min_rows
    print("ntrees model 2: {0}".format(second_ntrees))
    print("max_depth model 2: {0}".format(second_depth))
    print("min_rows model 2: {0}".format(second_min_rows))
    continued_model = h2o.gbm(x=songs_train[1:],
                              y=songs_train[0],
                              ntrees=second_ntrees,
                              max_depth=second_depth,
                              min_rows=second_min_rows,
                              distribution=family,
                              validation_x=songs_valid[1:],
                              validation_y=songs_valid[0],
                              checkpoint=reloaded_model._id)

    # --- third model: the equivalent of model 2 built in one shot ---
    one_shot_model = h2o.gbm(x=songs_train[1:],
                             y=songs_train[0],
                             ntrees=second_ntrees,
                             max_depth=second_depth,
                             min_rows=second_min_rows,
                             distribution=family,
                             validation_x=songs_valid[1:],
                             validation_y=songs_valid[0])
def shuffling_large(ip,port):
    """Check that GLM coefficients are stable under retraining and row shuffling.

    Two models are trained on the original Arcene file and one on the
    shuffled file; entries 1 and 2 of every coefficient-table row must have
    matching types and, when they are floats, agree within 5e-10.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    def _assert_matching_coefficients(reference, candidate):
        # Walk both coefficient tables row by row and compare entries 1 and 2.
        reference_rows = reference._model_json['output']['coefficients_table'].cell_values
        candidate_rows = candidate._model_json['output']['coefficients_table'].cell_values
        for ref_row, cand_row in zip(reference_rows, candidate_rows):
            assert (type(ref_row[1]) == type(cand_row[1])) and (type(ref_row[2]) == type(cand_row[2])), "coefficients should be the same type"
            for entry in (1, 2):
                if isinstance(ref_row[entry], float):
                    assert abs(ref_row[entry] - cand_row[entry]) < 5e-10, "coefficients should be equal"

    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    arcene = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    arcene_shuffled = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    first_model = h2o.glm(x=arcene[0:1000],
                          y=arcene[1000],
                          family="binomial",
                          lambda_search=True,
                          alpha=[0.5],
                          use_all_factor_levels=True)

    print("Create second model on original Arcene dataset.")
    repeat_model = h2o.glm(x=arcene[0:1000],
                           y=arcene[1000],
                           family="binomial",
                           lambda_search=True,
                           alpha=[0.5],
                           use_all_factor_levels=True)

    print("Create model on shuffled Arcene dataset.")
    shuffled_model = h2o.glm(x=arcene_shuffled[0:1000],
                             y=arcene_shuffled[1000],
                             family="binomial",
                             lambda_search=True,
                             alpha=[0.5],
                             use_all_factor_levels=True)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    _assert_matching_coefficients(first_model, repeat_model)
    _assert_matching_coefficients(first_model, shuffled_model)
def get_model_test(ip,port):
    """Check that ``h2o.get_model`` returns models equivalent to the originals.

    Regression, binomial and multinomial models are trained, fetched again
    by key, and the fetched copy must report the expected model category and
    give the same first-column prediction for every test row. A k-means
    model is fetched as well and only its category is checked.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    :raises AssertionError: on a category or prediction mismatch.
    """
    def _assert_same_predictions(expected, actual, kind):
        # Compare the first prediction column row by row.
        for row in range(expected.nrow()):
            p1 = expected[row,0]
            p2 = actual[row,0]
            assert p1 == p2, ("expected " + kind + " predictions to be the same for row {0}, but got {1} and {2}"
                              "").format(row, p1, p2)

    h2o.init(ip, port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Disjoint 70/30 split. The test side used to be `>= 0.30`, which
    # overlapped the training rows. The split vector also has its own name
    # now, so the row loops no longer overwrite it.
    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)
    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)
    _assert_same_predictions(predictions1, predictions2, "regression")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)
    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)
    _assert_same_predictions(predictions1, predictions2, "binomial")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)
    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)
    _assert_same_predictions(predictions1, predictions2, "multinomial")
def test_locate():
    """Check that ``h2o.locate`` finds an existing file and rejects a missing one.

    Locating a file that does not exist must raise ``ValueError``.
    """
    # Must not raise for a file that exists; the path itself is not inspected.
    existing_path = h2o.locate("smalldata/iris/iris.csv")

    try:
        h2o.locate("smalldata/iris/afilethatdoesnotexist.csv")
    except ValueError:
        pass
    else:
        assert False, "Expected h2o.locate to raise a ValueError"
def frame_show(ip, port):
    """Import the iris, prostate and airlines data sets and call ``show`` on each.

    All three frames are imported before the first ``show``. ``ip`` and
    ``port`` are not used in this body.
    """
    dataset_paths = ("smalldata/iris/iris_wheader.csv",
                     "smalldata/prostate/prostate.csv.zip",
                     "smalldata/airlines/allyears2k.zip")
    frames = [h2o.import_file(path=h2o.locate(relative)) for relative in dataset_paths]
    for frame in frames:
        frame.show()
def bernoulliGBM(ip,port):
    """Compare test-set AUC of an H2O bernoulli GBM with scikit-learn's.

    Both models use 100 trees, learning rate 0.1, depth 5 and a minimum of
    10 rows/samples per leaf. The H2O AUC on prostate_test.csv must be at
    least the scikit-learn AUC. ``ip`` and ``port`` are not used here.
    """
    tree_count = 100
    shrinkage = 0.1
    tree_depth = 5
    leaf_rows = 10

    # --- training data: H2O frame (CAPSULE as factor) and numpy matrix ---
    train_frame = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate_train.csv"))
    train_frame["CAPSULE"] = train_frame["CAPSULE"].asfactor()

    train_matrix = np.loadtxt(h2o.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
    train_response = train_matrix[:,0]
    train_features = train_matrix[:,1:]

    # --- H2O GBM classification model ---
    h2o_gbm = h2o.gbm(x=train_frame[1:],
                      y=train_frame["CAPSULE"],
                      ntrees=tree_count,
                      learn_rate=shrinkage,
                      max_depth=tree_depth,
                      min_rows=leaf_rows,
                      distribution="bernoulli")

    # --- scikit-learn GBM with the same parameters ---
    sci_gbm = ensemble.GradientBoostingClassifier(learning_rate=shrinkage,
                                                  n_estimators=tree_count,
                                                  max_depth=tree_depth,
                                                  min_samples_leaf=leaf_rows,
                                                  max_features=None)
    sci_gbm.fit(train_features, train_response)

    # --- test data: H2O frame (CAPSULE as factor) and numpy matrix ---
    test_frame = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate_test.csv"))
    test_frame["CAPSULE"] = test_frame["CAPSULE"].asfactor()

    test_matrix = np.loadtxt(h2o.locate("smalldata/logreg/prostate_test.csv"), delimiter=',', skiprows=1)
    test_response = test_matrix[:,0]
    test_features = test_matrix[:,1:]

    # --- score both models on the test data ---
    sci_auc = roc_auc_score(test_response, sci_gbm.predict_proba(test_features)[:,1])

    h2o_performance = h2o_gbm.model_performance(test_frame)
    h2o_auc = h2o_performance.auc()

    assert h2o_auc >= sci_auc, "h2o (auc) performance degradation, with respect to scikit"
def headers(ip,port):
    """Import airlines data with column names taken from a headers-only file.

    The headers-only frame is passed as ``col_names`` when importing
    allyears2k.zip; afterwards both frames must report identical names.

    :param ip: host handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)

    header_frame = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    data_frame = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k.zip"), col_names=header_frame)

    print(header_frame.names())
    print(data_frame.names())
    assert header_frame.names() == data_frame.names(), \
        "Expected the same column names but got {0} and {1}".format(header_frame.names(), data_frame.names())
def headers():
    """Apply column names from a headers-only file to the airlines frame.

    The names of the headers-only frame are set on the allyears2k frame via
    ``set_names``; afterwards both frames must report identical names.
    """
    header_frame = h2o.import_file(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    data_frame = h2o.import_file(h2o.locate("smalldata/airlines/allyears2k.zip"))
    data_frame.set_names(header_frame.names)

    print(header_frame.names)
    print(data_frame.names)
    assert header_frame.names == data_frame.names, \
        "Expected the same column names but got {0} and {1}".format(header_frame.names, data_frame.names)
def pubdev_1839(ip, port):
    """Train a poisson GLM with a validation set on the PUBDEV-1839 repro files.

    Every column except "bikes" is used as a predictor. The test passes
    when ``h2o.glm`` completes without raising. ``ip`` and ``port`` are not
    used in this body.
    """
    training = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    validation = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    poisson_glm = h2o.glm(x=training.drop("bikes"),
                          y=training["bikes"],
                          validation_x=validation.drop("bikes"),
                          validation_y=validation["bikes"],
                          family="poisson")
def deeplearning_autoencoder(ip, port):
    """Train a random forest on autoencoder deep features of MNIST.

    One half of the training data trains a 20-unit Tanh autoencoder; the
    other half is mapped through ``deepfeatures`` and used to train a
    10-tree random forest. The confusion-matrix cell [10][10] on the
    transformed test set must be within 0.001 of 0.1078. ``ip`` and
    ``port`` are not used in this body.
    """
    label_index = 784
    feature_count = 20  # number of features (smallest hidden layer)

    mnist_train = h2o.upload_file(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_train[label_index] = mnist_train[label_index].asfactor()
    mnist_test = h2o.upload_file(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))
    mnist_test[label_index] = mnist_test[label_index].asfactor()

    # Split the training data into two halves with a seeded random column.
    draw = mnist_train[0].runif(1234)

    # Half for the autoencoder.
    # NOTE(review): the result of drop() is discarded here; the label column
    # is excluded later through the [0:label_index] slice instead.
    unsupervised_half = mnist_train[draw >= 0.5]
    unsupervised_half.drop(label_index)
    unsupervised_half.describe()

    # Half for the random forest.
    supervised_half = mnist_train[draw < 0.5]
    supervised_half.describe()

    # Train the autoencoder.
    autoencoder = h2o.deeplearning(x=unsupervised_half[0:label_index],
                                   activation="Tanh",
                                   autoencoder=True,
                                   hidden=[feature_count],
                                   epochs=1,
                                   reproducible=True,  # slow, turn off for real problems
                                   seed=1234)

    # Map the supervised half into the lower-dimensional feature space.
    supervised_features = autoencoder.deepfeatures(supervised_half[0:label_index]._frame(), 0)
    assert supervised_features.ncol() == feature_count, "Dimensionality of reconstruction is wrong!"

    # Train DRF on the extracted features.
    forest = h2o.random_forest(x=supervised_features[0:20],
                               y=supervised_half[label_index],
                               ntrees=10,
                               min_rows=10,
                               seed=1234)

    # Push the test set through the same deep features and re-attach labels.
    test_features = autoencoder.deepfeatures(mnist_test[0:label_index]._frame(), 0)
    test_features = test_features.cbind(mnist_test[label_index])._frame()

    # Confusion matrix and assertion: 10% error +/- 0.001
    confusion = forest.confusion_matrix(test_features)
    confusion.show()
    assert abs(confusion.cell_values[10][10] - 0.1078) < 0.001, "Error. Expected 0.1078, but got {0}".format(confusion.cell_values[10][10])
def pub_445_long_request_uri(ip, port):
    """Build a GBM on all MNIST pixel columns (PUB-445 long request URI repro).

    Connects to the cluster at ``ip``/``port``, renames column 784 to
    "label" on both frames and trains a GBM with validation data. Nothing is
    asserted; the test passes if the model builds without raising.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # NOTE(review): the "test" frame is loaded from the same train.csv.gz file
    # as the training frame -- confirm this is intentional.
    mnist_train = h2o.import_frame(
        path=h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_test = h2o.import_frame(
        path=h2o.locate("bigdata/laptop/mnist/train.csv.gz"))

    mnist_train[784]._name = "label"
    mnist_test[784]._name = "label"

    mnist_model = h2o.gbm(
        x=mnist_train.drop("label"),
        y=mnist_train["label"],
        validation_x=mnist_test.drop("label"),
        validation_y=mnist_test["label"],
        ntrees=100,
        max_depth=10,
    )
def swpredsRF(ip, port):
    """Train DRF models with and without a noise column and print their AUC.

    Training set has two predictor columns:
      X1: 10 categorical levels, 100 observations per level
      X2: Unif(0,1) noise
    Ratio of y = 1 per level: cat01 = 1.0 (strong predictor),
    cat02 to cat10 = 0.5 (weak predictors).

    ``ip`` and ``port`` are accepted but not used in this function.
    """
    frame = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    frame["y"] = frame["y"].asfactor()

    # DRF using only the real predictor column.
    predictor_only = h2o.random_forest(x=frame[["X1"]],
                                       y=frame["y"],
                                       ntrees=50,
                                       max_depth=20,
                                       nbins=500)
    predictor_only.show()
    predictor_only_perf = predictor_only.model_performance(frame)
    print(predictor_only_perf.auc())

    # DRF including the noise column.
    with_noise = h2o.random_forest(x=frame[["X1", "X2"]],
                                   y=frame["y"],
                                   ntrees=50,
                                   max_depth=20,
                                   nbins=500)
    with_noise.show()
    with_noise_perf = with_noise.model_performance(frame)
    print(with_noise_perf.auc())
def var_test(ip, port):
    """Compare per-column variance from H2O with numpy on the iris data.

    The four numeric iris columns are read with both H2O and numpy, and each
    H2O column variance must match numpy's sample variance (``ddof=1``) to
    within 1e-10. ``ip`` and ``port`` are accepted but not used here.
    """
    iris_h2o = h2o.import_file(
        path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    expected_vars = np.var(iris_np, axis=0, ddof=1)
    for col in range(4):
        h2o_var = iris_h2o[col].var()
        assert abs(expected_vars[col] - h2o_var) < 1e-10, \
            "expected equal variances"

    # NOTE(review): both covariance matrices are computed but never compared;
    # only the fact that the calls succeed is exercised here.
    var_cov_h2o = iris_h2o[0:4].var()
    var_cov_np = np.cov(iris_np, rowvar=0, ddof=1)
def nfold_predict(ip, port):
    """Predict with every cross-validation model of a GBM and show the mean.

    A 10-fold GBM is trained on the prostate training data; each fold model
    then scores the full frame and the predictions are summed, divided by 10
    and displayed. ``ip`` and ``port`` are accepted but not used here.
    """
    frame = h2o.import_file(
        path=h2o.locate("smalldata/logreg/prostate_train.csv"))
    gbm_model = h2o.gbm(x=frame[2:], y=frame[1], nfolds=10, ntrees=10)
    fold_models = gbm_model.get_xval_models()

    # Constant weights column added before scoring with the fold models.
    frame["weights"] = 1

    fold_preds = [fold_model.predict(frame) for fold_model in fold_models]
    averaged = sum(fold_preds) / 10
    averaged.show()
def multi_dim_slicing(ip, port):
    """Check two-dimensional slicing of the prostate frame against known rows.

    Connects to the cluster at ``ip``/``port``, imports prostate.csv and
    asserts on the values returned by ``[slice, int]``, ``[int, int]``,
    ``[int, slice]`` and ``[slice, slice]`` indexing. Several assertions
    deliberately apply arithmetic to the sliced value before comparing.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    prostate = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/prostate.csv"))

    # prostate[int,int] case
    # 48,0,68,1,2,1,12.3,16.3,8
    sliced = prostate[47:51, 7]
    assert sliced[0] == 16.3, "Incorrect slicing result"
    sliced = prostate[172, 8]
    assert sliced == 7, "Incorrect slicing result"

    # prostate[slice,int] case
    # rows:
    # 171,1,74,1,3,1,7,0,6
    # 172,1,71,1,3,1,3.3,0,6
    # 173,1,60,1,4,1,7.3,0,7
    # 174,1,62,1,2,1,17.2,0,7
    # 175,0,71,1,2,1,3.8,19,6
    # 176,0,67,1,3,1,5.7,15.4,6
    sliced = prostate[170:176, 2]
    for row, expected_age in enumerate((74, 71, 60, 62, 71, 67)):
        assert sliced[row] == expected_age, "Incorrect slicing result"

    # Disabled case, kept for reference:
    # # prostate[int,list] case
    # # 353,0,54,1,3,1,21.6,25,7
    # # 226,0,68,1,2,1,12.7,0,7
    # # 238,0,66,1,2,1,11,36.6,6
    # sliced = prostate[6,[352,225,237]]
    # assert (sliced[0,0] - 12.7) < 1e-10, "Incorrect slicing result"
    # assert (sliced[0,1] - 11) < 1e-10, "Incorrect slicing result"
    # assert (sliced[0,2] - 21.6) < 1e-10, "Incorrect slicing result"

    # prostate [int,slice] case
    # 189,1,69,1,3,2,8,31.2,6
    sliced = prostate[188, 0:3]
    assert sliced[0, 0] == 189, "Incorrect slicing result"
    assert sliced[0, 1] + 1 == 2, "Incorrect slicing result"
    assert sliced[0, 2] == 69, "Incorrect slicing result"

    # prostate [slice,slice] case
    # 84,0,75,1,2,1,11,35,7
    # 85,0,75,1,1,1,9.9,15.4,7
    # 86,1,75,1,3,1,3.7,0,6
    sliced = prostate[83:86, 1:4]
    # first row
    assert sliced[0, 0] == 0, "Incorrect slicing result"
    assert sliced[0, 1] == 75, "Incorrect slicing result"
    assert sliced[0, 2] - 1 == 0, "Incorrect slicing result"
    # second row
    assert sliced[1, 0] == 0, "Incorrect slicing result"
    assert sliced[1, 1] + 75 == 150, "Incorrect slicing result"
    assert sliced[1, 2] == 1, "Incorrect slicing result"
    # third row
    assert sliced[2, 0] + 1 == 2, "Incorrect slicing result"
    assert sliced[2, 1] == 75, "Incorrect slicing result"
    assert sliced[2, 2] == 1, "Incorrect slicing result"
def sub_gsub_check(ip, port):
    """Exercise ``sub`` and ``gsub`` on the iris class column.

    ``gsub`` is expected to replace every match and ``sub`` only the first
    one, as reflected by the expected strings below. The check is done once
    by assigning back into the frame and once on a standalone column.
    ``ip`` and ``port`` are accepted but not used in this function.
    """
    frame = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # single column (frame)
    frame["C5"] = frame["C5"].gsub("s", "z")
    assert frame[0, 4] == "Iriz-zetoza", \
        "Expected 'Iriz-zetoza', but got {0}".format(frame[0, 4])

    frame["C5"] = frame["C5"].sub("z", "s")
    assert frame[1, 4] == "Iris-zetoza", \
        "Expected 'Iris-zetoza', but got {0}".format(frame[1, 4])

    # single column (vec)
    column = frame["C5"]

    column = column.sub("z", "s")
    assert column[2, 0] == "Iris-setoza", \
        "Expected 'Iris-setoza', but got {0}".format(column[2, 0])

    column = column.gsub("s", "z")
    assert column[3, 0] == "Iriz-zetoza", \
        "Expected 'Iriz-zetoza', but got {0}".format(column[3, 0])
def demo_prostateGBM(ip, port):
    """Execute the prostate GBM demo notebook on the cluster at ip/port."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # Locate the notebook, then run it (save_and_norun is off).
    notebook_path = h2o.locate("h2o-py/demos/prostate_gbm.ipynb")
    h2o.ipy_notebook_exec(notebook_path, save_and_norun=False)
def rf_balance_classes(ip, port):
    """Execute the rf_balance_classes demo notebook on the cluster at ip/port."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # Locate the notebook, then run it (save_and_norun is off).
    notebook_path = h2o.locate("h2o-py/demos/rf_balance_classes.ipynb")
    h2o.ipy_notebook_exec(notebook_path, save_and_norun=False)
def link_functions_tweedie_basic(ip, port):
    """Fit a tweedie GLM on prostate data and compare its deviance ratio to R.

    The residual/null deviance ratio from H2O may not be more than 0.01 below
    the precomputed R value (0.721452). ``ip`` and ``port`` are accepted but
    not used in this function.
    """
    print("Read in prostate data.")
    hdf = h2o.upload_file(
        h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    response = "CAPSULE"
    predictors = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with canonical link: TWEEDIE")
    tweedie_fit = h2o.glm(x=hdf[predictors],
                          y=hdf[response],
                          family="tweedie",
                          link="tweedie",
                          alpha=[0.5],
                          Lambda=[0])

    print("Compare model deviances for link function tweedie (using precomputed values from R)")
    r_ratio = 0.721452  # precomputed in R
    h2o_ratio = tweedie_fit.residual_deviance() / tweedie_fit.null_deviance()
    assert r_ratio - h2o_ratio <= 0.01, (
        "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: "
        "{0}, r: {1}".format(h2o_ratio, r_ratio))
def expr_show(ip, port):
    """Call ``show()`` on expressions in different evaluation states.

    Connects to the cluster at ``ip``/``port`` and displays the iris frame, a
    column of an arithmetic expression (twice), and two ``Expr`` objects built
    from local Python lists. Nothing is asserted.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print("iris:")
    iris.show()

    ###################################################################

    # expr[int], expr._data is pending
    shifted = 2 - iris
    first_pending = shifted[0]
    print("res2:")
    first_pending.show()

    # expr[int], expr._data is remote
    first_remote = shifted[0]
    print("res3:")
    first_remote.show()

    # expr[int], expr._data is local
    local_expr = Expr([1, 2, 3])
    print("expr:")
    local_expr.show()

    # expr[tuple], expr._data is local
    local_expr = Expr([[1, 2, 3], [4, 5, 6]])
    print("expr:")
    local_expr.show()
def center_scale(ip, port):
    """Call ``scale()`` on a frame and on single columns with all flag combos.

    Connects to the cluster at ``ip``/``port`` and runs ``scale`` with default
    arguments and with every explicit ``center``/``scale`` combination other
    than (True, True). No result is asserted; the only value check is
    commented out below.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))[0:4]

    # Explicit (center, scale) combinations, in the order they are exercised.
    flag_combos = ((True, False), (False, True), (False, False))

    # frame (default args)
    foo = iris.scale()
    # TODO: the below assertion fails. Should it?
    #assert abs(foo[0,0] - -0.8976739) < 1e-6 and abs(foo[0,1] - 1.01560199) < 1e-6 and abs(foo[0,2] - -1.335752) < 1e-6 \
    #       and abs(foo[0,3] - -1.311052) < 1e-6, "h2o differed from r. h2o got {0}, {1}, {2}, and {3}" \
    #                                             "".format(foo[0,0],foo[0,1],foo[0,2],foo[0,3])

    # frame with explicit flags
    for do_center, do_scale in flag_combos:
        foo = iris.scale(center=do_center, scale=do_scale)

    # vec (default args) on column 0
    foo = iris[0].scale()

    # vec with explicit flags, one combination per column 1..3
    for col, (do_center, do_scale) in enumerate(flag_combos, 1):
        foo = iris[col].scale(center=do_center, scale=do_scale)
def demo_citibike_small(ip, port):
    """Execute the citi_bike_small demo notebook on the cluster at ip/port."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # Locate the notebook, then run it (save_and_norun is off).
    notebook_path = h2o.locate("h2o-py/demos/citi_bike_small.ipynb")
    h2o.ipy_notebook_exec(notebook_path, save_and_norun=False)
def offset_gaussian(ip, port):
    """Check a gaussian GBM with an offset column against values from R's gbm.

    The offset is log(Holders) on the insurance data. ``init_f``, the MSE and
    the mean/min/max of the predictions are compared with recorded reference
    values. ``ip`` and ``port`` are accepted but not used in this function.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3],
                  y=insurance["Claims"],
                  distribution="gaussian",
                  ntrees=600,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=.1,
                  offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #	fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #   data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    model_output = gbm._model_json['output']
    assert abs(44.33016 - model_output['init_f']) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(
            44.33016, gbm._model_json['output']['init_f'])

    assert abs(1491.135 - gbm.mse()) < 1e-2, \
        "expected mse to be {0}, but got {1}".format(1491.135, gbm.mse())

    assert abs(49.23438 - predictions.mean()) < 1e-2, \
        "expected prediction mean to be {0}, but got {1}".format(
            49.23438, predictions.mean())

    assert abs(-45.5720659304 - predictions.min()) < 1e-2, \
        "expected prediction min to be {0}, but got {1}".format(
            -45.5720659304, predictions.min())

    assert abs(207.387 - predictions.max()) < 1e-2, \
        "expected prediction max to be {0}, but got {1}".format(
            207.387, predictions.max())
def benign(ip, port):
    """Verify a binomial GLM reports coefficients for the columns passed in.

    Connects to the cluster at ``ip``/``port``, fits a GLM on benign.csv with
    column 3 as the response and columns 2..10 (minus the response) as
    predictors, then checks that the leading names in the coefficients table
    match the predictor names in order.
    """
    # Connect to h2o
    h2o.init(ip, port)

    training_data = h2o.import_frame(h2o.locate("smalldata/logreg/benign.csv"))

    response_idx = 3
    predictor_idx = [col for col in range(2, 11) if col != response_idx]

    # Build the model
    model = h2o.glm(y=training_data[response_idx].asfactor(),
                    x=training_data[predictor_idx],
                    family="binomial",
                    alpha=[0],
                    Lambda=[1e-5])

    # Columns passed in
    in_names = [training_data.names()[col] for col in predictor_idx]

    # Columns reported back: first field of the first len(predictors) rows
    coef_table = model._model_json['output']['coefficients_table']
    out_names = [
        coef_table.cell_values[row][0] for row in range(len(predictor_idx))
    ]

    assert in_names == out_names
def deep_learning_metrics_test(ip, port):
    """Train a deep learning classifier on prostate data and show its metrics.

    Connects to the cluster at ``ip``/``port``, prepares the frame (drops ID,
    makes CAPSULE categorical, marks zero VOL as missing), splits it roughly
    80/20 at random, trains the model and displays training output and
    test-set performance. Nothing is asserted.
    """
    h2o.init(ip, port)  # connect to existing cluster

    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    del df['ID']  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical

    vol = df['VOL']
    vol[vol == 0] = None  # 0 VOL means 'missing'

    # random train/test split
    split = vol.runif()
    train = df[split < 0.8]
    test = df[split >= 0.8]

    # See that the data is ready
    for part in (train, test):
        part.describe()
        part.head()

    # Run DeepLearning
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:],
                          y=train['CAPSULE'],
                          epochs=100,
                          hidden=[10, 10, 10],
                          loss='CrossEntropy')

    print("Binomial Model Metrics: ")
    print(dl.show())
    # print dl._model_json
    dl.model_performance(test).show()
def prep_airlines(ip, port):
    """Derive a delayed/not-delayed response on airlines data and fit a GBM.

    Connects to the cluster at ``ip``/``port``, drops rows with missing
    DepDelay, appends an indicator for DepDelay > 15 minutes as a new factor
    column named "SynthDepDelayed" and trains a GBM on it. Nothing is
    asserted.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    air = h2o.import_frame(
        h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    # The column count doubles as the index of the column appended below.
    num_rows, num_cols = air.dim()

    predictors = [
        "Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime",
        "UniqueCarrier", "CRSElapsedTime", "Origin", "Dest", "Distance"
    ]
    response = "SynthDepDelayed"

    with_delay = air[air["DepDelay"].isna() == 0]
    print("Dimensions of new dataset: {0}".format(with_delay.dim()))

    tolerated_minutes = 15
    # NOTE(review): the return value of cbind() is ignored, so this relies on
    # cbind() appending to the frame in place -- confirm.
    with_delay.cbind(with_delay["DepDelay"] > tolerated_minutes)
    with_delay[num_cols] = with_delay[num_cols].asfactor()
    with_delay._vecs[num_cols].setName(response)

    gbm = h2o.gbm(x=with_delay[predictors],
                  y=with_delay[response],
                  loss="bernoulli")
    gbm.show()
def link_incompatible_error(ip, port):
    """Check that GLM rejects family/link pairs that are incompatible.

    Each of the three fits below must raise ``EnvironmentError``; if a fit
    succeeds, the ``assert False`` fails the test (an AssertionError is not
    caught by the handler). ``ip`` and ``port`` are accepted but not used.
    """
    print("Reading in original prostate data.")
    prostate = h2o.import_file(
        path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    print(
        "Throw error when trying to create model with incompatible logit link."
    )

    # Deferred so that each frame slice and fit is evaluated inside its own
    # try block, one after the other.
    incompatible_fits = (
        lambda: h2o.glm(x=prostate[1:8], y=prostate[8],
                        family="gaussian", link="logit"),
        lambda: h2o.glm(x=prostate[1:8], y=prostate[8],
                        family="tweedie", link="log"),
        lambda: h2o.glm(x=prostate[2:9], y=prostate[1],
                        family="binomial", link="inverse"),
    )

    for build_model in incompatible_fits:
        try:
            # The result is stored as an attribute on the h2o module.
            h2o.model = build_model()
            assert False, "expected an error"
        except EnvironmentError:
            # Expected outcome: the incompatible link was rejected.
            pass
def offset_gamma(ip, port):
    """Check a gamma GBM with an offset column against reference values.

    The offset is log(Holders) on the insurance data. ``init_f`` and the
    mean/min/max of the predictions are compared with values recorded from
    harrysouthworth's gbm. ``ip`` and ``port`` are accepted but not used.
    """
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3],
                  y=insurance["Claims"],
                  distribution="gamma",
                  ntrees=600,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=.1,
                  offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from harrysouthworth's gbm:
    #	fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #           data = Insurance, distribution ="gamma", n.trees = 600)
    #	pr = predict(fit2, Insurance)
    #	pr = exp(pr+log(Insurance$Holders))
    model_output = gbm._model_json['output']
    assert abs(-1.714958 - model_output['init_f']) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(
            -1.714958, gbm._model_json['output']['init_f'])

    assert abs(50.1087 - predictions.mean()) < 1e-2, \
        "expected prediction mean to be {0}, but got {1}".format(
            50.1087, predictions.mean())

    assert abs(0.9133843 - predictions.min()) < 1e-4, \
        "expected prediction min to be {0}, but got {1}".format(
            0.9133843, predictions.min())

    assert abs(392.6667 - predictions.max()) < 0.1, \
        "expected prediction max to be {0}, but got {1}".format(
            392.6667, predictions.max())
def expr_slicing(ip, port):
    """Check slicing of arithmetic expressions built from the iris frame.

    Connects to the cluster at ``ip``/``port`` and asserts known values for
    ``expr[int]``, ``expr[int, int]``, ``expr[int, slice]``,
    ``expr[slice, int]`` and ``expr[slice, slice]``.

    NOTE(review): several assertions read one position past the nominal end
    of the slice (e.g. column 3 of a ``0:3`` slice); presumably this matches
    the slicing semantics of the expression API under test -- confirm.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    ###################################################################

    # expr[int] (column slice), expr is pending
    res = 2 - iris
    first_col = res[0]
    for row, expected in ((3, -2.6), (17, -3.1), (24, -2.8)):
        assert abs(first_col[row, 0] - expected) < 1e-10, "incorrect values"

    # expr[int,int], expr is remote
    assert abs(res[13, 3] - 1.9) < 1e-10, "incorrect values"

    # expr[int, slice], expr is remote
    row_slice = res[12, 0:3]
    for col, expected in enumerate((-2.8, -1.0, 0.6, 1.9)):
        assert abs(row_slice[0, col] - expected) < 1e-10, "incorrect values"

    # expr[slice, int], expr is remote
    col_slice = res[5:8, 1]
    for row, expected in enumerate((-1.9, -1.4, -1.4, -0.9)):
        assert abs(col_slice[row, 0] - expected) < 1e-10, "incorrect values"

    # expr[slice, slice], expr is pending
    res = iris * 2
    block = res[5:8, 0:3]
    # Only the diagonal of the sliced block is checked.
    for k, expected in enumerate((10.8, 6.8, 3.0, 0.4)):
        assert abs(block[k, k] - expected) < 1e-10, "incorrect values"
def demo_imputation(ip, port):
    """Execute the imputation demo notebook on the cluster at ip/port."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # Locate the notebook, then run it (save_and_norun is off).
    notebook_path = h2o.locate("h2o-py/demos/imputation.ipynb")
    h2o.ipy_notebook_exec(notebook_path, save_and_norun=False)
def weights_gamma(ip, port):
    """Check a weighted gamma GBM on the moppe data against reference values.

    ``init_f`` and the min/max/mean of the first prediction column are
    compared with recorded values using relative tolerances. ``ip`` and
    ``port`` are accepted but not used in this function.
    """
    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    # "zon" is written back unchanged (no asfactor conversion here).
    htable["zon"] = htable["zon"]

    # Reference values come from R:
    #gg = gbm(formula = medskad ~ premiekl + moptva + zon,data = table.1.2,distribution = "gamma", weights = table.1.2$antskad ,
    #     n.trees = 20,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,bag.fraction = 1,train.fraction = 1)
    #pr = predict(gg,newdata = table.1.2,type = "response")
    #htable= as.h2o(table.1.2,destination_frame = "htable")
    weighted_fit = h2o.gbm(x=htable[0:3],
                           y=htable["medskad"],
                           training_frame=htable,
                           distribution="gamma",
                           weights_column="antskad",
                           ntrees=20,
                           max_depth=1,
                           min_rows=1,
                           learn_rate=1)
    preds = weighted_fit.predict(htable)

    init_f = weighted_fit._model_json['output']['init_f']
    assert abs(8.804447 - init_f) < 1e-6 * 8.804447
    assert abs(3751.01 - preds[0].min()) < 1e-4 * 3751.01
    assert abs(15298.87 - preds[0].max()) < 1e-4 * 15298.87
    assert abs(8121.98 - preds[0].mean()) < 1e-4 * 8121.98
def ls_test(ip, port):
    """Import a frame and call ``h2o.ls()`` on the cluster at ip/port.

    Nothing is asserted; the test passes if ``h2o.ls()`` does not raise.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Imported before ls() is called; the frame itself is not used further.
    iris_frame = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    h2o.ls()
def perfectSeparation_balanced(ip, port):
    """Fit a binomial GLM on perfectly separable data and bound coefficients.

    Connects to the cluster at ``ip``/``port``, fits on the balanced synthetic
    dataset and asserts that every non-intercept coefficient is below 50.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    data = h2o.import_frame(
        path=h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = h2o.glm(x=data[["x1", "x2"]],
                    y=data["y"],
                    family="binomial",
                    lambda_search=True,
                    use_all_factor_levels=True,
                    alpha=[0.5],
                    Lambda=[0])

    print(
        "Extract models' coefficients and assert reasonable values (ie. no greater than 50)"
    )
    print("Balanced dataset")
    coef_rows = model._model_json['output']['coefficients_table'].cell_values
    # Each row starts with the term name followed by its coefficient.
    slopes = [row[1] for row in coef_rows if row[0] != "Intercept"]
    for slope in slopes:
        assert slope < 50, "coefficient is too large"
def covtype(ip, port):
    """Fit binomial GLMs with three regularisation settings on covtype data.

    Connects to the cluster at ``ip``/``port``, turns column 54 into an
    indicator for one randomly chosen class (1..4) and fits L2, elastic-net
    and L1 models, showing each one. Nothing is asserted.
    """
    # Connect to h2o
    h2o.init(ip, port)

    frame = h2o.import_frame(
        path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    response_idx = 54
    predictor_idx = [col for col in range(0, 54) if col not in [20, 28]]

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    frame[54] = (frame[54] == res_class)

    # (alpha, lambda) pairs, fitted in this order:
    #   L2:      alpha = 0,   lambda = 0
    #   Elastic: alpha = 0.5, lambda = 1e-4
    #   L1:      alpha = 1,   lambda = 1e-4
    settings = ((0, 0), (0.5, 1e-4), (1, 1e-4))

    fitted = []  # keep every model referenced until the function returns
    for alpha, lam in settings:
        model = h2o.glm(y=frame[response_idx],
                        x=frame[predictor_idx],
                        family="binomial",
                        n_folds=0,
                        alpha=[alpha],
                        Lambda=[lam])
        model.show()
        fitted.append(model)
def link_functions_tweedie_vpow(ip, port):
    """Compare tweedie GLM deviances with R for several variance powers.

    For each variance power the link power is set to ``1 - vpow``. The
    residual/null deviance ratio may not be more than 0.01 below R's, and the
    null deviance must match R's to within 1e-6. ``ip`` and ``port`` are
    accepted but not used in this function.
    """
    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv"))
    response = "y"
    predictors = list(set(hdf.names) - set(["y"]))

    print("Testing for family: TWEEDIE")
    print("Create models with canonical link: TWEEDIE")

    # Variance powers with the matching reference values from R.
    variance_powers = [0, 1, 1.5]
    r_ratios = [0.7516627, 0.6708826, 0.7733762]
    r_null_devs = [221051.88369951, 32296.29783702, 20229.47425307]

    for vpow, r_ratio, r_null in zip(variance_powers, r_ratios, r_null_devs):
        print("Fit h2o.glm:")
        fit = h2o.glm(x=hdf[predictors],
                      y=hdf[response],
                      family="tweedie",
                      link="tweedie",
                      tweedie_variance_power=vpow,
                      tweedie_link_power=1 - vpow,
                      alpha=[0.5],
                      Lambda=[0])

        print("Testing Tweedie variance power: {0}".format(vpow))

        print("Compare model deviances for link function tweedie")
        h2o_ratio = fit.residual_deviance() / fit.null_deviance()
        assert r_ratio - h2o_ratio <= 0.01, (
            "h2o's residual/null deviance is more than 0.01 lower than "
            "R's. h2o: {0}, r: {1}".format(h2o_ratio, r_ratio))

        print("compare null and residual deviance between R glm and h2o.glm for tweedie")
        assert abs(r_null - fit.null_deviance()) < 1e-6, (
            "h2o's null deviance is not equal to R's. h2o: {0}, r: "
            "{1}".format(fit.null_deviance(), r_null))
def vi_toy_test(ip, port):
    """Check the variable-importance ranking of a DRF on the toy dataset.

    Connects to the cluster at ``ip``/``port``, trains a seeded random forest
    on columns 0..5 with column 6 as a categorical response and asserts the
    exact order of names in the variable-importance table.
    """
    # Connect to h2o
    h2o.init(ip, port)

    toy_data = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))
    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    rf = h2o.random_forest(x=toy_data[list(range(6))],
                           y=toy_data[6],
                           ntrees=500,
                           max_depth=20,
                           nbins=100,
                           seed=0)

    # First field of each importance row, one row per predictor.
    importances = rf._model_json['output']['variable_importances']
    ranking = [
        importances.cell_values[row][0]
        for row in range(toy_data.ncol() - 1)
    ]
    print(ranking)

    expected_ranking = ("V3", "V2", "V6", "V5", "V1", "V4")
    assert tuple(ranking) == expected_ranking, \
        "expected specific variable importance ranking"
def bigcatRF(ip, port):
    """Train a one-tree, depth-one DRF on a 100-level categorical predictor.

    Training set has 100 categories from cat001 to cat100:
      cat001, cat003, ... are perfect predictors of y = 1
      cat002, cat004, ... are perfect predictors of y = 0

    Connects to the cluster at ``ip``/``port``, fits the model and shows it.
    Nothing is asserted.
    """
    # Connect to h2o
    h2o.init(ip, port)

    frame = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    frame["y"] = frame["y"].asfactor()

    # H2O DRF (naive split): ntrees = 1, max_depth = 1, nbins = 100,
    # nbins_cats = 10
    single_split = h2o.random_forest(x=frame[["X"]],
                                     y=frame["y"],
                                     ntrees=1,
                                     max_depth=1,
                                     nbins=100,
                                     nbins_cats=10)
    single_split.show()
def parametersKmeans(ip, port):
    """Rebuild a k-means model from its reported parameters and compare.

    Connects to the cluster at ``ip``/``port``, fits k-means on iris, reads
    the label/actual_value pairs from the model JSON, refits with those as
    keyword arguments and asserts that within-cluster sums of squares and
    centers agree between the two models.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    # Getting data
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    # Create and duplicate
    first_fit = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    reported = first_fit._model_json['parameters']
    replay_kwargs = {entry['label']: entry['actual_value'] for entry in reported}
    second_fit = h2o.kmeans(x=iris[0:4], **replay_kwargs)

    # wss
    # NOTE(review): if withinss() returns a plain Python list, .sort() sorts
    # in place and returns None, which would make this comparison
    # None == None -- confirm the return type.
    wss = first_fit.withinss().sort()
    wss_again = second_fit.withinss().sort()
    assert wss == wss_again, "expected wss to be equal"

    # centers
    centers = first_fit.centers()
    centers_again = second_fit.centers()
    assert centers == centers_again, "expected centers to be the same"
def link_correct_default(ip, port):
    """Check that omitting ``link`` gives the same GLM as the explicit link.

    For the gaussian, binomial, poisson and gamma families a model is fitted
    once without ``link`` and once with the link named below, and their
    coefficient tables must be equal. ``ip`` and ``port`` are accepted but
    not used in this function.
    """
    def coef_rows(model):
        # Rows of the model's coefficients table.
        return model._model_json['output']['coefficients_table'].cell_values

    print("Reading in original prostate data.")
    h2o_data = h2o.upload_file(
        path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    print("Compare models with link unspecified and canonical link specified.")

    print("GAUSSIAN: ")
    default_fit = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian")
    explicit_fit = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian",
                           link="identity")
    assert coef_rows(explicit_fit) == coef_rows(default_fit), \
        "coefficient should be equal"

    print("BINOMIAL: ")
    default_fit = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial")
    explicit_fit = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial",
                           link="logit")
    assert coef_rows(explicit_fit) == coef_rows(default_fit), \
        "coefficient should be equal"

    print("POISSON: ")
    default_fit = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson")
    explicit_fit = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson",
                           link="log")
    assert coef_rows(explicit_fit) == coef_rows(default_fit), \
        "coefficient should be equal"

    print("GAMMA: ")
    default_fit = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma")
    explicit_fit = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma",
                           link="inverse")
    assert coef_rows(explicit_fit) == coef_rows(default_fit), \
        "coefficient should be equal"
def demo_chicago_crimes(ip, port):
    """Execute the chicago_crimes demo notebook on the cluster at ip/port."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # Locate the notebook, then run it (save_and_norun is off).
    notebook_path = h2o.locate("h2o-py/demos/chicago_crimes.ipynb")
    h2o.ipy_notebook_exec(notebook_path, save_and_norun=False)
def asfactor_basic(ip, port):
    """Check ``asfactor``/``isfactor`` on the cylinders column of cars.csv.

    The column must not be a factor right after import and must be one after
    the converted column is assigned back into the frame. Intermediate
    results are displayed along the way. ``ip`` and ``port`` are accepted but
    not used in this function.
    """
    # Printing out the head of the cars datasets
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars.csv"))
    cars.show()

    # Show the raw column, both inline and through a bound name.
    cars['cylinders'].show()
    raw_column = cars["cylinders"]
    raw_column.show()

    # Show the converted column, both inline and through a bound name.
    cars['cylinders'].asfactor().show()
    as_factor = cars['cylinders'].asfactor()
    as_factor.show()

    # Converting a copy must not have changed the column in the frame.
    factor_before = cars["cylinders"].isfactor()
    assert not factor_before, "expected the foo H2OVec to be a not factor"

    cars["cylinders"] = cars['cylinders'].asfactor()
    cars.show()

    factor_after = cars["cylinders"].isfactor()
    assert factor_after, "expected the bar H2OVec to be a factor"