def attack(train, x):
    """Run h2o.kmeans on train[x] with a randomly chosen parameter set.

    Randomly toggles the optional kmeans parameters (model_id,
    max_iterations, standardize, init/user_points, seed) and prints the
    chosen configuration before launching the model, so a failing run can
    be reproduced from the log.
    """
    kwargs = {}

    # k is always required; every other parameter is included with
    # probability 1/2.
    kwargs['k'] = random.randint(1, 20)
    if random.randint(0, 1):
        kwargs['model_id'] = "my_model"
    if random.randint(0, 1):
        kwargs['max_iterations'] = random.randint(1, 1000)
    if random.randint(0, 1):
        kwargs['standardize'] = [True, False][random.randint(0, 1)]
    if random.randint(0, 1):
        method = random.randint(0, 3)
        if method == 3:
            # Build k random starting points, each coordinate within
            # +/-100 of the corresponding column mean.
            s = []
            for p in range(kwargs['k']):
                s.append([random.uniform(train[c].mean() - 100,
                                         train[c].mean() + 100) for c in x])
            start = h2o.H2OFrame(python_obj=s)
            kwargs['user_points'] = start
        else:
            kwargs['init'] = ["Furthest", "Random", "PlusPlus"][method]
    if random.randint(0, 1):
        kwargs['seed'] = random.randint(1, 10000)

    # Display the parameters and their corresponding values.
    print("-----------------------")
    print("x: {0}".format(x))
    # BUGFIX: iterate items() directly rather than zip(keys(), values()).
    for k, v in kwargs.items():
        if k == 'user_points':
            print(k + ": ")
            start.show()
        else:
            print(k + ": {0}".format(v))

    h2o.kmeans(x=train[x], **kwargs)
    print("-----------------------")
def parametersKmeans(ip, port):
    """A kmeans model rebuilt from its own reported parameter values must
    reproduce the original model's within-SS and cluster centers."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #Log.info("Getting data...")
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    #Log.info("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)

    # Rebuild the model from the actual parameter values it reports.
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']
    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    #Log.info("wss")
    # BUGFIX: list.sort() returns None, so the old comparison was always
    # None == None and could never fail.  Compare sorted copies instead.
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    #Log.info("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def parametersKmeans(ip,port):
    """A kmeans model rebuilt from its own reported parameter values must
    reproduce the original model's within-SS and cluster centers."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    #Log.info("Getting data...")
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    #Log.info("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)

    # Rebuild the model from the actual parameter values it reports.
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']
    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    #Log.info("wss")
    # BUGFIX: list.sort() returns None, so the old comparison was always
    # None == None and could never fail.  Compare sorted copies instead.
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    #Log.info("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def convergeKmeans(ip, port):
    """Iterating kmeans one step at a time should land on the same centers
    as one multi-step run, and the final centers should have converged."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    ozone_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10

    # TODO: implement row slicing
    start = h2o.H2OFrame([[41, 190, 67, 7.4],
                          [36, 118, 72, 8],
                          [12, 149, 74, 12.6],
                          [18, 313, 62, 11.5],
                          [23, 299, 65, 8.6],
                          [19, 99, 59, 13.8],
                          [8, 19, 61, 20.1],
                          [16, 256, 69, 9.7],
                          [11, 290, 66, 9.2],
                          [14, 274, 68, 10.9]])
    start_key = start.send_frame()

    # Zero iterations is invalid and must raise.
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Run k-means in a loop of miters runs with max_iterations=1, feeding
    # each run's centers into the next run.
    centers_key = start_key
    for _ in range(miters):
        rep_fit = h2o.kmeans(x=ozone_h2o, k=ncent,
                             user_points=centers_key, max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())
        centers_key = centers.send_frame()

    # A single run with max_iterations=miters should give the same centers.
    all_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=start_key,
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

    # One extra single step from the final centers should barely move them.
    all_fit2 = h2o.kmeans(x=ozone_h2o, k=ncent,
                          user_points=h2o.H2OFrame(all_fit.centers()).send_frame(),
                          max_iterations=1)
    avg_change = sum([sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
                      for c1, c2 in zip(all_fit.centers(), all_fit2.centers())]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] < miters
def convergeKmeans(ip, port):
    """Stepwise kmeans (repeated max_iterations=1) should match one
    multi-step run, and the final centers should have converged."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    ozone_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10

    # Initial centers: the first 10 rows / 4 columns of the data set.
    start = ozone_h2o[0:10, 0:4]

    # Zero iterations is invalid and must raise.
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Feed each single-step run's centers into the next run.
    centers_key = start.eager()
    for _ in range(miters):
        rep_fit = h2o.kmeans(x=ozone_h2o, k=ncent,
                             user_points=centers_key, max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())
        centers_key = centers.send_frame()

    # A single run with max_iterations=miters should give the same centers.
    all_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=start.eager(),
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

    # One extra single step from the final centers should barely move them.
    all_fit2 = h2o.kmeans(x=ozone_h2o, k=ncent,
                          user_points=h2o.H2OFrame(all_fit.centers()).send_frame(),
                          max_iterations=1)
    avg_change = sum([sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
                      for c1, c2 in zip(all_fit.centers(), all_fit2.centers())]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] < miters
def hdfs_kmeans_converge():
    """On the internal HDFS cluster, check that kmeans on BigCross.data
    either converges or hits the iteration cap."""
    # Only meaningful when the Hadoop namenode is reachable.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_file(url)
    n = cross_h2o.nrow
    print("rows: {0}".format(n))

    ncent = 3
    miters = 10
    print("Run k-means with k = {0} and max_iterations = {1}".format(
        ncent, miters))
    cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, max_iterations=miters)
    print(cross1_km)

    print(
        "Run k-means with init = final cluster centers and max_iterations = 1"
    )
    init_centers = h2o.H2OFrame(cross1_km.centers())
    cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, user_points=init_centers,
                           max_iterations=1)
    print(cross2_km)

    print("Check k-means converged or maximum iterations reached")
    c1 = h2o.H2OFrame(cross1_km.centers())
    c2 = h2o.H2OFrame(cross2_km.centers())
    avg_change = old_div(((c1 - c2) ** 2).sum(), ncent)
    iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
    assert avg_change < 1e-6 or iters > miters, \
        "Expected k-means to converge or reach max iterations. avg_change = " \
        "{0} and iterations = {1}".format(avg_change, iters)
def iris_h2o_vs_sciKmeans(ip,port):
    """Compare H2O kmeans centers with scikit-learn's on iris, starting
    both from the same user-supplied points."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:,0:4]

    # One starting point per expected cluster.
    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    # Log.info("Cluster centers from scikit:")
    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    # BUGFIX: compare absolute differences; the old (hpoint - spoint) < 1e-10
    # check passed for any large *negative* difference.
    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def benignKmeans(ip, port):
    """Fit H2O and scikit-learn kmeans side by side on benign.csv for
    k = 1..6 and print both sets of centers for manual comparison."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    for i in range(1, 7):
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=i)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("sckit centers")
        print(benign_sci_km.cluster_centers_)
def pyunit_model_params():
    """Smoke-test that a fitted kmeans model exposes its parameter dicts."""
    pros = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = h2o.kmeans(pros, k=4)
    print(model.params)
    print(model.full_parameters)
def getModelKmeans(ip, port):
    """Fit kmeans for k = 2..6, round-trip each model through
    h2o.getModel, and fit scikit-learn KMeans on the same imputed data."""
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()

        #TODO: impement h2o.getModel()
        model = h2o.getModel(km_h2o._key)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def pyunit_model_params(ip, port):
    """Smoke-test that a fitted kmeans model exposes its parameter dicts."""
    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    model = h2o.kmeans(pros, k=4)
    print(model.params)
    print(model.full_parameters)
def iris_h2o_vs_sciKmeans():
    """Compare H2O kmeans centers with scikit-learn's on iris, starting
    both from the same user-supplied points."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321
    iris_h2o = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(tests.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # One starting point per expected cluster.
    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    # Log.info("Cluster centers from scikit:")
    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    # BUGFIX: compare absolute differences; the old (hpoint - spoint) < 1e-10
    # check passed for any large *negative* difference.
    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def hdfs_kmeans_airlines():
    """Run k-means++ on airlines_all.csv from the internal HDFS cluster.

    Raises EnvironmentError when the Hadoop namenode is unreachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"

        print("Import airlines_all.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print("rows: {0}".format(n))

        print("Run k-means++ with k = 7 and max_iterations = 10")
        myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                                 x=airlines_h2o[myX],
                                 k=7,
                                 init="Furthest",
                                 max_iterations=10,
                                 standardize=True)
        print(airlines_km)
    else:
        # BUGFIX: `raise (ExcClass, "msg")` raises a tuple, which is a
        # TypeError on Python 3; construct the exception explicitly.
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def hdfs_kmeans_airlines(ip, port):
    """Run k-means++ on airlines_all.csv from the internal HDFS cluster;
    skip (with a message) when HDFS is not reachable."""
    # Only meaningful inside the H2O network.
    running_inside_h2o = tests.is_running_internal_to_h2o()
    if not running_inside_h2o:
        print("Not running on H2O internal network. No access to HDFS.")
        return

    hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
    hdfs_file = "/datasets/airlines_all.csv"

    print("Import airlines_all.csv from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
    airlines_h2o = h2o.import_file(url)
    n = airlines_h2o.nrow
    print("rows: {0}".format(n))

    print("Run k-means++ with k = 7 and max_iterations = 10")
    myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
    airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                             x=airlines_h2o[myX],
                             k=7,
                             init="Furthest",
                             max_iterations=10,
                             standardize=True)
    print(airlines_km)
def getModelKmeans(ip,port):
    """Fit kmeans for k = 2..6, round-trip each model through
    h2o.getModel, and fit scikit-learn KMeans on the same imputed data."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()

        #TODO: impement h2o.getModel()
        model = h2o.getModel(km_h2o._key)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def convergeKmeans(ip,port):
    """Iterating kmeans one step at a time should land on the same centers
    as one multi-step run, and the final centers should have converged."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    # Log.info("Importing ozone.csv data...\n")
    ozone_h2o = h2o.import_frame(path=h2o.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10

    # TODO: implement row slicing
    start = h2o.H2OFrame([[41, 190, 67, 7.4],
                          [36, 118, 72, 8],
                          [12, 149, 74, 12.6],
                          [18, 313, 62, 11.5],
                          [23, 299, 65, 8.6],
                          [19, 99, 59, 13.8],
                          [8, 19, 61, 20.1],
                          [16, 256, 69, 9.7],
                          [11, 290, 66, 9.2],
                          [14, 274, 68, 10.9]])
    start_key = start.send_frame()

    # Zero iterations is invalid and must raise.
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Feed each single-step run's centers into the next run.
    centers_key = start_key
    for _ in range(miters):
        rep_fit = h2o.kmeans(x=ozone_h2o, k=ncent,
                             user_points=centers_key, max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())
        centers_key = centers.send_frame()

    # A single run with max_iterations=miters should give the same centers.
    all_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=start_key,
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

    # One extra single step from the final centers should barely move them.
    all_fit2 = h2o.kmeans(x=ozone_h2o, k=ncent,
                          user_points=h2o.H2OFrame(all_fit.centers()).send_frame(),
                          max_iterations=1)
    avg_change = sum([sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
                      for c1, c2 in zip(all_fit.centers(), all_fit2.centers())]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] < miters
def get_model_test(ip,port):
    """Round-trip gbm / kmeans / deeplearning models through h2o.get_model
    and check the model category and identical predictions."""
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # BUGFIX: the test split used r >= 0.30, which overlaps the training
    # split on [0.30, 0.70).  Use the complementary threshold.
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)
    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)
    # NOTE: loop variable renamed from `r` to avoid shadowing the split vector.
    for row in range(predictions1.nrow()):
        p1 = predictions1[row, 0]
        p2 = predictions2[row, 0]
        assert p1 == p2, "expected regression predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(row, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)
    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)
    for row in range(predictions1.nrow()):
        p1 = predictions1[row, 0]
        p2 = predictions2[row, 0]
        assert p1 == p2, "expected binomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(row, p1, p2)

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)
    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)
    for row in range(predictions1.nrow()):
        p1 = predictions1[row, 0]
        p2 = predictions2[row, 0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(row, p1, p2)
def hdfs_kmeans_converge():
    """On the internal HDFS cluster, check that kmeans on BigCross.data
    either converges or hits the iteration cap."""
    # Only meaningful inside the H2O network.
    running_inside_h2o = tests.is_running_internal_to_h2o()
    if not running_inside_h2o:
        print("Not running on H2O internal network. No access to HDFS.")
        return

    hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_file(url)
    n = cross_h2o.nrow
    print("rows: {0}".format(n))

    ncent = 3
    miters = 10
    print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
    cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, max_iterations=miters)
    print(cross1_km)

    print("Run k-means with init = final cluster centers and max_iterations = 1")
    init_centers = h2o.H2OFrame(cross1_km.centers())
    init_centers_key = init_centers.send_frame()
    cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, user_points=init_centers_key,
                           max_iterations=1)
    print(cross2_km)

    print("Check k-means converged or maximum iterations reached")
    c1 = h2o.H2OFrame(cross1_km.centers())
    c2 = h2o.H2OFrame(cross2_km.centers())
    avg_change = ((c1 - c2) ** 2).sum() / ncent
    iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
    assert avg_change < 1e-6 or iters > miters, \
        "Expected k-means to converge or reach max iterations. avg_change = " \
        "{0} and iterations = {1}".format(avg_change, iters)
def pyunit_model_params():
    """Smoke-test that a fitted kmeans model exposes its parameter dicts."""
    pros = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = h2o.kmeans(pros, k=4)
    print(model.params)
    print(model.full_parameters)
def km_num_iterations(ip,port):
    """kmeans must not exceed the requested max_iterations cap."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321
    frame = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))
    model = h2o.kmeans(x=frame[1:], k=3, max_iterations=4)
    iters = model.num_iterations()
    assert iters <= 4, "Expected 4 iterations, but got {0}".format(iters)
def get_model_test():
    """Round-trip gbm / kmeans / deeplearning models through h2o.get_model
    and check the model category and identical predictions."""
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)
    regression_gbm2 = h2o.get_model(regression_gbm1._id)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)
    for row in range(predictions1.nrow):
        p1 = predictions1[row, 0]
        p2 = predictions2[row, 0]
        assert p1 == p2, "expected regression predictions to be the same for row {}, but got {} and {}".format(row, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)
    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._id)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)
    for row in range(predictions1.nrow):
        p1 = predictions1[row, 0]
        p2 = predictions2[row, 0]
        assert p1 == p2, "expected binomial predictions to be the same for row {}, but got {} and {}".format(row, p1, p2)

    # Clustering
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._id)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)
    multinomial_dl2 = h2o.get_model(multinomial_dl1._id)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)
    for row in range(predictions1.nrow):
        p1 = predictions1[row, 0]
        p2 = predictions2[row, 0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(row, p1, p2)
def km_num_iterations():
    """kmeans must not exceed the requested max_iterations cap."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    model = h2o.kmeans(x=frame[1:], k=3, max_iterations=4)
    iters = model.num_iterations()
    assert iters <= 4, "Expected 4 iterations, but got {0}".format(iters)
def hdfs_kmeans():
    """Run kmeans on iris and covtype imported from the internal HDFS."""
    # Only meaningful when the Hadoop namenode is reachable.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_covtype_file = "/datasets/runit/covtype.data"

    print("Import iris_wheader.csv from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_file(url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

    print("Running KMeans on iris")
    iris_km = h2o.kmeans(training_frame=iris_h2o, k=3, x=iris_h2o[0:4],
                         max_iterations=10)
    print(iris_km)

    print("Importing covtype.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
    covtype_h2o = h2o.import_file(url)
    n = covtype_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

    print("Running KMeans on covtype")
    covtype_km = h2o.kmeans(training_frame=covtype_h2o, x=covtype_h2o[0:55],
                            k=8, max_iterations=10)
    print(covtype_km)
def hdfs_kmeans(ip, port):
    """Run kmeans on iris and covtype imported from the internal HDFS;
    skip (with a message) when HDFS is not reachable."""
    h2o.init(ip, port)

    # Only meaningful inside the H2O network.
    running_inside_h2o = h2o.is_running_internal_to_h2o()
    if not running_inside_h2o:
        print("Not running on H2O internal network. No access to HDFS.")
        return

    hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_covtype_file = "/datasets/runit/covtype.data"

    print("Import iris_wheader.csv from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_frame(url)
    n = iris_h2o.nrow()
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

    print("Running KMeans on iris")
    iris_km = h2o.kmeans(training_frame=iris_h2o, k=3, x=iris_h2o[0:4],
                         max_iterations=10)
    print(iris_km)

    print("Importing covtype.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
    covtype_h2o = h2o.import_frame(url)
    n = covtype_h2o.nrow()
    print("rows: {0}".format(n))
    assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

    print("Running KMeans on covtype")
    covtype_km = h2o.kmeans(training_frame=covtype_h2o, x=covtype_h2o[0:55],
                            k=8, max_iterations=10)
    print(covtype_km)
def km_num_iterations(ip,port):
    """kmeans run with a small max_iterations must stay under the bound."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321
    frame = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    model = h2o.kmeans(x=frame[1:], k=3, max_iterations=2)
    iters = model.num_iterations()
    #TODO: is there and off-by-one error here?
    assert iters <= 4, "Expected 4 iterations, but got {0}".format(iters)
def convergeKmeans(ip, port):
    """Stepwise kmeans (repeated max_iterations=1) should match one
    multi-step run, and the final centers should have converged."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321
    ozone_h2o = h2o.import_file(path=h2o.locate("smalldata/glm_test/ozone.csv"))
    # ozone_h2o.summary()

    miters = 5
    ncent = 10

    # Initial centers: the first 10 rows / 4 columns of the data set.
    start = ozone_h2o[0:10, 0:4]

    # Zero iterations is invalid and must raise.
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Feed each single-step run's centers into the next run.
    centers = start
    for _ in range(miters):
        rep_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=centers,
                             max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())

    # A single run with max_iterations=miters should give the same centers.
    all_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=start,
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

    # One extra single step from the final centers should barely move them.
    all_fit2 = h2o.kmeans(x=ozone_h2o, k=ncent,
                          user_points=h2o.H2OFrame(all_fit.centers()),
                          max_iterations=1)
    avg_change = sum([sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
                      for c1, c2 in zip(all_fit.centers(), all_fit2.centers())]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json["output"]["iterations"] == miters
def km_num_iterations(ip,port):
    """kmeans must not exceed the requested max_iterations cap."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321
    prostate_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    prostate_km_h2o = h2o.kmeans(x=prostate_h2o[1:], k=3, max_iterations=3)
    num_iterations = prostate_km_h2o.num_iterations()
    # BUGFIX: with max_iterations=3 the model can never report exactly 4
    # iterations, so the old `== 4` assertion contradicted the cap.  Assert
    # against the cap instead, matching the other km_num_iterations variants.
    assert num_iterations <= 3, "Expected at most 3 iterations, but got {0}".format(num_iterations)
def kmeans_mllib():
    """Benchmark H2O k-means++ on BigCross.data against stored MLlib
    results (requires access to the internal HDFS cluster).

    Raises EnvironmentError when the Hadoop namenode is unreachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # First column of the reference CSV is k, second is the MLlib WCSSE.
        err_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
            delimiter=",", skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o, k=k,
                                  init="PlusPlus", max_iterations=10,
                                  standardize=False)

            clust_mllib = np.genfromtxt(
                pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                    str(k) + ".csv"),
                delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = old_div(cross_km.tot_withinss(), n)
            # BUGFIX: the format strings had no placeholder, so the WCSSE
            # values were silently dropped from the output.
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        raise EnvironmentError
def hdfs_kmeans_converge(ip, port):
    """On the internal HDFS cluster, check that kmeans on BigCross.data
    either converges or hits the iteration cap."""
    h2o.init(ip, port)

    # Only meaningful inside the H2O network.
    running_inside_h2o = h2o.is_running_internal_to_h2o()
    if not running_inside_h2o:
        print("Not running on H2O internal network. No access to HDFS.")
        return

    hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_frame(url)
    n = cross_h2o.nrow()
    print("rows: {0}".format(n))

    ncent = 3
    miters = 10
    print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
    cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, max_iterations=miters)
    print(cross1_km)

    print("Run k-means with init = final cluster centers and max_iterations = 1")
    init_centers = h2o.H2OFrame(cross1_km.centers())
    init_centers_key = init_centers.send_frame()
    cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, user_points=init_centers_key,
                           max_iterations=1)
    print(cross2_km)

    print("Check k-means converged or maximum iterations reached")
    c1 = h2o.H2OFrame(cross1_km.centers())
    c2 = h2o.H2OFrame(cross2_km.centers())
    avg_change = ((c1 - c2) ** 2).sum() / ncent
    iters = cross1_km._model_json["output"]["model_summary"].cell_values[0][3]
    assert avg_change < 1e-6 or iters > miters, (
        "Expected k-means to converge or reach max iterations. avg_change = "
        "{0} and iterations = {1}".format(avg_change, iters)
    )
def hdfs_kmeans_converge():
    """On the internal HDFS cluster, check that kmeans on BigCross.data
    either converges or hits the iteration cap."""
    # Only meaningful when the Hadoop namenode is reachable.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_file(url)
    n = cross_h2o.nrow
    print("rows: {0}".format(n))

    ncent = 3
    miters = 10
    print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
    cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, max_iterations=miters)
    print(cross1_km)

    print("Run k-means with init = final cluster centers and max_iterations = 1")
    init_centers = h2o.H2OFrame(cross1_km.centers())
    cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57],
                           k=ncent, user_points=init_centers,
                           max_iterations=1)
    print(cross2_km)

    print("Check k-means converged or maximum iterations reached")
    c1 = h2o.H2OFrame(cross1_km.centers())
    c2 = h2o.H2OFrame(cross2_km.centers())
    avg_change = old_div(((c1 - c2) ** 2).sum(), ncent)
    iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
    assert avg_change < 1e-6 or iters > miters, \
        "Expected k-means to converge or reach max iterations. avg_change = " \
        "{0} and iterations = {1}".format(avg_change, iters)
def parametersKmeans(ip,port):
    """A kmeans model rebuilt from its own reported parameter values must
    reproduce the original model's within-SS and cluster centers."""
    print("Getting data...")
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)

    # Rebuild the model from the actual parameter values it reports.
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']
    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    print("wss")
    # BUGFIX: list.sort() returns None, so the old comparison was always
    # None == None and could never fail.  Compare sorted copies instead.
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def kmeans_mllib():
    """Benchmark H2O k-means++ on BigCross.data against stored MLlib
    results (requires access to the internal HDFS cluster).

    Raises EnvironmentError when the Hadoop namenode is unreachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # First column of the reference CSV is k, second is the MLlib WCSSE.
        err_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
            delimiter=",", skip_header=1
        )
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(
                training_frame=cross_h2o, x=cross_h2o, k=k, init="PlusPlus",
                max_iterations=10, standardize=False
            )

            clust_mllib = np.genfromtxt(
                pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                    str(k) + ".csv"),
                delimiter=","
            ).tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            # BUGFIX: the format strings had no placeholder, so the WCSSE
            # values were silently dropped from the output.
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            assert wcsse_h2o == wcsse_mllib, (
                "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O "
                "got {1}".format(wcsse_mllib, wcsse_h2o)
            )
    else:
        # BUGFIX: `raise (ExcClass, "msg")` raises a tuple, which is a
        # TypeError on Python 3; construct the exception explicitly.
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def parametersKmeans(): print "Getting data..." iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv")) print "Create and and duplicate..." iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234) parameters = iris_km._model_json["parameters"] param_dict = {} for p in range(len(parameters)): param_dict[parameters[p]["label"]] = parameters[p]["actual_value"] iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict) print "wss" wss = iris_km.withinss().sort() wss_again = iris_km_again.withinss().sort() assert wss == wss_again, "expected wss to be equal" print "centers" centers = iris_km.centers() centers_again = iris_km_again.centers() assert centers == centers_again, "expected centers to be the same"
def emptyclusKmeans(ip, port): # Connect to a pre-existing cluster h2o.init(ip, port) # connect to localhost:54321 #Log.info("Importing ozone.csv data...\n") ozone_sci = np.loadtxt(h2o.locate("smalldata/glm_test/ozone.csv"), delimiter=',', skiprows=1) ozone_h2o = h2o.import_frame( path=h2o.locate("smalldata/glm_test/ozone.csv")) ncent = 10 nempty = random.randint(1, ncent / 2) #TODO: implement row slicing initial_centers = [[41, 190, 67, 7.4], [36, 118, 72, 8], [12, 149, 74, 12.6], [18, 313, 62, 11.5], [23, 299, 65, 8.6], [19, 99, 59, 13.8], [8, 19, 61, 20.1], [16, 256, 69, 9.7], [11, 290, 66, 9.2], [14, 274, 68, 10.9]] for i in random.sample(range(0, ncent - 1), nempty): initial_centers[i] = [ 100 * i for z in range(1, len(initial_centers[0]) + 1) ] initial_centers_h2o = h2o.H2OFrame(initial_centers) initial_centers_h2o_key = initial_centers_h2o.send_frame() initial_centers_sci = np.asarray(initial_centers) #Log.info("Initial cluster centers:") print "H2O initial centers:" initial_centers_h2o.show() print "scikit initial centers:" print initial_centers_sci # H2O can handle empty clusters and so can scikit #Log.info("Check that H2O can handle badly initialized centers") km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1) km_sci.fit(preprocessing.scale(ozone_sci)) print "scikit final centers" print km_sci.cluster_centers_ km_h2o = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=initial_centers_h2o_key, standardize=True) print "H2O final centers" print km_h2o.centers()
def hdfs_kmeans():
    """Run KMeans on the iris and covtype datasets imported from HDFS.

    Raises EnvironmentError when the HDFS namenode is not reachable
    (i.e. when not running inside the H2O internal network).
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    # --- iris ---
    print("Import iris_wheader.csv from HDFS")
    iris_url = "hdfs://{0}{1}".format(name_node, "/datasets/runit/iris_wheader.csv")
    iris_h2o = h2o.import_file(iris_url)
    rows = iris_h2o.nrow
    print("rows: {0}".format(rows))
    assert rows == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(rows, 150)

    print("Running KMeans on iris")
    iris_km = h2o.kmeans(training_frame=iris_h2o, k=3, x=iris_h2o[0:4], max_iterations=10)
    print(iris_km)

    # --- covtype ---
    print("Importing covtype.data from HDFS")
    covtype_url = "hdfs://{0}{1}".format(name_node, "/datasets/runit/covtype.data")
    covtype_h2o = h2o.import_file(covtype_url)
    rows = covtype_h2o.nrow
    print("rows: {0}".format(rows))
    assert rows == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(rows, 581012)

    print("Running KMeans on covtype")
    covtype_km = h2o.kmeans(training_frame=covtype_h2o, x=covtype_h2o[0:55], k=8, max_iterations=10)
    print(covtype_km)
def hdfs_kmeans(ip, port): # Check if we are running inside the H2O network by seeing if we can touch # the namenode. running_inside_h2o = h2o.is_running_internal_to_h2o() if running_inside_h2o: hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node() hdfs_iris_file = "/datasets/runit/iris_wheader.csv" hdfs_covtype_file = "/datasets/runit/covtype.data" print "Import iris_wheader.csv from HDFS" url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file) iris_h2o = h2o.import_file(url) n = iris_h2o.nrow print "rows: {0}".format(n) assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150) print "Running KMeans on iris" iris_km = h2o.kmeans(training_frame = iris_h2o, k = 3, x = iris_h2o[0:4], max_iterations = 10) print iris_km print "Importing covtype.data from HDFS" url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file) covtype_h2o = h2o.import_file(url) n = covtype_h2o.nrow print "rows: {0}".format(n) assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012) print "Running KMeans on covtype" covtype_km = h2o.kmeans(training_frame = covtype_h2o, x = covtype_h2o[0:55], k = 8, max_iterations = 10) print covtype_km else: print "Not running on H2O internal network. No access to HDFS."
def ozoneKM():
    """Fit a k=10 PlusPlus-initialized KMeans model on the ozone data and
    score it back against the training frame."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))

    # See that the data is ready
    print(train.describe())

    # Run KMeans
    km_model = h2o.kmeans(x=train, k=10, init="PlusPlus", max_iterations=100)
    km_model.show()
    km_model.summary()

    # Score on the training data and describe the predictions.
    predictions = km_model.predict(train)
    predictions.describe()
def ozoneKM(ip, port): # Connect to a pre-existing cluster # connect to localhost:54321 train = h2o.import_file(path=h2o.locate("smalldata/glm_test/ozone.csv")) # See that the data is ready print train.describe() # Run KMeans my_km = h2o.kmeans(x=train, k=10, init="PlusPlus", max_iterations=100) my_km.show() my_km.summary() my_pred = my_km.predict(train) my_pred.describe()
def emptyclusKmeans(): # Connect to a pre-existing cluster # connect to localhost:54321 #Log.info("Importing ozone.csv data...\n") ozone_sci = np.loadtxt(pyunit_utils.locate("smalldata/glm_test/ozone.csv"), delimiter=',', skiprows=1) ozone_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv")) ncent = 10 nempty = random.randint(1,ncent/2) initial_centers = [[41,190,67,7.4], [36,118,72,8], [12,149,74,12.6], [18,313,62,11.5], [23,299,65,8.6], [19,99,59,13.8], [8,19,61,20.1], [16,256,69,9.7], [11,290,66,9.2], [14,274,68,10.9]] for i in random.sample(range(0,ncent-1), nempty): initial_centers[i] = [100*i for z in range(1,len(initial_centers[0])+1)] initial_centers_sci = np.asarray(initial_centers) initial_centers = zip(*initial_centers) initial_centers_h2o = h2o.H2OFrame(initial_centers) #Log.info("Initial cluster centers:") print "H2O initial centers:" initial_centers_h2o.show() print "scikit initial centers:" print initial_centers_sci # H2O can handle empty clusters and so can scikit #Log.info("Check that H2O can handle badly initialized centers") km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1) km_sci.fit(preprocessing.scale(ozone_sci)) print "scikit final centers" print km_sci.cluster_centers_ km_h2o = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=initial_centers_h2o, standardize=True) print "H2O final centers" print km_h2o.centers()
def prostateKmeans(): # Connect to a pre-existing cluster # connect to localhost:54321 #Log.info("Importing prostate.csv data...\n") prostate_h2o = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv")) #prostate.summary() prostate_sci = np.loadtxt(tests.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1) prostate_sci = prostate_sci[:,1:] for i in range(5,9): #Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = "")) #Log.info(paste( "Using these columns: ", colnames(prostate.hex)[-1]) ) prostate_km_h2o = h2o.kmeans(x=prostate_h2o[1:], k=i) prostate_km_h2o.show() prostate_km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1) prostate_km_sci.fit(prostate_sci) print prostate_km_sci.cluster_centers_
def prostateKmeans(): # Connect to a pre-existing cluster # connect to localhost:54321 #Log.info("Importing prostate.csv data...\n") prostate_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv")) #prostate.summary() prostate_sci = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1) prostate_sci = prostate_sci[:,1:] for i in range(5,9): #Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = "")) #Log.info(paste( "Using these columns: ", colnames(prostate.hex)[-1]) ) prostate_km_h2o = h2o.kmeans(x=prostate_h2o[1:], k=i) prostate_km_h2o.show() prostate_km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1) prostate_km_sci.fit(prostate_sci) print prostate_km_sci.cluster_centers_
def benignKmeans():
    """Compare H2O and scikit-learn k-means cluster centers on the benign
    data for k = 1..6 (scikit input is mean-imputed first)."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imputer.fit_transform(benign_sci)

    for num_clusters in range(1, 7):
        km_h2o = h2o.kmeans(x=benign_h2o, k=num_clusters)
        print("H2O centers")
        print(km_h2o.centers())

        km_sci = KMeans(n_clusters=num_clusters, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def get_modelKmeans():
    """Fit k-means models on the benign data for k = 2..6, re-fetch each
    model by id via h2o.get_model, and fit matching scikit-learn models."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imputer.fit_transform(benign_sci)

    for num_clusters in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=num_clusters)
        km_h2o.show()
        # Round-trip: look the model up again by its id.
        fetched = h2o.get_model(km_h2o._id)
        fetched.show()

        km_sci = KMeans(n_clusters=num_clusters, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def hdfs_kmeans_airlines(): # Check if we are running inside the H2O network by seeing if we can touch # the namenode. running_inside_h2o = tests.is_running_internal_to_h2o() if running_inside_h2o: hdfs_name_node = tests.get_h2o_internal_hdfs_name_node() hdfs_file = "/datasets/airlines_all.csv" print "Import airlines_all.csv from HDFS" url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file) airlines_h2o = h2o.import_file(url) n = airlines_h2o.nrow print "rows: {0}".format(n) print "Run k-means++ with k = 7 and max_iterations = 10" myX = range(8) + range(11,16) + range(18,21) + range(24,29) + [9] airlines_km = h2o.kmeans(training_frame = airlines_h2o, x = airlines_h2o[myX], k = 7, init = "Furthest", max_iterations = 10, standardize = True) print airlines_km else: print "Not running on H2O internal network. No access to HDFS."
def hdfs_kmeans_airlines():
    """Run k-means++ (k = 7, Furthest init) on airlines_all.csv from HDFS.

    Raises EnvironmentError when the HDFS namenode is not reachable
    (i.e. when not running inside the H2O internal network).
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    print("Import airlines_all.csv from HDFS")
    url = "hdfs://{0}{1}".format(name_node, "/datasets/airlines_all.csv")
    airlines_h2o = h2o.import_file(url)
    print("rows: {0}".format(airlines_h2o.nrow))

    print("Run k-means++ with k = 7 and max_iterations = 10")
    # Column indices used as features.
    cols = list(range(8)) + list(range(11, 16)) + list(range(18, 21)) + list(range(24, 29)) + [9]
    airlines_km = h2o.kmeans(training_frame=airlines_h2o, x=airlines_h2o[cols], k=7,
                             init="Furthest", max_iterations=10, standardize=True)
    print(airlines_km)
def metric_json_check(ip, port):
    """Smoke-test the metric json returned by model_performance() for each
    model category (regression, binomial, multinomial, clustering), asserting
    that no unexpected keys appear.

    NOTE(review): the check is one-directional -- set(have) - set(desired)
    only flags extra keys; keys missing from the metric json go undetected.
    """
    df = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = h2o.gbm(y=df["CAPSULE"], x=df[3:], training_frame=df, distribution="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = reg_met._metric_json.keys()
    reg_metric_json_keys_desired = [u'model_category', u'description', u'r2', u'frame', u'model_checksum', u'MSE',
                                    u'__meta', u'scoring_time', u'predictions', u'model', u'duration_in_ms',
                                    u'frame_checksum', u'mean_residual_deviance']
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) regression " \
                                "metric json. The difference is {2}".format(reg_metric_json_keys_have,
                                                                            reg_metric_json_keys_desired,
                                                                            reg_metric_diff)

    # Regression metric json (GLM)
    reg_mod = h2o.glm(y=df["CAPSULE"], x=df[3:], training_frame=df, family="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = reg_met._metric_json.keys()
    reg_metric_json_keys_desired = [u'model_category', u'description', u'r2', u'residual_degrees_of_freedom',
                                    u'frame', u'model_checksum', u'MSE', u'__meta', u'null_deviance',
                                    u'scoring_time', u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
                                    u'duration_in_ms', u'frame_checksum', u'residual_deviance',
                                    u'mean_residual_deviance']
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-regression " \
                                "metric json. The difference is {2}".format(reg_metric_json_keys_have,
                                                                            reg_metric_json_keys_desired,
                                                                            reg_metric_diff)

    # Binomial metric json
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, distribution="bernoulli")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = bin_met._metric_json.keys()
    bin_metric_json_keys_desired = [u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame',
                                    u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
                                    u'thresholds_and_metric_scores', u'predictions',
                                    u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
                                    u'frame_checksum', u'domain']
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) binomial " \
                                "metric json. The difference is {2}".format(bin_metric_json_keys_have,
                                                                            bin_metric_json_keys_desired,
                                                                            bin_metric_diff)

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, family="binomial")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = bin_met._metric_json.keys()
    bin_metric_json_keys_desired = [u'frame', u'residual_deviance', u'max_criteria_and_metric_scores', u'MSE',
                                    u'frame_checksum', u'AIC', u'logloss', u'Gini', u'predictions', u'AUC',
                                    u'description', u'model_checksum', u'duration_in_ms', u'model_category', u'r2',
                                    u'residual_degrees_of_freedom', u'__meta', u'null_deviance', u'scoring_time',
                                    u'null_degrees_of_freedom', u'model', u'thresholds_and_metric_scores',
                                    u'domain']
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-binomial " \
                                "metric json. The difference is {2}".format(bin_metric_json_keys_have,
                                                                            bin_metric_json_keys_desired,
                                                                            bin_metric_diff)

    # Multinomial metric json
    # Predict fYear from the airlines data (multi-class response).
    df = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX], y=df[myY], training_frame=df, distribution="multinomial")
    mul_met = mul_mod.model_performance()
    mul_metric_json_keys_have = mul_met._metric_json.keys()
    mul_metric_json_keys_desired = [u'cm', u'model_category', u'description', u'r2', u'frame', u'model_checksum',
                                    u'MSE', u'__meta', u'logloss', u'scoring_time', u'predictions',
                                    u'hit_ratio_table', u'model', u'duration_in_ms', u'frame_checksum']
    mul_metric_diff = list(set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
                                "metric json. The difference is {2}".format(mul_metric_json_keys_have,
                                                                            mul_metric_json_keys_desired,
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = clus_met._metric_json.keys()
    clus_metric_json_keys_desired = [u'tot_withinss', u'model_category', u'description', u'frame',
                                     u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
                                     u'predictions', u'totss', u'model', u'duration_in_ms', u'frame_checksum',
                                     u'centroid_stats']
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                 "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                             clus_metric_json_keys_desired,
                                                                             clus_metric_diff)
def metric_json_check(ip, port):
    """Smoke-test the metric json returned by model_performance() for each
    model category (regression, binomial, multinomial, clustering), asserting
    that no unexpected keys appear.

    NOTE(review): the check is one-directional -- set(have) - set(desired)
    only flags extra keys; keys missing from the metric json go undetected.
    """
    h2o.init(ip, port)
    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = h2o.gbm(y=df["CAPSULE"], x=df[3:], training_frame=df, distribution="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = reg_met._metric_json.keys()
    reg_metric_json_keys_desired = [
        u"model_category", u"description", u"r2", u"frame", u"model_checksum", u"MSE",
        u"__meta", u"scoring_time", u"predictions", u"model", u"duration_in_ms",
        u"frame_checksum",
    ]
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, (
        "There's a difference between the current ({0}) and the desired ({1}) regression "
        "metric json. The difference is {2}".format(
            reg_metric_json_keys_have, reg_metric_json_keys_desired, reg_metric_diff
        )
    )

    # Regression metric json (GLM)
    reg_mod = h2o.glm(y=df["CAPSULE"], x=df[3:], training_frame=df, family="gaussian")
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = reg_met._metric_json.keys()
    reg_metric_json_keys_desired = [
        u"model_category", u"description", u"r2", u"residual_degrees_of_freedom", u"frame",
        u"model_checksum", u"MSE", u"__meta", u"null_deviance", u"scoring_time",
        u"null_degrees_of_freedom", u"predictions", u"AIC", u"model", u"duration_in_ms",
        u"frame_checksum", u"residual_deviance",
    ]
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, (
        "There's a difference between the current ({0}) and the desired ({1}) glm-regression "
        "metric json. The difference is {2}".format(
            reg_metric_json_keys_have, reg_metric_json_keys_desired, reg_metric_diff
        )
    )

    # Binomial metric json
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, distribution="bernoulli")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = bin_met._metric_json.keys()
    bin_metric_json_keys_desired = [
        u"AUC", u"Gini", u"model_category", u"description", u"r2", u"frame",
        u"model_checksum", u"MSE", u"__meta", u"logloss", u"scoring_time",
        u"thresholds_and_metric_scores", u"predictions", u"max_criteria_and_metric_scores",
        u"model", u"duration_in_ms", u"frame_checksum", u"domain",
    ]
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, (
        "There's a difference between the current ({0}) and the desired ({1}) binomial "
        "metric json. The difference is {2}".format(
            bin_metric_json_keys_have, bin_metric_json_keys_desired, bin_metric_diff
        )
    )

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, family="binomial")
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = bin_met._metric_json.keys()
    bin_metric_json_keys_desired = [
        u"frame", u"residual_deviance", u"max_criteria_and_metric_scores", u"MSE",
        u"frame_checksum", u"AIC", u"logloss", u"Gini", u"predictions", u"AUC",
        u"description", u"model_checksum", u"duration_in_ms", u"model_category", u"r2",
        u"residual_degrees_of_freedom", u"__meta", u"null_deviance", u"scoring_time",
        u"null_degrees_of_freedom", u"model", u"thresholds_and_metric_scores", u"domain",
    ]
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, (
        "There's a difference between the current ({0}) and the desired ({1}) glm-binomial "
        "metric json. The difference is {2}".format(
            bin_metric_json_keys_have, bin_metric_json_keys_desired, bin_metric_diff
        )
    )

    # Multinomial metric json
    # Predict fYear from the airlines data (multi-class response).
    df = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX], y=df[myY], training_frame=df, distribution="multinomial")
    mul_met = mul_mod.model_performance()
    mul_metric_json_keys_have = mul_met._metric_json.keys()
    mul_metric_json_keys_desired = [
        u"cm", u"model_category", u"description", u"r2", u"frame", u"model_checksum",
        u"MSE", u"__meta", u"logloss", u"scoring_time", u"predictions", u"hit_ratio_table",
        u"model", u"duration_in_ms", u"frame_checksum",
    ]
    mul_metric_diff = list(set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, (
        "There's a difference between the current ({0}) and the desired ({1}) multinomial "
        "metric json. The difference is {2}".format(
            mul_metric_json_keys_have, mul_metric_json_keys_desired, mul_metric_diff
        )
    )

    # Clustering metric json
    df = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = clus_met._metric_json.keys()
    clus_metric_json_keys_desired = [
        u"tot_withinss", u"model_category", u"description", u"frame", u"model_checksum",
        u"MSE", u"__meta", u"scoring_time", u"betweenss", u"predictions", u"totss",
        u"model", u"duration_in_ms", u"frame_checksum", u"centroid_stats",
    ]
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, (
        "There's a difference between the current ({0}) and the desired ({1}) clustering "
        "metric json. The difference is {2}".format(
            clus_metric_json_keys_have, clus_metric_json_keys_desired, clus_metric_diff
        )
    )
def metric_accessors(ip, port): cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] # regression response_col = "economy" distribution = "gaussian" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in mse.keys() and "valid" in mse.keys( ), "expected training and validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in mse.keys() and "xval" in mse.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in mse.keys() and "valid" in mse.keys( ) and "xval" in mse.keys( ), 
"expected training, validation, and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ) and isinstance( mse["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in mse.keys() and "xval" in mse.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["valid"], float) and isinstance( mse["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["valid"]), type(mse["xval"])) # r2 r21 = gbm.r2(train=True, valid=False, xval=False) assert isinstance(r21, float) r22 = gbm.r2(train=False, valid=True, xval=False) assert isinstance(r22, float) r23 = gbm.r2(train=False, valid=False, xval=True) assert isinstance(r23, float) r2 = gbm.r2(train=True, valid=True, xval=False) assert "train" in r2.keys() and "valid" in r2.keys( ), "expected training and validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["train"], float) and isinstance( r2["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(r2["train"]), type(r2["valid"])) assert r2["valid"] == r22 r2 = gbm.r2(train=True, 
valid=False, xval=True) assert "train" in r2.keys() and "xval" in r2.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["train"], float) and isinstance( r2["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(r2["train"]), type(r2["xval"])) assert r2["xval"] == r23 r2 = gbm.r2(train=True, valid=True, xval=True) assert "train" in r2.keys() and "valid" in r2.keys() and "xval" in r2.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["train"], float) and isinstance( r2["valid"], float ) and isinstance( r2["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(r2["train"]), type(r2["valid"]), type(r2["xval"])) r2 = gbm.r2(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(r2, float) assert r2 == r21 r2 = gbm.r2(train=False, valid=True, xval=True) assert "valid" in r2.keys() and "xval" in r2.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert len( r2 ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( r2.keys()) assert isinstance(r2["valid"], float) and isinstance( r2["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(r2["valid"]), type(r2["xval"])) # mean_residual_deviance mean_residual_deviance1 = gbm.mean_residual_deviance(train=True, valid=False, xval=False) assert isinstance(mean_residual_deviance1, float) 
mean_residual_deviance2 = gbm.mean_residual_deviance(train=False, valid=True, xval=False) assert isinstance(mean_residual_deviance2, float) mean_residual_deviance3 = gbm.mean_residual_deviance(train=False, valid=False, xval=True) assert isinstance(mean_residual_deviance3, float) mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=False) assert "train" in mean_residual_deviance.keys( ) and "valid" in mean_residual_deviance.keys( ), "expected training and validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"])) assert mean_residual_deviance["valid"] == mean_residual_deviance2 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=False, xval=True) assert "train" in mean_residual_deviance.keys( ) and "xval" in mean_residual_deviance.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["xval"])) assert mean_residual_deviance["xval"] == mean_residual_deviance3 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=True) assert "train" in 
mean_residual_deviance.keys( ) and "valid" in mean_residual_deviance.keys( ) and "xval" in mean_residual_deviance.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance( mean_residual_deviance["valid"], float ) and isinstance( mean_residual_deviance["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) mean_residual_deviance = gbm.mean_residual_deviance( train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mean_residual_deviance, float) assert mean_residual_deviance == mean_residual_deviance1 mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=True, xval=True) assert "valid" in mean_residual_deviance.keys( ) and "xval" in mean_residual_deviance.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert len( mean_residual_deviance ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["valid"], float) and isinstance( mean_residual_deviance["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) # binomial cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] 
response_col = "economy_20mpg" distribution = "bernoulli" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # auc auc1 = gbm.auc(train=True, valid=False, xval=False) assert isinstance(auc1, float) auc2 = gbm.auc(train=False, valid=True, xval=False) assert isinstance(auc2, float) auc3 = gbm.auc(train=False, valid=False, xval=True) assert isinstance(auc3, float) auc = gbm.auc(train=True, valid=True, xval=False) assert "train" in auc.keys() and "valid" in auc.keys( ), "expected training and validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["train"], float) and isinstance( auc["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(auc["train"]), type(auc["valid"])) assert auc["valid"] == auc2 auc = gbm.auc(train=True, valid=False, xval=True) assert "train" in auc.keys() and "xval" in auc.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["train"], float) and isinstance( auc["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(auc["train"]), type(auc["xval"])) assert auc["xval"] == auc3 auc = gbm.auc(train=True, valid=True, xval=True) assert "train" in auc.keys() and "valid" in auc.keys( ) and "xval" in auc.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 3, "expected training, validation and cross 
validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["train"], float) and isinstance( auc["valid"], float ) and isinstance( auc["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(auc["train"]), type(auc["valid"]), type(auc["xval"])) auc = gbm.auc(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(auc, float) assert auc == auc1 auc = gbm.auc(train=False, valid=True, xval=True) assert "valid" in auc.keys() and "xval" in auc.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert len( auc ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( auc.keys()) assert isinstance(auc["valid"], float) and isinstance( auc["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(auc["valid"]), type(auc["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in logloss.keys() and "valid" in logloss.keys( ), "expected training and validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in 
logloss.keys() and "xval" in logloss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in logloss.keys() and "valid" in logloss.keys( ) and "xval" in logloss.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ) and isinstance( logloss["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in logloss.keys() and "xval" in logloss.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["valid"], float) and isinstance( logloss["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["valid"]), type(logloss["xval"])) # giniCoef 
giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False) assert isinstance(giniCoef1, float) giniCoef2 = gbm.giniCoef(train=False, valid=True, xval=False) assert isinstance(giniCoef2, float) giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True) assert isinstance(giniCoef3, float) giniCoef = gbm.giniCoef(train=True, valid=True, xval=False) assert "train" in giniCoef.keys() and "valid" in giniCoef.keys( ), "expected training and validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["train"]), type(giniCoef["valid"])) assert giniCoef["valid"] == giniCoef2 giniCoef = gbm.giniCoef(train=True, valid=False, xval=True) assert "train" in giniCoef.keys() and "xval" in giniCoef.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance( giniCoef["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["train"]), type(giniCoef["xval"])) assert giniCoef["xval"] == giniCoef3 giniCoef = gbm.giniCoef(train=True, valid=True, xval=True) assert "train" in giniCoef.keys() and "valid" in giniCoef.keys( ) and "xval" in giniCoef.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["train"], float) and 
isinstance( giniCoef["valid"], float ) and isinstance( giniCoef["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(giniCoef["train"]), type(giniCoef["valid"]), type(giniCoef["xval"])) giniCoef = gbm.giniCoef(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(giniCoef, float) assert giniCoef == giniCoef1 giniCoef = gbm.giniCoef(train=False, valid=True, xval=True) assert "valid" in giniCoef.keys() and "xval" in giniCoef.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert len( giniCoef ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( giniCoef.keys()) assert isinstance(giniCoef["valid"], float) and isinstance( giniCoef["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(giniCoef["valid"]), type(giniCoef["xval"])) # F1 F11 = gbm.F1(train=True, valid=False, xval=False) F12 = gbm.F1(train=False, valid=True, xval=False) F13 = gbm.F1(train=False, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=False) F1 = gbm.F1(train=True, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=True) F1 = gbm.F1(train=False, valid=False, xval=False) # default: return training metrics F1 = gbm.F1(train=False, valid=True, xval=True) # F0point5 F0point51 = gbm.F0point5(train=True, valid=False, xval=False) F0point52 = gbm.F0point5(train=False, valid=True, xval=False) F0point53 = gbm.F0point5(train=False, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=False) F0point5 = gbm.F0point5(train=True, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=True) F0point5 = gbm.F0point5(train=False, valid=False, xval=False) # default: return training metrics F0point5 = gbm.F0point5(train=False, valid=True, xval=True) # F2 F21 = 
gbm.F2(train=True, valid=False, xval=False) F22 = gbm.F2(train=False, valid=True, xval=False) F23 = gbm.F2(train=False, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=False) F2 = gbm.F2(train=True, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=True) F2 = gbm.F2(train=False, valid=False, xval=False) # default: return training metrics F2 = gbm.F2(train=False, valid=True, xval=True) # accuracy accuracy1 = gbm.accuracy(train=True, valid=False, xval=False) accuracy2 = gbm.accuracy(train=False, valid=True, xval=False) accuracy3 = gbm.accuracy(train=False, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=False) accuracy = gbm.accuracy(train=True, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=True) accuracy = gbm.accuracy(train=False, valid=False, xval=False) # default: return training metrics accuracy = gbm.accuracy(train=False, valid=True, xval=True) # error error1 = gbm.error(train=True, valid=False, xval=False) error2 = gbm.error(train=False, valid=True, xval=False) error3 = gbm.error(train=False, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=False) error = gbm.error(train=True, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=True) error = gbm.error(train=False, valid=False, xval=False) # default: return training metrics error = gbm.error(train=False, valid=True, xval=True) # precision precision1 = gbm.precision(train=True, valid=False, xval=False) precision2 = gbm.precision(train=False, valid=True, xval=False) precision3 = gbm.precision(train=False, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=False) precision = gbm.precision(train=True, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=True) precision = gbm.precision(train=False, valid=False, xval=False) # default: return training metrics precision = gbm.precision(train=False, valid=True, xval=True) # mcc mcc1 = 
gbm.mcc(train=True, valid=False, xval=False) mcc2 = gbm.mcc(train=False, valid=True, xval=False) mcc3 = gbm.mcc(train=False, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=False) mcc = gbm.mcc(train=True, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=True) mcc = gbm.mcc(train=False, valid=False, xval=False) # default: return training metrics mcc = gbm.mcc(train=False, valid=True, xval=True) # max_per_class_error max_per_class_error1 = gbm.max_per_class_error(train=True, valid=False, xval=False) max_per_class_error2 = gbm.max_per_class_error(train=False, valid=True, xval=False) max_per_class_error3 = gbm.max_per_class_error(train=False, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=False) max_per_class_error = gbm.max_per_class_error(train=True, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=True) max_per_class_error = gbm.max_per_class_error( train=False, valid=False, xval=False) # default: return training metrics max_per_class_error = gbm.max_per_class_error(train=False, valid=True, xval=True) # confusion_matrix confusion_matrix1 = gbm.confusion_matrix(train=True, valid=False, xval=False) confusion_matrix2 = gbm.confusion_matrix(train=False, valid=True, xval=False) confusion_matrix3 = gbm.confusion_matrix(train=False, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False) confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True) confusion_matrix = gbm.confusion_matrix( train=False, valid=False, xval=False) # default: return training metrics confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True) # # plot # plot1 = gbm.plot(train=True, valid=False, xval=False) # plot2 = gbm.plot(train=False, valid=True, xval=False) # plot3 = gbm.plot(train=False, valid=False, 
xval=True) # plot = gbm.plot(train=True, valid=True, xval=False) # plot = gbm.plot(train=True, valid=False, xval=True) # plot = gbm.plot(train=True, valid=True, xval=True) # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics # plot = gbm.plot(train=False, valid=True, xval=True) # # tpr # tpr1 = gbm.tpr(train=True, valid=False, xval=False) # tpr2 = gbm.tpr(train=False, valid=True, xval=False) # tpr3 = gbm.tpr(train=False, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=False) # tpr = gbm.tpr(train=True, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=True) # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics # tpr = gbm.tpr(train=False, valid=True, xval=True) # # # tnr # tnr1 = gbm.tnr(train=True, valid=False, xval=False) # tnr2 = gbm.tnr(train=False, valid=True, xval=False) # tnr3 = gbm.tnr(train=False, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=False) # tnr = gbm.tnr(train=True, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=True) # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics # tnr = gbm.tnr(train=False, valid=True, xval=True) # # # fnr # fnr1 = gbm.fnr(train=True, valid=False, xval=False) # fnr2 = gbm.fnr(train=False, valid=True, xval=False) # fnr3 = gbm.fnr(train=False, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=False) # fnr = gbm.fnr(train=True, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=True) # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics # fnr = gbm.fnr(train=False, valid=True, xval=True) # # # fpr # fpr1 = gbm.fpr(train=True, valid=False, xval=False) # fpr2 = gbm.fpr(train=False, valid=True, xval=False) # fpr3 = gbm.fpr(train=False, valid=False, xval=True) # fpr = gbm.fpr(train=True, valid=True, xval=False) # fpr = gbm.fpr(train=True, valid=False, xval=True) # 
fpr = gbm.fpr(train=True, valid=True, xval=True) # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics # fpr = gbm.fpr(train=False, valid=True, xval=True) # multinomial cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) cars["cylinders"] = cars["cylinders"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] response_col = "cylinders" distribution = "multinomial" predictors = ["displacement", "power", "weight", "acceleration", "year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in mse.keys() and "valid" in mse.keys( ), "expected training and validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in mse.keys() and "xval" in mse.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and 
{1}".format( type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in mse.keys() and "valid" in mse.keys( ) and "xval" in mse.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["train"], float) and isinstance( mse["valid"], float ) and isinstance( mse["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in mse.keys() and "xval" in mse.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert len( mse ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( mse.keys()) assert isinstance(mse["valid"], float) and isinstance( mse["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(mse["valid"]), type(mse["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in logloss.keys() and "valid" in logloss.keys( ), "expected training and validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and validation metrics to be 
returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ), "expected training and validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in logloss.keys() and "xval" in logloss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in logloss.keys() and "valid" in logloss.keys( ) and "xval" in logloss.keys( ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert len( logloss ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["train"], float) and isinstance( logloss["valid"], float ) and isinstance( logloss["xval"], float ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format( type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in logloss.keys() and "xval" in logloss.keys( ), "expected validation and cross validation metrics to be returned, but got {0}".format( 
logloss.keys()) assert len( logloss ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format( logloss.keys()) assert isinstance(logloss["valid"], float) and isinstance( logloss["xval"], float ), "validation and cross validation metrics to be floats, but got {0} and {1}".format( type(logloss["valid"]), type(logloss["xval"])) # hit_ratio_table hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False) hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False) hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True) hit_ratio_table = gbm.hit_ratio_table( train=False, valid=False, xval=False) # default: return training metrics hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True) # clustering iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv")) km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3) # betweenss betweenss1 = km.betweenss(train=True, valid=False, xval=False) assert isinstance(betweenss1, float) betweenss3 = km.betweenss(train=False, valid=False, xval=True) assert isinstance(betweenss3, float) betweenss = km.betweenss(train=True, valid=False, xval=True) assert "train" in betweenss.keys() and "xval" in betweenss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( betweenss.keys()) assert len( betweenss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( betweenss.keys()) assert isinstance(betweenss["train"], float) and isinstance( betweenss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(betweenss["train"]), type(betweenss["xval"])) assert betweenss["xval"] == betweenss3 
betweenss = km.betweenss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(betweenss, float) assert betweenss == betweenss1 # totss totss1 = km.totss(train=True, valid=False, xval=False) assert isinstance(totss1, float) totss3 = km.totss(train=False, valid=False, xval=True) assert isinstance(totss3, float) totss = km.totss(train=True, valid=False, xval=True) assert "train" in totss.keys() and "xval" in totss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( totss.keys()) assert len( totss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( totss.keys()) assert isinstance(totss["train"], float) and isinstance( totss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(totss["train"]), type(totss["xval"])) assert totss["xval"] == totss3 totss = km.totss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(totss, float) assert totss == totss1 # tot_withinss tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False) assert isinstance(tot_withinss1, float) tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True) assert isinstance(tot_withinss3, float) tot_withinss = km.tot_withinss(train=True, valid=False, xval=True) assert "train" in tot_withinss.keys() and "xval" in tot_withinss.keys( ), "expected training and cross validation metrics to be returned, but got {0}".format( tot_withinss.keys()) assert len( tot_withinss ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format( tot_withinss.keys()) assert isinstance(tot_withinss["train"], float) and isinstance( tot_withinss["xval"], float ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format( type(tot_withinss["train"]), type(tot_withinss["xval"])) assert tot_withinss["xval"] == tot_withinss3 
tot_withinss = km.tot_withinss( train=False, valid=False, xval=False) # default: return training metrics assert isinstance(tot_withinss, float) assert tot_withinss == tot_withinss1 # withinss withinss1 = km.withinss(train=True, valid=False, xval=False) withinss3 = km.withinss(train=False, valid=False, xval=True) withinss = km.withinss(train=True, valid=False, xval=True) withinss = km.withinss(train=False, valid=False, xval=False) # default: return training metrics # centroid_stats centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False) centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True) centroid_stats = km.centroid_stats(train=True, valid=False, xval=True) centroid_stats = km.centroid_stats( train=False, valid=False, xval=False) # default: return training metrics # size size1 = km.size(train=True, valid=False, xval=False) size3 = km.size(train=False, valid=False, xval=True) size = km.size(train=True, valid=False, xval=True) size = km.size(train=False, valid=False, xval=False) # default: return training metrics
def _check_metric_json_keys(keys_have, keys_desired, model_label):
    """Assert that a model's metric json contains no unexpected keys.

    :param keys_have: keys actually present in the model's ``_metric_json``
    :param keys_desired: keys expected to be present
    :param model_label: short model-type description used in the assertion
        message (e.g. "regression", "glm-binomial")

    NOTE: only *extra* keys (present but not desired) are flagged; keys that
    are desired but missing do not fail this check. That mirrors the original
    set-difference direction — TODO confirm this asymmetry is intentional.
    """
    diff = list(set(keys_have) - set(keys_desired))
    assert not diff, "There's a difference between the current ({0}) and the desired ({1}) {2} " \
                     "metric json. The difference is {3}".format(keys_have, keys_desired,
                                                                 model_label, diff)


def metric_json_check():
    """Build one model per category (regression, binomial, multinomial,
    clustering; GBM and GLM variants) and verify that the key set of each
    model's performance metric json matches the expected schema."""
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json (GBM)
    reg_mod = h2o.gbm(y=df["CAPSULE"],
                      x=df[3:],
                      training_frame=df,
                      distribution="gaussian")
    reg_met = reg_mod.model_performance()
    _check_metric_json_keys(
        reg_met._metric_json.keys(),
        [u'model_category', u'description', u'r2', u'frame', u'model_checksum',
         u'MSE', u'__meta', u'scoring_time', u'predictions', u'model',
         u'duration_in_ms', u'frame_checksum', u'mean_residual_deviance'],
        "regression")

    # Regression metric json (GLM) — adds deviance/AIC/degrees-of-freedom keys
    reg_mod = h2o.glm(y=df["CAPSULE"],
                      x=df[3:],
                      training_frame=df,
                      family="gaussian")
    reg_met = reg_mod.model_performance()
    _check_metric_json_keys(
        reg_met._metric_json.keys(),
        [u'model_category', u'description', u'r2',
         u'residual_degrees_of_freedom', u'frame', u'model_checksum', u'MSE',
         u'__meta', u'null_deviance', u'scoring_time',
         u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
         u'duration_in_ms', u'frame_checksum', u'residual_deviance',
         u'mean_residual_deviance'],
        "glm-regression")

    # Binomial metric json (GBM)
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(),
                      x=df[3:],
                      training_frame=df,
                      distribution="bernoulli")
    bin_met = bin_mod.model_performance()
    _check_metric_json_keys(
        bin_met._metric_json.keys(),
        [u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame',
         u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
         u'thresholds_and_metric_scores', u'predictions',
         u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
         u'frame_checksum', u'domain'],
        "binomial")

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(),
                      x=df[3:],
                      training_frame=df,
                      family="binomial")
    bin_met = bin_mod.model_performance()
    _check_metric_json_keys(
        bin_met._metric_json.keys(),
        [u'frame', u'residual_deviance', u'max_criteria_and_metric_scores',
         u'MSE', u'frame_checksum', u'AIC', u'logloss', u'Gini',
         u'predictions', u'AUC', u'description', u'model_checksum',
         u'duration_in_ms', u'model_category', u'r2',
         u'residual_degrees_of_freedom', u'__meta', u'null_deviance',
         u'scoring_time', u'null_degrees_of_freedom', u'model',
         u'thresholds_and_metric_scores', u'domain'],
        "glm-binomial")

    # Multinomial metric json (GBM on airlines data)
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance",
           "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX],
                      y=df[myY],
                      training_frame=df,
                      distribution="multinomial")
    mul_met = mul_mod.model_performance()
    _check_metric_json_keys(
        mul_met._metric_json.keys(),
        [u'cm', u'model_category', u'description', u'r2', u'frame',
         u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
         u'predictions', u'hit_ratio_table', u'model', u'duration_in_ms',
         u'frame_checksum'],
        "multinomial")

    # Clustering metric json (k-means on iris)
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    _check_metric_json_keys(
        clus_met._metric_json.keys(),
        [u'tot_withinss', u'model_category', u'description', u'frame',
         u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
         u'predictions', u'totss', u'model', u'duration_in_ms',
         u'frame_checksum', u'centroid_stats'],
        "clustering")
def init_err_casesKmeans():
    """Exercise h2o.kmeans error handling for invalid initialization arguments.

    Each deliberately bad call is expected to raise EnvironmentError; if the
    call succeeds instead, the test fails via ``assert False``.  The final two
    scenarios (NA-like entries, duplicate initial clusters) are expected to
    run without error — H2O replaces Nones with column means, and duplicates
    merely skew sampling probability during initialization.

    Assumes an H2O cluster connection has already been established elsewhere
    (the original connected to localhost:54321).
    """
    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()
    numcol = benign_h2o.ncol
    numrow = benign_h2o.nrow

    # Log.info("Non-numeric entry that isn't 'Random', 'PlusPlus', or 'Furthest'")
    try:
        h2o.kmeans(x=benign_h2o, k=5, init='Test123')
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Log.info("Empty list, tuple, or dictionary")
    # All three empty user_points containers must be rejected identically.
    for empty_points in ([], (), {}):
        try:
            h2o.kmeans(x=benign_h2o, k=0, user_points=empty_points)
            assert False, "expected an error"
        except EnvironmentError:
            assert True

    # Log.info("Number of columns doesn't equal training set's")
    start_small = [[random.gauss(0, 1) for c in range(numcol - 2)] for r in range(5)]
    start_large = [[random.gauss(0, 1) for c in range(numcol + 2)] for r in range(5)]
    for bad_start in (start_small, start_large):
        try:
            h2o.kmeans(x=benign_h2o, k=5, user_points=h2o.H2OFrame(bad_start))
            assert False, "expected an error"
        except EnvironmentError:
            assert True

    # Log.info("Number of rows exceeds training set's")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(numrow + 2)]
    try:
        h2o.kmeans(x=benign_h2o, k=numrow + 2, user_points=h2o.H2OFrame(start))
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Nones are replaced with mean of a column in H2O. Not sure about Inf.
    # Log.info("Any entry is NA, NaN, or Inf")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    for x in ["NA", "NaN", "Inf", "-Inf"]:
        # BUG FIX: the original used `start[:]`, a shallow copy, so the inner
        # row lists were shared with `start` — each iteration's mutation
        # corrupted `start` itself and the bad entries accumulated across
        # iterations.  Copy every row so exactly one entry differs per pass.
        start_err = [row[:] for row in start]
        start_err[1][random.randint(0, numcol - 1)] = x
        h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start_err))

    # Duplicates will affect sampling probability during initialization.
    # Log.info("Duplicate initial clusters specified")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    start[2] = start[0]
    h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start))
def metric_accessors(ip,port): cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] # regression response_col = "economy" distribution = "gaussian" predictors = ["displacement","power","weight","acceleration","year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in mse.keys() and "valid" in mse.keys(), "expected training and validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 2, "expected only training and validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["train"], float) and isinstance(mse["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in mse.keys() and "xval" in mse.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["train"], float) and isinstance(mse["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in mse.keys() and "valid" in mse.keys() and "xval" in mse.keys(), "expected training, 
validation, and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["train"], float) and isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in mse.keys() and "xval" in mse.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["valid"]), type(mse["xval"])) # r2 r21 = gbm.r2(train=True, valid=False, xval=False) assert isinstance(r21, float) r22 = gbm.r2(train=False, valid=True, xval=False) assert isinstance(r22, float) r23 = gbm.r2(train=False, valid=False, xval=True) assert isinstance(r23, float) r2 = gbm.r2(train=True, valid=True, xval=False) assert "train" in r2.keys() and "valid" in r2.keys(), "expected training and validation metrics to be returned, but got {0}".format(r2.keys()) assert len(r2) == 2, "expected only training and validation metrics to be returned, but got {0}".format(r2.keys()) assert isinstance(r2["train"], float) and isinstance(r2["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(r2["train"]), type(r2["valid"])) assert r2["valid"] == r22 r2 = gbm.r2(train=True, valid=False, xval=True) assert "train" in 
r2.keys() and "xval" in r2.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(r2.keys()) assert len(r2) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(r2.keys()) assert isinstance(r2["train"], float) and isinstance(r2["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(r2["train"]), type(r2["xval"])) assert r2["xval"] == r23 r2 = gbm.r2(train=True, valid=True, xval=True) assert "train" in r2.keys() and "valid" in r2.keys() and "xval" in r2.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(r2.keys()) assert len(r2) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(r2.keys()) assert isinstance(r2["train"], float) and isinstance(r2["valid"], float) and isinstance(r2["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(r2["train"]), type(r2["valid"]), type(r2["xval"])) r2 = gbm.r2(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(r2, float) assert r2 == r21 r2 = gbm.r2(train=False, valid=True, xval=True) assert "valid" in r2.keys() and "xval" in r2.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(r2.keys()) assert len(r2) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(r2.keys()) assert isinstance(r2["valid"], float) and isinstance(r2["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(r2["valid"]), type(r2["xval"])) # mean_residual_deviance mean_residual_deviance1 = gbm.mean_residual_deviance(train=True, valid=False, xval=False) assert isinstance(mean_residual_deviance1, float) mean_residual_deviance2 = gbm.mean_residual_deviance(train=False, valid=True, 
xval=False) assert isinstance(mean_residual_deviance2, float) mean_residual_deviance3 = gbm.mean_residual_deviance(train=False, valid=False, xval=True) assert isinstance(mean_residual_deviance3, float) mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=False) assert "train" in mean_residual_deviance.keys() and "valid" in mean_residual_deviance.keys(), "expected training and validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert len(mean_residual_deviance) == 2, "expected only training and validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance(mean_residual_deviance["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"])) assert mean_residual_deviance["valid"] == mean_residual_deviance2 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=False, xval=True) assert "train" in mean_residual_deviance.keys() and "xval" in mean_residual_deviance.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert len(mean_residual_deviance) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance(mean_residual_deviance["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(mean_residual_deviance["train"]), type(mean_residual_deviance["xval"])) assert mean_residual_deviance["xval"] == mean_residual_deviance3 mean_residual_deviance = gbm.mean_residual_deviance(train=True, valid=True, xval=True) assert "train" in mean_residual_deviance.keys() and "valid" in mean_residual_deviance.keys() and "xval" in 
mean_residual_deviance.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert len(mean_residual_deviance) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["train"], float) and isinstance(mean_residual_deviance["valid"], float) and isinstance(mean_residual_deviance["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mean_residual_deviance, float) assert mean_residual_deviance == mean_residual_deviance1 mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=True, xval=True) assert "valid" in mean_residual_deviance.keys() and "xval" in mean_residual_deviance.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert len(mean_residual_deviance) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys()) assert isinstance(mean_residual_deviance["valid"], float) and isinstance(mean_residual_deviance["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"])) # binomial cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] response_col = "economy_20mpg" distribution = "bernoulli" predictors = 
["displacement","power","weight","acceleration","year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # auc auc1 = gbm.auc(train=True, valid=False, xval=False) assert isinstance(auc1, float) auc2 = gbm.auc(train=False, valid=True, xval=False) assert isinstance(auc2, float) auc3 = gbm.auc(train=False, valid=False, xval=True) assert isinstance(auc3, float) auc = gbm.auc(train=True, valid=True, xval=False) assert "train" in auc.keys() and "valid" in auc.keys(), "expected training and validation metrics to be returned, but got {0}".format(auc.keys()) assert len(auc) == 2, "expected only training and validation metrics to be returned, but got {0}".format(auc.keys()) assert isinstance(auc["train"], float) and isinstance(auc["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(auc["train"]), type(auc["valid"])) assert auc["valid"] == auc2 auc = gbm.auc(train=True, valid=False, xval=True) assert "train" in auc.keys() and "xval" in auc.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(auc.keys()) assert len(auc) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(auc.keys()) assert isinstance(auc["train"], float) and isinstance(auc["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(auc["train"]), type(auc["xval"])) assert auc["xval"] == auc3 auc = gbm.auc(train=True, valid=True, xval=True) assert "train" in auc.keys() and "valid" in auc.keys() and "xval" in auc.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(auc.keys()) assert len(auc) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(auc.keys()) assert isinstance(auc["train"], 
float) and isinstance(auc["valid"], float) and isinstance(auc["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(auc["train"]), type(auc["valid"]), type(auc["xval"])) auc = gbm.auc(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(auc, float) assert auc == auc1 auc = gbm.auc(train=False, valid=True, xval=True) assert "valid" in auc.keys() and "xval" in auc.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(auc.keys()) assert len(auc) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(auc.keys()) assert isinstance(auc["valid"], float) and isinstance(auc["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(auc["valid"]), type(auc["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in logloss.keys() and "valid" in logloss.keys(), "expected training and validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 2, "expected only training and validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in logloss.keys() and "xval" in logloss.keys(), "expected training and cross validation metrics to be returned, but got 
{0}".format(logloss.keys()) assert len(logloss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["train"], float) and isinstance(logloss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in logloss.keys() and "valid" in logloss.keys() and "xval" in logloss.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in logloss.keys() and "xval" in logloss.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["valid"]), type(logloss["xval"])) # giniCoef giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False) assert isinstance(giniCoef1, float) giniCoef2 = gbm.giniCoef(train=False, valid=True, 
xval=False) assert isinstance(giniCoef2, float) giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True) assert isinstance(giniCoef3, float) giniCoef = gbm.giniCoef(train=True, valid=True, xval=False) assert "train" in giniCoef.keys() and "valid" in giniCoef.keys(), "expected training and validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert len(giniCoef) == 2, "expected only training and validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance(giniCoef["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(giniCoef["train"]), type(giniCoef["valid"])) assert giniCoef["valid"] == giniCoef2 giniCoef = gbm.giniCoef(train=True, valid=False, xval=True) assert "train" in giniCoef.keys() and "xval" in giniCoef.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert len(giniCoef) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance(giniCoef["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(giniCoef["train"]), type(giniCoef["xval"])) assert giniCoef["xval"] == giniCoef3 giniCoef = gbm.giniCoef(train=True, valid=True, xval=True) assert "train" in giniCoef.keys() and "valid" in giniCoef.keys() and "xval" in giniCoef.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert len(giniCoef) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert isinstance(giniCoef["train"], float) and isinstance(giniCoef["valid"], float) and isinstance(giniCoef["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, 
and {2}".format(type(giniCoef["train"]), type(giniCoef["valid"]), type(giniCoef["xval"])) giniCoef = gbm.giniCoef(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(giniCoef, float) assert giniCoef == giniCoef1 giniCoef = gbm.giniCoef(train=False, valid=True, xval=True) assert "valid" in giniCoef.keys() and "xval" in giniCoef.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert len(giniCoef) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(giniCoef.keys()) assert isinstance(giniCoef["valid"], float) and isinstance(giniCoef["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(giniCoef["valid"]), type(giniCoef["xval"])) # F1 F11 = gbm.F1(train=True, valid=False, xval=False) F12 = gbm.F1(train=False, valid=True, xval=False) F13 = gbm.F1(train=False, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=False) F1 = gbm.F1(train=True, valid=False, xval=True) F1 = gbm.F1(train=True, valid=True, xval=True) F1 = gbm.F1(train=False, valid=False, xval=False) # default: return training metrics F1 = gbm.F1(train=False, valid=True, xval=True) # F0point5 F0point51 = gbm.F0point5(train=True, valid=False, xval=False) F0point52 = gbm.F0point5(train=False, valid=True, xval=False) F0point53 = gbm.F0point5(train=False, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=False) F0point5 = gbm.F0point5(train=True, valid=False, xval=True) F0point5 = gbm.F0point5(train=True, valid=True, xval=True) F0point5 = gbm.F0point5(train=False, valid=False, xval=False) # default: return training metrics F0point5 = gbm.F0point5(train=False, valid=True, xval=True) # F2 F21 = gbm.F2(train=True, valid=False, xval=False) F22 = gbm.F2(train=False, valid=True, xval=False) F23 = gbm.F2(train=False, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=False) F2 = 
gbm.F2(train=True, valid=False, xval=True) F2 = gbm.F2(train=True, valid=True, xval=True) F2 = gbm.F2(train=False, valid=False, xval=False) # default: return training metrics F2 = gbm.F2(train=False, valid=True, xval=True) # accuracy accuracy1 = gbm.accuracy(train=True, valid=False, xval=False) accuracy2 = gbm.accuracy(train=False, valid=True, xval=False) accuracy3 = gbm.accuracy(train=False, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=False) accuracy = gbm.accuracy(train=True, valid=False, xval=True) accuracy = gbm.accuracy(train=True, valid=True, xval=True) accuracy = gbm.accuracy(train=False, valid=False, xval=False) # default: return training metrics accuracy = gbm.accuracy(train=False, valid=True, xval=True) # error error1 = gbm.error(train=True, valid=False, xval=False) error2 = gbm.error(train=False, valid=True, xval=False) error3 = gbm.error(train=False, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=False) error = gbm.error(train=True, valid=False, xval=True) error = gbm.error(train=True, valid=True, xval=True) error = gbm.error(train=False, valid=False, xval=False) # default: return training metrics error = gbm.error(train=False, valid=True, xval=True) # precision precision1 = gbm.precision(train=True, valid=False, xval=False) precision2 = gbm.precision(train=False, valid=True, xval=False) precision3 = gbm.precision(train=False, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=False) precision = gbm.precision(train=True, valid=False, xval=True) precision = gbm.precision(train=True, valid=True, xval=True) precision = gbm.precision(train=False, valid=False, xval=False) # default: return training metrics precision = gbm.precision(train=False, valid=True, xval=True) # mcc mcc1 = gbm.mcc(train=True, valid=False, xval=False) mcc2 = gbm.mcc(train=False, valid=True, xval=False) mcc3 = gbm.mcc(train=False, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=False) 
mcc = gbm.mcc(train=True, valid=False, xval=True) mcc = gbm.mcc(train=True, valid=True, xval=True) mcc = gbm.mcc(train=False, valid=False, xval=False) # default: return training metrics mcc = gbm.mcc(train=False, valid=True, xval=True) # max_per_class_error max_per_class_error1 = gbm.max_per_class_error(train=True, valid=False, xval=False) max_per_class_error2 = gbm.max_per_class_error(train=False, valid=True, xval=False) max_per_class_error3 = gbm.max_per_class_error(train=False, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=False) max_per_class_error = gbm.max_per_class_error(train=True, valid=False, xval=True) max_per_class_error = gbm.max_per_class_error(train=True, valid=True, xval=True) max_per_class_error = gbm.max_per_class_error(train=False, valid=False, xval=False) # default: return training metrics max_per_class_error = gbm.max_per_class_error(train=False, valid=True, xval=True) # confusion_matrix confusion_matrix1 = gbm.confusion_matrix(train=True, valid=False, xval=False) confusion_matrix2 = gbm.confusion_matrix(train=False, valid=True, xval=False) confusion_matrix3 = gbm.confusion_matrix(train=False, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False) confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True) confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True) confusion_matrix = gbm.confusion_matrix(train=False, valid=False, xval=False) # default: return training metrics confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True) # # plot # plot1 = gbm.plot(train=True, valid=False, xval=False) # plot2 = gbm.plot(train=False, valid=True, xval=False) # plot3 = gbm.plot(train=False, valid=False, xval=True) # plot = gbm.plot(train=True, valid=True, xval=False) # plot = gbm.plot(train=True, valid=False, xval=True) # plot = gbm.plot(train=True, valid=True, xval=True) # plot = gbm.plot(train=False, 
valid=False, xval=False) # default: return training metrics # plot = gbm.plot(train=False, valid=True, xval=True) # # tpr # tpr1 = gbm.tpr(train=True, valid=False, xval=False) # tpr2 = gbm.tpr(train=False, valid=True, xval=False) # tpr3 = gbm.tpr(train=False, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=False) # tpr = gbm.tpr(train=True, valid=False, xval=True) # tpr = gbm.tpr(train=True, valid=True, xval=True) # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics # tpr = gbm.tpr(train=False, valid=True, xval=True) # # # tnr # tnr1 = gbm.tnr(train=True, valid=False, xval=False) # tnr2 = gbm.tnr(train=False, valid=True, xval=False) # tnr3 = gbm.tnr(train=False, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=False) # tnr = gbm.tnr(train=True, valid=False, xval=True) # tnr = gbm.tnr(train=True, valid=True, xval=True) # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics # tnr = gbm.tnr(train=False, valid=True, xval=True) # # # fnr # fnr1 = gbm.fnr(train=True, valid=False, xval=False) # fnr2 = gbm.fnr(train=False, valid=True, xval=False) # fnr3 = gbm.fnr(train=False, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=False) # fnr = gbm.fnr(train=True, valid=False, xval=True) # fnr = gbm.fnr(train=True, valid=True, xval=True) # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics # fnr = gbm.fnr(train=False, valid=True, xval=True) # # # fpr # fpr1 = gbm.fpr(train=True, valid=False, xval=False) # fpr2 = gbm.fpr(train=False, valid=True, xval=False) # fpr3 = gbm.fpr(train=False, valid=False, xval=True) # fpr = gbm.fpr(train=True, valid=True, xval=False) # fpr = gbm.fpr(train=True, valid=False, xval=True) # fpr = gbm.fpr(train=True, valid=True, xval=True) # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics # fpr = gbm.fpr(train=False, valid=True, xval=True) # multinomial 
cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv")) cars["cylinders"] = cars["cylinders"].asfactor() r = cars[0].runif() train = cars[r > .2] valid = cars[r <= .2] response_col = "cylinders" distribution = "multinomial" predictors = ["displacement","power","weight","acceleration","year"] gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random") # mse mse1 = gbm.mse(train=True, valid=False, xval=False) assert isinstance(mse1, float) mse2 = gbm.mse(train=False, valid=True, xval=False) assert isinstance(mse2, float) mse3 = gbm.mse(train=False, valid=False, xval=True) assert isinstance(mse3, float) mse = gbm.mse(train=True, valid=True, xval=False) assert "train" in mse.keys() and "valid" in mse.keys(), "expected training and validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 2, "expected only training and validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["train"], float) and isinstance(mse["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["valid"])) assert mse["valid"] == mse2 mse = gbm.mse(train=True, valid=False, xval=True) assert "train" in mse.keys() and "xval" in mse.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["train"], float) and isinstance(mse["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["xval"])) assert mse["xval"] == mse3 mse = gbm.mse(train=True, valid=True, xval=True) assert "train" in mse.keys() and "valid" in mse.keys() and "xval" in mse.keys(), "expected training, 
validation, and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["train"], float) and isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(mse["train"]), type(mse["valid"]), type(mse["xval"])) mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(mse, float) assert mse == mse1 mse = gbm.mse(train=False, valid=True, xval=True) assert "valid" in mse.keys() and "xval" in mse.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert len(mse) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys()) assert isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["valid"]), type(mse["xval"])) # logloss logloss1 = gbm.logloss(train=True, valid=False, xval=False) assert isinstance(logloss1, float) logloss2 = gbm.logloss(train=False, valid=True, xval=False) assert isinstance(logloss2, float) logloss3 = gbm.logloss(train=False, valid=False, xval=True) assert isinstance(logloss3, float) logloss = gbm.logloss(train=True, valid=True, xval=False) assert "train" in logloss.keys() and "valid" in logloss.keys(), "expected training and validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 2, "expected only training and validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), 
type(logloss["valid"])) assert logloss["valid"] == logloss2 logloss = gbm.logloss(train=True, valid=False, xval=True) assert "train" in logloss.keys() and "xval" in logloss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["train"], float) and isinstance(logloss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["xval"])) assert logloss["xval"] == logloss3 logloss = gbm.logloss(train=True, valid=True, xval=True) assert "train" in logloss.keys() and "valid" in logloss.keys() and "xval" in logloss.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"])) logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(logloss, float) assert logloss == logloss1 logloss = gbm.logloss(train=False, valid=True, xval=True) assert "valid" in logloss.keys() and "xval" in logloss.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert len(logloss) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys()) assert isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "validation and cross validation 
metrics to be floats, but got {0} and {1}".format(type(logloss["valid"]), type(logloss["xval"])) # hit_ratio_table hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False) hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False) hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True) hit_ratio_table = gbm.hit_ratio_table(train=False, valid=False, xval=False) # default: return training metrics hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True) # clustering iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv")) km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3) # betweenss betweenss1 = km.betweenss(train=True, valid=False, xval=False) assert isinstance(betweenss1, float) betweenss3 = km.betweenss(train=False, valid=False, xval=True) assert isinstance(betweenss3, float) betweenss = km.betweenss(train=True, valid=False, xval=True) assert "train" in betweenss.keys() and "xval" in betweenss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(betweenss.keys()) assert len(betweenss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(betweenss.keys()) assert isinstance(betweenss["train"], float) and isinstance(betweenss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(betweenss["train"]), type(betweenss["xval"])) assert betweenss["xval"] == betweenss3 betweenss = km.betweenss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(betweenss, float) assert betweenss == betweenss1 # totss totss1 = km.totss(train=True, valid=False, xval=False) assert isinstance(totss1, float) totss3 = 
km.totss(train=False, valid=False, xval=True) assert isinstance(totss3, float) totss = km.totss(train=True, valid=False, xval=True) assert "train" in totss.keys() and "xval" in totss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(totss.keys()) assert len(totss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(totss.keys()) assert isinstance(totss["train"], float) and isinstance(totss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(totss["train"]), type(totss["xval"])) assert totss["xval"] == totss3 totss = km.totss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(totss, float) assert totss == totss1 # tot_withinss tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False) assert isinstance(tot_withinss1, float) tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True) assert isinstance(tot_withinss3, float) tot_withinss = km.tot_withinss(train=True, valid=False, xval=True) assert "train" in tot_withinss.keys() and "xval" in tot_withinss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(tot_withinss.keys()) assert len(tot_withinss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(tot_withinss.keys()) assert isinstance(tot_withinss["train"], float) and isinstance(tot_withinss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(tot_withinss["train"]), type(tot_withinss["xval"])) assert tot_withinss["xval"] == tot_withinss3 tot_withinss = km.tot_withinss(train=False, valid=False, xval=False) # default: return training metrics assert isinstance(tot_withinss, float) assert tot_withinss == tot_withinss1 # withinss withinss1 = km.withinss(train=True, valid=False, xval=False) withinss3 = km.withinss(train=False, 
valid=False, xval=True) withinss = km.withinss(train=True, valid=False, xval=True) withinss = km.withinss(train=False, valid=False, xval=False) # default: return training metrics # centroid_stats centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False) centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True) centroid_stats = km.centroid_stats(train=True, valid=False, xval=True) centroid_stats = km.centroid_stats(train=False, valid=False, xval=False) # default: return training metrics # size size1 = km.size(train=True, valid=False, xval=False) size3 = km.size(train=False, valid=False, xval=True) size = km.size(train=True, valid=False, xval=True) size = km.size(train=False, valid=False, xval=False) # default: return training metrics
def init_err_casesKmeans(ip, port):
    """Negative tests for h2o.kmeans initialization parameters.

    Verifies that an invalid 'init' string and malformed 'user_points'
    frames (empty, wrong column count, too many rows) are rejected with
    EnvironmentError, and that NA/NaN/Inf entries and duplicate initial
    centers are tolerated without raising.
    """
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()
    numcol = benign_h2o.ncol
    numrow = benign_h2o.nrow

    # Log.info("Non-numeric entry that isn't 'Random', 'PlusPlus', or 'Furthest'")
    try:
        h2o.kmeans(x=benign_h2o, k=5, init='Test123')
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Log.info("Empty list, tuple, or dictionary")
    try:
        h2o.kmeans(x=benign_h2o, k=0, user_points=[])
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.kmeans(x=benign_h2o, k=0, user_points=())
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.kmeans(x=benign_h2o, k=0, user_points={})
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Log.info("Number of columns doesn't equal training set's")
    start_small = [[random.gauss(0, 1) for c in range(numcol - 2)] for r in range(5)]
    start_large = [[random.gauss(0, 1) for c in range(numcol + 2)] for r in range(5)]

    try:
        h2o.kmeans(x=benign_h2o, k=5, user_points=h2o.H2OFrame(start_small))
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.kmeans(x=benign_h2o, k=5, user_points=h2o.H2OFrame(start_large))
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Log.info("Number of rows exceeds training set's")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(numrow + 2)]
    try:
        h2o.kmeans(x=benign_h2o, k=numrow + 2, user_points=h2o.H2OFrame(start))
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Nones are replaced with mean of a column in H2O. Not sure about Inf.
    # Log.info("Any entry is NA, NaN, or Inf")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    for x in ["NA", "NaN", "Inf", "-Inf"]:
        # BUG FIX: start[:] was a shallow copy, so the row lists were shared
        # with 'start'; assigning into start_err[1] also mutated 'start' and
        # the special values accumulated across loop iterations. Copy each
        # row so every iteration perturbs a pristine matrix with exactly
        # one bad entry.
        start_err = [row[:] for row in start]
        start_err[1][random.randint(0, numcol - 1)] = x
        h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start_err))

    # Duplicates will affect sampling probability during initialization.
    # Log.info("Duplicate initial clusters specified")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    start[2] = start[0]
    h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start))
def baddataKmeans(ip, port):
    """Exercise h2o.kmeans on degenerate training data.

    Covers: an all-None row (imputed with column means), constant and
    all-None columns (dropped automatically), an all-None frame
    (expected to raise), categorical columns, and the iris dataset.
    """
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    rows = 100
    cols = 10
    rawdata = [[random.random() for c in range(cols)] for r in range(rows)]

    # Row elements that are None will be replaced with mean of column
    #Log.info("Training data with 1 row of all Nones: replace with column mean")
    # BUG FIX: rawdata[:] is a shallow copy -- the row lists were shared with
    # rawdata, so the mutations in each test case leaked into rawdata and
    # corrupted the data for the following test cases. Copy each row instead.
    data = [row[:] for row in rawdata]
    data[24] = [None] * cols
    frame = h2o.H2OFrame(data)
    km_model = h2o.kmeans(x=frame, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        assert len(centers[c]) == 10, "expected center to be 10 dimensional"

    # Columns with constant value will be automatically dropped
    #Log.info("Training data with 1 col of all 5's: drop automatically")
    data = [row[:] for row in rawdata]
    for idx, val in enumerate(data):
        data[idx][4] = 5
    frame = h2o.H2OFrame(data)
    km_model = h2o.kmeans(x=frame, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        assert len(centers[c]) == 9, "expected center to be 9 dimensional"
    # TODO: expect_warning(km_model = h2o.kmeans(x=frame, k=5))

    # Log.info("Training data with 1 col of all None's, 1 col of all zeroes: drop automatically")
    data = [row[:] for row in rawdata]
    for idx, val in enumerate(data):
        data[idx][4] = None
        data[idx][7] = 0
    frame = h2o.H2OFrame(data)
    km_model = h2o.kmeans(x=frame, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        # BUG FIX: the message said "9" but the expected dimensionality
        # here is 8 (two of the ten columns are dropped).
        assert len(centers[c]) == 8, "expected center to be 8 dimensional"
    # TODO: expect_warning(km_model = h2o.kmeans(x=frame, k=5))

    # Log.info("Training data with all None's")
    data = [[None for c in range(cols)] for r in range(rows)]
    frame = h2o.H2OFrame(data)
    try:
        h2o.kmeans(x=frame, k=5)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # Log.info("Training data with a categorical column(s)")
    data = [[random.choice(string.ascii_uppercase) for c in range(cols)] for r in range(rows)]
    frame = h2o.H2OFrame(data)
    km_model = h2o.kmeans(x=frame, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        assert len(centers[c]) == 10, "expected center to be 10 "+str(len(centers[c]))

    # Log.info("Importing iris.csv data...\n")
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    km_model = h2o.kmeans(x=iris, k=5)
    centers = km_model.centers()
    assert len(centers) == 5, "expected 5 centers"
    for c in range(len(centers)):
        assert len(centers[c]) == 5, "expected center to be 5 "+str(len(centers[c]))