def hdfs_kmeans():
    """Smoke-test KMeans on two datasets (iris, covtype) imported from HDFS."""
    name_node = pyunit_utils.hadoop_namenode()
    iris_path = "/datasets/runit/iris_wheader.csv"
    covtype_path = "/datasets/runit/covtype.data"

    print("Import iris_wheader.csv from HDFS")
    iris_frame = h2o.import_file("hdfs://{0}{1}".format(name_node, iris_path))
    row_count = iris_frame.nrow
    print("rows: {0}".format(row_count))
    assert row_count == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(row_count, 150)

    print("Running KMeans on iris")
    iris_model = H2OKMeansEstimator(k=3, training_frame=iris_frame[0:4], max_iterations=10)
    iris_model.train()
    print(iris_model)

    print("Importing covtype.data from HDFS")
    covtype_frame = h2o.import_file("hdfs://{0}{1}".format(name_node, covtype_path))
    row_count = covtype_frame.nrow
    print("rows: {0}".format(row_count))
    assert row_count == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(row_count, 581012)

    print("Running KMeans on covtype")
    covtype_model = H2OKMeansEstimator(training_frame=covtype_frame[0:55], k=8, max_iterations=10)
    covtype_model.train()
    print(covtype_model)
def parametersKmeans():
    """Rebuild a KMeans model from its extracted parameters and check both fits agree."""
    print("Getting data...")
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")
    iris_km = H2OKMeansEstimator(k=3, seed=1234)
    iris_km.train(x=list(range(4)), training_frame=iris)

    # Collect the actual parameter values the first model ran with.
    parameters = iris_km._model_json['parameters']
    param_dict = {p['name']: p['actual_value'] for p in parameters}
    fold_column = param_dict.pop('fold_column')
    # These belong to train(), not the constructor, so drop them from the kwargs.
    del param_dict['training_frame']
    del param_dict['validation_frame']
    del param_dict['max_runtime_secs']

    iris_km_again = H2OKMeansEstimator(**param_dict)  ## not all parameters go here - invalid test
    iris_km_again.train(x=list(range(4)), training_frame=iris, fold_column=fold_column)

    print("wss")
    # BUG FIX: list.sort() returns None, so the old `withinss().sort()` compared
    # None == None and the assert could never fail.  Use sorted() instead.
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def parametersKmeans():
    """Rebuild a KMeans model from its extracted parameters and check both fits agree."""
    print("Getting data...")
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")
    iris_km = H2OKMeansEstimator(k=3, seed=1234)
    iris_km.train(x=list(range(4)), training_frame=iris)

    # Collect the actual parameter values the first model ran with.
    parameters = iris_km._model_json['parameters']
    param_dict = {pp['name']: pp['actual_value'] for pp in parameters}
    fold_column = param_dict.pop('fold_column')
    # These are per-training settings, not constructor arguments.
    del param_dict["model_id"]
    del param_dict['training_frame']
    del param_dict['validation_frame']
    del param_dict['max_runtime_secs']

    iris_km_again = H2OKMeansEstimator(**param_dict)  # not all parameters go here - invalid test
    # remove assigning the x parameter to prevent H2OValueError: Properties x and ignored_columns cannot be specified simultaneously
    iris_km_again.train(training_frame=iris, fold_column=fold_column)

    print("wss")
    # BUG FIX: list.sort() returns None, so the old `withinss().sort()` compared
    # None == None and the assert could never fail.  Use sorted() instead.
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def test_kmeans_fields(self):
    """
    Build one KMeans model from legal hyper-parameter values and one from bad
    ones (max_iterations=0), then verify both models expose a model_summary of
    the same type; flag the test as failed otherwise.
    """
    print("*******************************************************************************************")
    h2o.cluster_info()

    valid_params = {'max_iterations': 20, 'k': 6, 'init': 'Furthest', 'seed': 1464891169}
    valid_train_params = {'max_runtime_secs': 0.014673351}
    good_model = H2OKMeansEstimator(**valid_params)
    good_model.train(x=self.x_indices, training_frame=self.training1_data, **valid_train_params)

    invalid_params = {'init': 'Random', 'seed': 1464888628, 'k': 6, 'max_iterations': 0}
    invalid_train_params = {'max_runtime_secs': 0.007948218600000001}
    bad_model = H2OKMeansEstimator(**invalid_params)
    bad_model.train(x=self.x_indices, training_frame=self.training1_data, **invalid_train_params)

    good_model_type = type(good_model._model_json['output']['model_summary'])
    bad_model_type = type(bad_model._model_json['output']['model_summary'])
    print("good_model._model_json['output']['model_summary'] type is {0}. \n"
          "bad_model._model_json['output']['model_summary'] type is "
          "{1}".format(good_model_type, bad_model_type))

    if good_model_type == bad_model_type:
        print("The fields are of the same type.")
    else:
        print("They are not equal for some reason....")
        self.test_failed = 1
def convergeKmeans():
    """Check that KMeans run one iteration at a time reaches the same centers as
    a single run with the same total number of iterations, and that the centers
    have effectively converged."""
    # connect to localhost:54321
    ozone_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10
    start = ozone_h2o[0:10, 0:4]
    feature_cols = list(range(ozone_h2o.ncol))

    # requesting 0 iterations is illegal and must raise
    try:
        H2OKMeansEstimator(max_iterations=0).train(x=feature_cols, training_frame=ozone_h2o)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # run k-means in a loop of miters iterations with max_iter = 1,
    # feeding each round's centers back in as the starting points
    centers = start
    for _ in range(miters):
        rep_fit = H2OKMeansEstimator(k=ncent, user_points=centers, max_iterations=1)
        rep_fit.train(x=feature_cols, training_frame=ozone_h2o)
        centers = h2o.H2OFrame(rep_fit.centers())

    # run k-means once with max_iter = miters
    all_fit = H2OKMeansEstimator(k=ncent, user_points=start, max_iterations=miters)
    all_fit.train(x=feature_cols, training_frame=ozone_h2o)
    assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

    # one more iteration from the fitted centers should barely move them
    all_fit2 = H2OKMeansEstimator(k=ncent,
                                  user_points=h2o.H2OFrame(all_fit.centers()),
                                  max_iterations=1)
    all_fit2.train(x=feature_cols, training_frame=ozone_h2o)

    total_shift = sum(
        sum(pow(e1 - e2, 2) for e1, e2 in zip(c1, c2))
        for c1, c2 in zip(all_fit.centers(), all_fit2.centers())
    )
    avg_change = old_div(total_shift, ncent)
    assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] == miters
def get_modelKmeans():
    """Build H2O KMeans models for k=2..6 on benign.csv, re-fetch each by id,
    and run scikit-learn KMeans alongside for comparison.

    Fixes: Python-2 print statements converted to print() calls, and range()
    wrapped in list() for Python 3.
    """
    # connect to localhost:54321
    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()
    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")

    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def test_kmeans_cv():
    """Train KMeans with 3-fold CV and check centroid stats are present on each
    fold model but absent from the aggregated cross-validation metrics.

    Fix: the three copy-pasted per-fold checks (whose comments all wrongly said
    "model 3") are collapsed into a loop.
    """
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    km_model = H2OKMeansEstimator(k=3, nfolds=3, estimate_k=True)
    km_model.train(x=list(range(4)), training_frame=data)
    centers = km_model.centers()
    print(centers)

    # each of the 3 cross-validation models must carry centroid stats
    for fold in range(3):
        cv_model = h2o.get_model(
            km_model._model_json['output']['cross_validation_models'][fold]['name'])
        print(cv_model)
        assert cv_model._model_json['output']['training_metrics']['centroid_stats'] is not None

    # the aggregated cross-validation metrics do not have centroid stats
    print(km_model._model_json['output']['cross_validation_metrics'])
    assert km_model._model_json['output']['cross_validation_metrics']['centroid_stats'] is None
def test_kmeans_grid_search_over_validation_datasets(self):
    """
    test_kmeans_grid_search_over_validation_datasets performs the following:
    a. build H2O kmeans models using grid search.
    b. For each model built using grid search, print out the total_sum_squares errors.
    c. If an exception was thrown, mark the test as failed.
    """
    print("*******************************************************************************************")
    print("test_kmeans_grid_search_over_validation_datasets for kmeans ")
    h2o.cluster_info()
    print("Hyper-parameters used here is {0}".format(self.hyper_params))

    # start grid search
    grid_model = H2OGridSearch(H2OKMeansEstimator(), hyper_params=self.hyper_params)
    grid_model.train(x=self.x_indices, training_frame=self.training1_data)

    for each_model in grid_model:
        metrics = each_model._model_json["output"]["validation_metrics"]
        if metrics is None or metrics._metric_json is None:
            print('model._model_json["output"]["validation_metrics"] of a model is None for some reason....')
        else:
            grid_model_metrics = metrics._metric_json['totss']
            print("total sum of squares of a model is: {0}".format(grid_model_metrics))
def prostateKmeans():
    """Run H2O and scikit-learn KMeans side by side on prostate data for k=5..8.

    Fixes: Python-2 print statement converted to print(), and range() wrapped in
    list() for Python 3.
    """
    # connect to localhost:54321
    #Log.info("Importing prostate.csv data...\n")
    prostate_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    #prostate.summary()
    prostate_sci = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate_train.csv"),
                              delimiter=',', skiprows=1)
    prostate_sci = prostate_sci[:, 1:]  # drop the first column

    from h2o.estimators.kmeans import H2OKMeansEstimator
    for i in range(5, 9):
        #Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
        # column 0 is skipped on the H2O side as well
        prostate_km_h2o = H2OKMeansEstimator(k=i)
        prostate_km_h2o.train(x=list(range(1, prostate_h2o.ncol)),
                              training_frame=prostate_h2o)
        prostate_km_h2o.show()

        prostate_km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        prostate_km_sci.fit(prostate_sci)
        print(prostate_km_sci.cluster_centers_)
def iris_h2o_vs_sciKmeans():
    """Compare H2O and scikit-learn KMeans centers on iris when both start from
    the same initial points.

    Fixes: Python-2 print statements converted to print(); zip() results
    materialized with list() for Python 3; and the center comparison now uses
    abs() — the old `(hpoint - spoint) < 1e-10` passed trivially whenever the
    difference was negative.
    """
    # connect to localhost:54321
    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(list(zip(*s)))

    h2o_km = H2OKMeansEstimator(k=3, user_points=start, standardize=False)
    h2o_km.train(x=list(range(4)), training_frame=iris_h2o)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    sci_centers = list(zip(*sci_centers))

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def benignKmeans():
    """Run H2O and scikit-learn KMeans on benign.csv for k=1..6 and print centers.

    Fixes: Python-2 print statements converted to print(), and range() wrapped
    in list() for Python 3.
    """
    # connect to localhost:54321
    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()
    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")

    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    from h2o.estimators.kmeans import H2OKMeansEstimator
    for i in range(1, 7):
        benign_h2o_km = H2OKMeansEstimator(k=i)
        benign_h2o_km.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("sckit centers")
        print(benign_sci_km.cluster_centers_)
def attack(train, x):
    """Fuzz-test KMeans: pick a random subset of parameters with random values,
    print them, and train a model on `train` over columns `x`."""
    kwargs = {'k': random.randint(1, 20)}

    # randomly toggle the optional parameters
    if random.randint(0, 1):
        kwargs['model_id'] = "my_model"
    if random.randint(0, 1):
        kwargs['max_iterations'] = random.randint(1, 1000)
    if random.randint(0, 1):
        kwargs['standardize'] = [True, False][random.randint(0, 1)]
    if random.randint(0, 1):
        method = random.randint(0, 3)
        if method == 3:
            # Can be simplified to: train[x].mean() + (train[x].runif() - 0.5)*200
            # once .runif() is fixed
            s = [[train[c].mean().getrow()[0] + random.uniform(-100, 100)
                  for p in range(kwargs['k'])] for c in x]
            print("s: {0}".format(s))
            start = h2o.H2OFrame(list(zip(*s)))
            kwargs['user_points'] = start
        else:
            kwargs['init'] = ["Furthest", "Random", "PlusPlus"][method]
    if random.randint(0, 1):
        kwargs['seed'] = random.randint(1, 10000)

    # display the parameters and their corresponding values
    print("-----------------------")
    print("x: {0}".format(x))
    for name, value in kwargs.items():
        if name == 'user_points':
            print(name + ": ")
            start.show()
        else:
            print(name + ": {0}".format(value))

    H2OKMeansEstimator(**kwargs).train(x=x, training_frame=train)
    print("-----------------------")
def pyunit_model_params():
    """Train a small KMeans model and print its parameter dictionaries."""
    frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = H2OKMeansEstimator(k=4)
    model.train(x=list(range(frame.ncol)), training_frame=frame)
    print(model.params)
    print(model.full_parameters)
def _get_kmeans_model(predictor_col, response_col, train_f, val_f):
    """Train and return a 2-cluster KMeans model on the given frames.

    Note: response_col is accepted for signature compatibility but unused.
    """
    from h2o.estimators.kmeans import H2OKMeansEstimator
    estimator = H2OKMeansEstimator(k=2, max_iterations=1000000)
    estimator.train(x=predictor_col, training_frame=train_f, validation_frame=val_f)
    return estimator
def kmeans_start(grid_id, export_dir, train, params, hyper_parameters):
    """Start (asynchronously) a recoverable KMeans grid search and return the grid."""
    search = H2OGridSearch(
        H2OKMeansEstimator(),
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
    )
    search.start(x=list(range(4)), training_frame=train, **params)
    return search
def k_means_export():
    """Export a trivial (k=1) KMeans model both as a POJO and as a MOJO."""
    print("###### K MEANS ######")
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    model = H2OKMeansEstimator(k=1)
    model.train(x=list(range(frame.ncol)), training_frame=frame)
    h2o.download_pojo(model, path=RESULT_DIR)
    model.download_mojo(path=RESULT_DIR)
def kmeans_model(df, xValues):
    """Cluster `df` with KMeans (k=3) and compute PCA features plus model metrics.

    Returns a list: [data with cluster column, pca features, pca metrics,
    centroids, between-cluster SSE, within-cluster SSE, cluster sizes].
    """
    hf = h2o.H2OFrame(df)
    # NOTE(review): the split result is never used downstream — kept for behavior parity
    train, valid, test = hf.split_frame(ratios=[.8, .1])

    # kmeans model
    kmeans = H2OKMeansEstimator(k=3, max_iterations=5, seed=10,
                                categorical_encoding="AUTO", max_runtime_secs=10)
    kmeans.train(xValues, training_frame=hf)

    # pca model, generate Principal Components for further modelling or plotting
    pca = H2OPrincipalComponentAnalysisEstimator(k=4)
    pca.train(list(df.columns), training_frame=hf)
    pca_features = pca.predict(hf).as_data_frame()
    pca_metric = pca.summary().as_data_frame()

    # model metrics
    cluster_column = kmeans.predict(hf).as_data_frame()
    inter_cluster_error = kmeans.betweenss()  # The Between Cluster Sum-of-Square Error
    intra_cluster_error = kmeans.withinss()   # Within Cluster Sum-of-Square Error
    centroids = kmeans.centers()              # Centroids
    cluster_size = kmeans.size()              # Size of clusters

    transformed_data = pd.concat([df, cluster_column], axis=1)
    return [transformed_data, pca_features, pca_metric, centroids,
            inter_cluster_error, intra_cluster_error, cluster_size]
def k_means(xval=None, sample_size=None, nfolds=None, hparams=None):
    """
    create a k-means algorithm estimator
    :param xval: if for cross-validation
    :param sample_size: training set sample amount
    :param nfolds: k value for k-fold cross-validation
    :param hparams: hyper parameters for grid search
    :return: a constructed k-means estimator, a parameters' dict for grid search
    """
    # BUG FIX: the old code compared `sample_size <= 10000` directly, which raises
    # TypeError on Python 3 when sample_size is left as its default None.
    # Treat None as 0, i.e. fall into the small-dataset defaults.
    if sample_size is None:
        sample_size = 0

    if sample_size <= 10000:
        default_nfolds = 3 if sample_size < 5000 else 5
        k_opts = [3, 5, 10]
        max_iterations_opts = [5, 10, 20]
        standardize_opts = [0.1, 0.6, 0.8]
    elif 10000 < sample_size <= 100000:
        default_nfolds = 3
        k_opts = [3, 5, 10]
        max_iterations_opts = [5, 10, 20]
        standardize_opts = [0.1, 0.6]
    else:
        default_nfolds = 2
        k_opts = [3, 5, 10]
        max_iterations_opts = [5, 10]
        standardize_opts = [0.1, 0.6]
    default_hparams = {'k': k_opts,
                       'max_iterations': max_iterations_opts,
                       'standardize': standardize_opts}

    if nfolds is None:
        nfolds = default_nfolds
    if hparams is None:
        hparams = default_hparams

    if xval:
        km_estimator = H2OKMeansEstimator(nfolds=nfolds)
    else:
        km_estimator = H2OKMeansEstimator()
    return km_estimator, hparams
def km_num_iterations():
    """Verify that KMeans honors max_iterations=4.

    Fixes: range() wrapped in list() for Python 3, and the assertion message
    corrected — the check is `<= 4`, so the expectation is "at most 4".
    """
    # connect to localhost:54321
    prostate_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    from h2o.estimators.kmeans import H2OKMeansEstimator
    prostate_km_h2o = H2OKMeansEstimator(k=3, max_iterations=4)
    # column 0 is excluded from training
    prostate_km_h2o.train(training_frame=prostate_h2o,
                          x=list(range(1, prostate_h2o.ncol)))

    num_iterations = prostate_km_h2o.num_iterations()
    assert num_iterations <= 4, \
        "Expected at most 4 iterations, but got {0}".format(num_iterations)
def emptyclusKmeans():
    """Check that both H2O and scikit-learn tolerate badly initialized centers
    that produce empty clusters.

    Fixes for Python 3: print statements converted to print(); `ncent / 2`
    replaced with integer division (random.randint rejects float bounds); the
    transposed centers are materialized with list() before building the H2OFrame.
    """
    # connect to localhost:54321
    #Log.info("Importing ozone.csv data...\n")
    ozone_sci = np.loadtxt(pyunit_utils.locate("smalldata/glm_test/ozone.csv"),
                           delimiter=',', skiprows=1)
    ozone_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))

    ncent = 10
    nempty = random.randint(1, ncent // 2)
    initial_centers = [[41, 190, 67, 7.4], [36, 118, 72, 8], [12, 149, 74, 12.6],
                       [18, 313, 62, 11.5], [23, 299, 65, 8.6], [19, 99, 59, 13.8],
                       [8, 19, 61, 20.1], [16, 256, 69, 9.7], [11, 290, 66, 9.2],
                       [14, 274, 68, 10.9]]
    # overwrite a few centers with far-away points so their clusters start out empty
    for i in random.sample(range(ncent - 1), nempty):
        initial_centers[i] = [100 * i for z in range(1, len(initial_centers[0]) + 1)]

    initial_centers_sci = np.asarray(initial_centers)
    initial_centers = list(zip(*initial_centers))
    initial_centers_h2o = h2o.H2OFrame(initial_centers)

    #Log.info("Initial cluster centers:")
    print("H2O initial centers:")
    initial_centers_h2o.show()
    print("scikit initial centers:")
    print(initial_centers_sci)

    # H2O can handle empty clusters and so can scikit
    #Log.info("Check that H2O can handle badly initialized centers")
    km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1)
    km_sci.fit(preprocessing.scale(ozone_sci))
    print("scikit final centers")
    print(km_sci.cluster_centers_)

    from h2o.estimators.kmeans import H2OKMeansEstimator
    km_h2o = H2OKMeansEstimator(k=ncent, user_points=initial_centers_h2o, standardize=True)
    km_h2o.train(x=list(range(ozone_h2o.ncol)), training_frame=ozone_h2o)
    print("H2O final centers")
    print(km_h2o.centers())
def kmeans_mllib():
    """Compare H2O k-means++ cluster centers and average WCSSE against stored
    MLlib reference results on the BigCross dataset.

    Fix: the two WCSSE print calls were missing their {0} placeholder, so the
    computed values were never actually printed.
    """
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_file(url)
    n = cross_h2o.nrow

    # reference file: one row per k, columns are (k, wcsse)
    err_mllib = np.genfromtxt(pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
                              delimiter=",", skip_header=1)
    ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

    for k in ncent:
        print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
        cross_km = H2OKMeansEstimator(training_frame=cross_h2o, k=k, init="PlusPlus",
                                      max_iterations=10, standardize=False)
        cross_km.train()

        clust_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"),
            delimiter=",").tolist()
        clust_h2o = cross_km.centers()

        # Sort in ascending order by first dimension for comparison purposes
        clust_mllib.sort(key=lambda x: x[0])
        clust_h2o.sort(key=lambda x: x[0])
        print("\nMLlib Cluster Centers:\n")
        print(clust_mllib)
        print("\nH2O Cluster Centers:\n")
        print(clust_h2o)

        wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
        wcsse_h2o = old_div(cross_km.tot_withinss(), n)
        print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
        print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
        assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                         "got {1}".format(wcsse_mllib, wcsse_h2o)
def tuneAndTrain(trainDataFrame):
    """Grid-search a KMeans model over k on the given pandas frame and return the
    top model when the grid is sorted by mse (descending, as before)."""
    h2o.init()
    trainDataHex = h2o.H2OFrame(trainDataFrame)

    # mark categorical columns as factors before clustering
    dc = DataCollection()
    categoricalColumns = dc.findCategorical(trainDataFrame)
    trainDataHex[categoricalColumns] = trainDataHex[categoricalColumns].asfactor()

    k = len(trainDataFrame) - 1
    hyperParameters = {"k": k}
    modelGrid = H2OGridSearch(H2OKMeansEstimator(ignore_const_cols=False),
                              hyper_params=hyperParameters)
    modelGrid.train(x=list(range(0, int(len(trainDataFrame.columns)))),
                    training_frame=trainDataHex)

    # NOTE(review): decreasing=True with sort_by='mse' ranks largest mse first —
    # confirm this ordering is intended before changing it
    rankedGrid = modelGrid.get_grid(sort_by='mse', decreasing=True)
    return rankedGrid.models[0]
def benign_kmeans():
    """Train H2O and scikit-learn KMeans on benign.csv for k = 1..6, printing centers."""
    print("Importing benign.csv data...")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")

    # Impute missing values with column mean
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    benign_sci = mean_imputer.fit_transform(benign_sci)

    for n_clusters in range(1, 7):
        print("H2O K-Means with " + str(n_clusters) + " clusters:")
        h2o_model = H2OKMeansEstimator(k=n_clusters)
        h2o_model.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        print("H2O centers")
        print(h2o_model.centers())

        sci_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1)
        sci_model.fit(benign_sci)
        print("sckit centers")
        print(sci_model.cluster_centers_)
def ozoneKM():
    """Train a k=10 PlusPlus-initialized KMeans model on ozone.csv and score it
    back on the training data."""
    # connect to localhost:54321
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))
    # See that the data is ready
    print(train.describe())

    # Run KMeans
    from h2o.estimators.kmeans import H2OKMeansEstimator
    model = H2OKMeansEstimator(k=10, init="PlusPlus", max_iterations=100)
    model.train(x=list(range(train.ncol)), training_frame=train)
    model.show()
    model.summary()

    predictions = model.predict(train)
    predictions.describe()
def get_model_kmeans():
    """Train KMeans for k = 2..6, re-fetch each model by id, and run scikit-learn
    KMeans alongside for comparison."""
    print("Importing benign.csv data...")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")

    # Impute missing values with column mean
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    benign_sci = imputer.fit_transform(benign_sci)

    for n_clusters in range(2, 7):
        h2o_model = H2OKMeansEstimator(k=n_clusters)
        h2o_model.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        h2o_model.show()

        # re-fetching the model by id must yield a usable model
        fetched = h2o.get_model(h2o_model._id)
        fetched.show()

        sci_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1)
        sci_model.fit(benign_sci)
        print("sckit centers")
        print(sci_model.cluster_centers_)
def build_mojo_pipeline():
    """Chain a PCA MOJO into a KMeans MOJO with hex.genmodel BuildPipeline and
    check the combined pipeline reproduces the in-cluster predictions exactly."""
    results_dir = pyunit_utils.locate("results")
    iris_csv = pyunit_utils.locate('smalldata/iris/iris_train.csv')
    iris = h2o.import_file(iris_csv)

    # train PCA, then train KMeans on the PCA-projected frame
    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.train(training_frame=iris)
    principal_components = pca.predict(iris)
    km = H2OKMeansEstimator(k=3)
    km.train(training_frame=principal_components)

    # export both models as MOJOs (genmodel jar comes with the KMeans MOJO)
    pca_mojo_path = pca.download_mojo(path=results_dir)
    km_mojo_path = km.download_mojo(get_genmodel_jar=True, path=results_dir)

    # assemble the BuildPipeline command: map each PC column to output i of the PCA MOJO
    java_cmd = [
        "java", "-cp",
        os.path.join(results_dir, "h2o-genmodel.jar"),
        "hex.genmodel.tools.BuildPipeline", "--mapping"
    ]
    pca_mojo_name = os.path.basename(pca_mojo_path).split('.')[0]
    for i, pc in enumerate(principal_components.columns):
        mapping = pc + '=' + pca_mojo_name + ':' + str(i)
        java_cmd += [mapping]
    java_cmd += [
        "--output",
        os.path.join(results_dir, "pipe.zip"), "--input", km_mojo_path,
        pca_mojo_path
    ]
    # run BuildPipeline; communicate() blocks until the jar finishes
    subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT).communicate()

    # score the raw CSV through the combined pipeline MOJO and compare with
    # in-cluster predictions on the PCA frame — every row must match
    h2o_preds = km.predict(principal_components)
    mojo_preds_raw = h2o.mojo_predict_csv(input_csv_path=iris_csv,
                                          mojo_zip_path=os.path.join(
                                              results_dir, "pipe.zip"))
    mojo_preds = h2o.H2OFrame([c['cluster'] for c in mojo_preds_raw],
                              column_names=['predict'])
    assert (mojo_preds == h2o_preds).mean()[0, "predict"] == 1
def test_kmeans_hangup(self):
    """
    train a kmeans model with some parameters that will make the system hang.
    """
    print("*******************************************************************************************")
    h2o.cluster_info()

    # tiny max_runtime_secs is the trigger condition under test
    ctor_params = {'seed': 1464837706, 'max_iterations': 50, 'init': 'Furthest', 'k': 5}
    train_params = {'max_runtime_secs': 0.001}

    model = H2OKMeansEstimator(**ctor_params)
    model.train(x=self.x_indices, training_frame=self.training1_data, **train_params)
    print("Finished.")
def attack(train, x):
    """Fuzz-test KMeans: pick a random subset of parameters with random values,
    print them, and train a model on `train` over columns `x`.

    Fixes for Python 3: print statements converted to print(), and the
    redundant zip(kwargs.keys(), kwargs.values()) replaced with kwargs.items().
    """
    kwargs = {}

    # randomly select parameters and their corresponding values
    kwargs['k'] = random.randint(1, 20)
    if random.randint(0, 1):
        kwargs['model_id'] = "my_model"
    if random.randint(0, 1):
        kwargs['max_iterations'] = random.randint(1, 1000)
    if random.randint(0, 1):
        kwargs['standardize'] = [True, False][random.randint(0, 1)]
    if random.randint(0, 1):
        method = random.randint(0, 3)
        if method == 3:
            s = [[random.uniform(train[c].mean()[0] - 100, train[c].mean()[0] + 100)
                  for p in range(kwargs['k'])] for c in x]
            print("s: {0}".format(s))
            start = h2o.H2OFrame(s)
            kwargs['user_points'] = start
        else:
            kwargs['init'] = ["Furthest", "Random", "PlusPlus"][method]
    if random.randint(0, 1):
        kwargs['seed'] = random.randint(1, 10000)

    # display the parameters and their corresponding values
    print("-----------------------")
    print("x: {0}".format(x))
    for k, v in kwargs.items():
        if k == 'user_points':
            print(k + ": ")
            start.show()
        else:
            print(k + ": {0}".format(v))

    H2OKMeansEstimator(**kwargs).train(x=x, training_frame=train)
    print("-----------------------")
def KMeans_ClusteringH2O(data, metric, parameters):
    """Segment `data` on `metric` via a KMeans grid search and return a frame
    with a 1-based `<metric>_segment` column appended.

    On any failure the input `data` is returned unchanged (best-effort), but —
    unlike the original bare `except:` — the error is now logged instead of
    being silently swallowed.
    """
    try:
        h2o.init()
        rfm_data = h2o.H2OFrame(data)
        train, valid = rfm_data.split_frame(
            ratios=[constants.clustering_parameters['split_ratio']],
            seed=constants.clustering_parameters['seed'])

        # base estimator used as the template for the grid search
        rfm_kmeans = H2OKMeansEstimator(
            k=constants.clustering_parameters['k'],
            seed=constants.clustering_parameters['seed'],
            max_iterations=int(len(data) / 2))
        rfm_kmeans.train(x=metric, training_frame=train, validation_frame=valid)

        grid = H2OGridSearch(
            model=rfm_kmeans,
            hyper_params=constants.clustering_parameters['hyper_params'],
            search_criteria=constants.clustering_parameters['search_criteria'])
        # train using the grid
        grid.train(x=metric, training_frame=train, validation_frame=valid)

        # sort the grid models by total within cluster sum-of-square error.
        sorted_grid = grid.get_grid(sort_by='tot_withinss', decreasing=False)
        prediction = sorted_grid[0].predict(rfm_data)

        data = rfm_data.concat(prediction, axis=1)[[metric, 'predict']].as_data_frame(use_pandas=True)
        data = data.rename(columns={'predict': metric + '_segment'})
        # shift cluster ids from 0-based to 1-based
        data[metric + '_segment'] = data[metric + '_segment'].apply(lambda x: x + 1)

        if parameters['is_h2o_cluster_shut_down']:
            h2o.shutdown(prompt=False)
    except Exception:
        # BUG FIX: was a bare `except:` that silently swallowed every error
        # (including KeyboardInterrupt/SystemExit); narrow it and log the
        # traceback so failures are visible, keeping the best-effort return.
        import traceback
        traceback.print_exc()
        if parameters['is_h2o_cluster_shut_down']:
            h2o.shutdown(prompt=False)
    return data
def test_constrained_kmeans():
    """Train constrained KMeans on iris for several cluster-size constraint sets
    (with and without standardization) and verify every cluster meets its
    minimum-size constraint."""
    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    k = 3
    start = h2o.H2OFrame([[4.9, 3.0, 1.4, 0.2],
                          [5.6, 2.5, 3.9, 1.1],
                          [6.5, 3.0, 5.2, 2.0]])
    constraints = [[100, 40, 1], [100, 1, 1], [1, 100, 1], [1, 40, 100], [1, 1, 1],
                   [1, 1, 148], [147, 1, 1], [1, 148, 1], [1, 1, 1], [50, 50, 50]]

    for i in range(len(constraints)):
        for standardize in [True, False]:
            print("===== Train KMeans model with constraints: ======")
            print(constraints[i])
            model = H2OKMeansEstimator(k=k,
                                       user_points=start,
                                       standardize=standardize,
                                       cluster_size_constraints=constraints[i],
                                       score_each_iteration=True)
            model.train(x=list(range(4)), training_frame=iris_h2o)
            model.show()

            centroid_stats = model._model_json['output'][
                'training_metrics']._metric_json['centroid_stats']
            for j in range(k):
                number_points = centroid_stats._cell_values[j][2]
                assert number_points >= constraints[i][j], \
                    "Number of points (" + str(number_points) + ") in cluster " + \
                    str(i + 1) + " should be >= constraint value (" + \
                    str(constraints[i][j]) + ")"