def setup_grid_params(self):
    """
    Prepare the randomized grid-search hyper-parameters for the GLM test.

    Steps performed:
    1. Train a bare-bones GLM and store its runtime in self.one_model_time.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults, based on the parameters in the model JSON.
    3. Pre-assign fold_assignment and missing_values_handling, then let
       pyunit_utils.gen_grid_search produce the remaining hyper-parameters
       (self.exclude_parameter_lists is handed over as the exclusion list).
    4. Rescale lambda and max_runtime_secs, count the possible models and
       save the hyper-parameters as JSON.

    :return: None
    """
    # a minimal model is enough to expose the full parameter list
    base_model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    # training time of the base model, used below to scale max_runtime_secs
    self.one_model_time = pyunit_utils.find_grid_runtime([base_model])
    print("Time taken to build a base barebone model is {0}".format(self.one_model_time))

    # griddable parameter names together with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

    # hyper-parameters that are fixed up front rather than generated
    self.hyper_params = {
        "fold_assignment": ['AUTO', 'Random', 'Modulo'],
        "missing_values_handling": ['MeanImputation', 'Skip'],
    }

    # generate values for the remaining griddable parameters
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    # lambda values are multiplied by self.lambda_scale; max_runtime_secs values
    # are multiplied by self.max_runtime_scale times the base model runtime
    time_scale = self.max_runtime_scale * self.one_model_time
    for name, factor in (("lambda", self.lambda_scale), ("max_runtime_secs", time_scale)):
        if name in self.hyper_params:
            self.hyper_params[name] = [factor * value for value in self.hyper_params[name]]

    # number of possible models being built
    self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

    # save hyper-parameters in sandbox and current test directories
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.hyper_params)
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters for the GBM test.

    Steps performed:
    1. Train a bare-bones GBM, store its runtime in self.model_run_time and
       derive self.min_runtime_per_tree from the tree count in the model summary.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults.
    3. Generate hyper-parameter values with pyunit_utils.gen_grid_search and
       scale max_runtime_secs by self.time_scale * self.model_run_time.
    4. Let pyunit_utils.check_and_count_models produce self.final_hyper_params
       and the model count, make sure max_runtime_secs and seed are part of the
       final set, and write the final hyper-parameters to JSON.

    :return: None
    """
    # a minimal model is enough to expose the full parameter list
    base_model = H2OGradientBoostingEstimator(distribution=self.family, seed=self.seed,
                                              nfolds=self.nfolds)
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # runtime per tree; fall back to the whole runtime when no tree was built
    summary = base_model._model_json["output"]["model_summary"]
    tree_column = summary.col_header.index('number_of_trees')
    num_trees = summary.cell_values[0][tree_column]
    self.min_runtime_per_tree = (self.model_run_time if num_trees == 0
                                 else self.model_run_time / num_trees)

    # griddable parameter names together with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

    # generate hyper-parameter values; per the original notes these may
    # deliberately include values outside the legal range
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    # express max_runtime_secs relative to the base model runtime
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            time_scale * secs for secs in self.hyper_params["max_runtime_secs"]]

    # final_hyper_params is the set actually used for the grid search
    self.possible_number_models, self.final_hyper_params = pyunit_utils.check_and_count_models(
        self.hyper_params, self.params_zero_one, self.params_more_than_zero,
        self.params_more_than_one, self.params_zero_positive, self.max_grid_model)

    # max_runtime_secs must always be in the grid to bound the test run time
    if "max_runtime_secs" not in self.final_hyper_params and "max_runtime_secs" in self.hyper_params:
        runtimes = self.hyper_params["max_runtime_secs"]
        self.final_hyper_params["max_runtime_secs"] = runtimes
        # only non-negative runtimes count towards the possible models
        self.possible_number_models *= sum(1 for secs in runtimes if secs >= 0)

    if "fold_assignment" in self.final_hyper_params:
        self.possible_number_models *= self.scale_model

    self.final_hyper_params["seed"] = [self.seed]  # fixed seed makes the test more repeatable

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters for the deep learning test.

    Steps performed:
    1. Train a bare-bones deep learning model (three hidden layers of 10 nodes),
       store its runtime in self.model_run_time and derive
       self.min_runtime_per_iteration from the scoring history.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults.
    3. Generate hyper-parameter values with pyunit_utils.gen_grid_search, then
       rescale or regenerate several of them (max_runtime_secs, epsilon, the
       dropout ratios, hidden, epochs).
    4. Let pyunit_utils.check_and_count_models produce self.final_hyper_params
       and the model count, make sure max_runtime_secs is part of the final
       set, and write the final hyper-parameters to JSON.

    :return: None
    """
    # a minimal model is enough to expose the full parameter list
    base_model = H2ODeepLearningEstimator(distribution=self.family, seed=self.seed,
                                          nfolds=self.nfolds, hidden=[10, 10, 10])
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # runtime per iteration, read from the row at index 2 of the scoring history;
    # fall back to the whole runtime when the iteration count is zero
    history = base_model._model_json["output"]["scoring_history"]
    iteration_column = history.col_header.index('iterations')
    num_iterations = history.cell_values[2][iteration_column]
    self.min_runtime_per_iteration = (self.model_run_time if num_iterations == 0
                                      else self.model_run_time / num_iterations)

    # griddable parameter names together with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

    # generate hyper-parameter values; per the original notes these may
    # deliberately include values outside the legal range
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    # rescale the generated values so that they make sense for these parameters
    time_scale = self.time_scale * self.model_run_time
    scale_factors = (
        ("max_runtime_secs", time_scale),
        ("epsilon", 1e-4),
        ("input_dropout_ratio", 0.5),
        ("hidden_dropout_ratio", 0.5),
    )
    for name, factor in scale_factors:
        if name in self.hyper_params:
            self.hyper_params[name] = [factor * value for value in self.hyper_params[name]]

    if "hidden" in self.hyper_params:  # need to change this up
        # draw between one and three entries, each a random node count
        num_layer = random.randint(1, 3)
        self.hyper_params["hidden"] = [random.randint(1, self.max_int_val)
                                       for _ in range(0, num_layer)]

    if "epochs" in self.hyper_params:
        self.hyper_params["epochs"] = [random.randint(self.min_int_val, self.max_int_val)
                                       for _ in range(0, self.max_int_number)]

    # final_hyper_params is the set actually used for the grid search
    self.possible_number_models, self.final_hyper_params = pyunit_utils.check_and_count_models(
        self.hyper_params, self.params_zero_one, self.params_more_than_zero,
        self.params_more_than_one, self.params_zero_positive, self.max_grid_model)

    # max_runtime_secs must always be in the grid to bound the test run time
    if "max_runtime_secs" not in self.final_hyper_params and "max_runtime_secs" in self.hyper_params:
        runtimes = self.hyper_params["max_runtime_secs"]
        self.final_hyper_params["max_runtime_secs"] = runtimes
        # only non-negative runtimes count towards the possible models
        self.possible_number_models *= sum(1 for secs in runtimes if secs >= 0)

    # make correction for stratified not being a legal argument
    if "fold_assignment" in self.final_hyper_params:
        self.possible_number_models = self.possible_number_models * 3 / 4

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Build the GBM grid-search hyper-parameters used by the rest of the test.

    The method trains one bare-bones GBM to learn the base runtime
    (self.model_run_time) and the runtime per tree (self.min_runtime_per_tree),
    collects the griddable parameters through pyunit_utils.get_gridables,
    generates hyper-parameter values through pyunit_utils.gen_grid_search,
    scales max_runtime_secs, and finally stores the result of
    pyunit_utils.check_and_count_models in self.possible_number_models and
    self.final_hyper_params. The final hyper-parameters, with max_runtime_secs
    and seed added, are written to JSON.

    :return: None
    """
    # bare-bones model: only needed for its parameter list and timing
    gbm = H2OGradientBoostingEstimator(distribution=self.family, seed=self.seed,
                                       nfolds=self.nfolds)
    gbm.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([gbm])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # the whole runtime is used as the per-tree time when no tree was built
    model_summary = gbm._model_json["output"]["model_summary"]
    num_trees = model_summary["number_of_trees"][0]
    self.min_runtime_per_tree = (self.model_run_time if num_trees == 0
                                 else self.model_run_time / num_trees)

    # griddable parameter names together with their types and defaults
    gridable_info = pyunit_utils.get_gridables(gbm._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridable_info

    # generate hyper-parameter values; per the original notes these may
    # deliberately include values outside the legal range
    search_setup = pyunit_utils.gen_grid_search(
        gbm.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = search_setup

    # express max_runtime_secs relative to the base model runtime
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            time_scale * limit for limit in self.hyper_params["max_runtime_secs"]]

    # final_hyper_params is the set actually used for the grid search
    counted = pyunit_utils.check_and_count_models(
        self.hyper_params, self.params_zero_one, self.params_more_than_zero,
        self.params_more_than_one, self.params_zero_positive, self.max_grid_model)
    self.possible_number_models, self.final_hyper_params = counted

    # max_runtime_secs must always be in the grid to bound the test run time
    runtime_missing = "max_runtime_secs" not in self.final_hyper_params
    if runtime_missing and "max_runtime_secs" in self.hyper_params:
        limits = self.hyper_params["max_runtime_secs"]
        self.final_hyper_params["max_runtime_secs"] = limits
        # only non-negative runtimes count towards the possible models
        self.possible_number_models *= sum(1 for limit in limits if limit >= 0)

    if "fold_assignment" in self.final_hyper_params:
        self.possible_number_models *= self.scale_model

    self.final_hyper_params["seed"] = [self.seed]  # fixed seed makes the test more repeatable

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters for the PCA test.

    Steps performed:
    1. Train a bare-bones PCA model (k=10, no transform, self.pca_method) and
       store its runtime in self.model_run_time.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults.
    3. Generate hyper-parameter values with pyunit_utils.gen_grid_search and
       scale max_runtime_secs and max_iterations.
    4. Let pyunit_utils.check_and_count_models produce self.final_hyper_params
       and the model count, make sure max_runtime_secs and k are part of the
       final set, and write the final hyper-parameters to JSON.

    :return: None
    """
    # a minimal model is enough to expose the full parameter list
    base_model = H2OPCA(k=10, transform="NONE", pca_method=self.pca_method)
    base_model.train(x=self.x_indices, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # griddable parameter names together with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

    # generate hyper-parameter values; per the original notes these may
    # deliberately include values outside the legal range
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    # max_runtime_secs is scaled relative to the base model runtime,
    # max_iterations by self.max_iter_scale
    time_scale = self.time_scale * self.model_run_time
    for name, factor in (("max_runtime_secs", time_scale),
                         ("max_iterations", self.max_iter_scale)):
        if name in self.hyper_params:
            self.hyper_params[name] = [factor * value for value in self.hyper_params[name]]

    # final_hyper_params is the set actually used for the grid search
    self.possible_number_models, self.final_hyper_params = pyunit_utils.check_and_count_models(
        self.hyper_params, self.params_zero_one, self.params_more_than_zero,
        self.params_more_than_one, self.params_zero_positive, self.max_grid_model)

    # max_runtime_secs must always be in the grid to bound the test run time
    if "max_runtime_secs" not in self.final_hyper_params and "max_runtime_secs" in self.hyper_params:
        runtimes = self.hyper_params["max_runtime_secs"]
        self.final_hyper_params["max_runtime_secs"] = runtimes
        # only non-negative runtimes count towards the possible models
        self.possible_number_models *= sum(1 for secs in runtimes if secs >= 0)

    # k must be included in the hyper-parameters as well
    if 'k' not in self.final_hyper_params and 'k' in self.hyper_params:
        ranks = self.hyper_params["k"]
        self.final_hyper_params["k"] = ranks
        # only strictly positive k values count towards the possible models
        self.possible_number_models *= sum(1 for rank in ranks if rank > 0)

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters for the GBM test.

    Steps performed:
    1. Train a bare-bones GBM, measure its runtime and derive
       self.min_runtime_per_tree from the tree count in the model summary.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults.
    3. Generate hyper-parameter values with pyunit_utils.gen_grid_search and
       scale max_runtime_secs by self.time_scale times the measured runtime.
    4. Count the models with self.check_and_count_models(), add
       max_runtime_secs to self.final_hyper_params, reduce the count by the
       share of negative runtimes, and write the final hyper-parameters to JSON.

    :return: None
    """
    # a minimal model is enough to expose the full parameter list
    base_model = H2OGradientBoostingEstimator(distribution=self.family)
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    base_runtime = pyunit_utils.find_grid_runtime([base_model])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(base_runtime))

    # runtime per tree; fall back to the whole runtime when no tree was built
    summary = base_model._model_json["output"]["model_summary"]
    tree_column = summary.col_header.index('number_of_trees')
    num_trees = summary.cell_values[0][tree_column]
    self.min_runtime_per_tree = base_runtime if num_trees == 0 else base_runtime / num_trees

    # griddable parameter names together with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

    # generate hyper-parameter values; per the original notes these may
    # deliberately include values outside the legal range
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    # express max_runtime_secs relative to the base model runtime
    time_scale = self.time_scale * base_runtime
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            time_scale * secs for secs in self.hyper_params["max_runtime_secs"]]

    self.possible_number_models = self.check_and_count_models()

    # max_runtime_secs is always part of the final hyper-parameters
    runtimes = self.hyper_params["max_runtime_secs"]
    self.final_hyper_params["max_runtime_secs"] = runtimes

    # negative runtimes will not result in any model being built, so the count
    # is reduced to the fraction of runtimes that are non-negative
    time_len = len(runtimes)
    len_good_time = sum(1 for secs in runtimes if secs >= 0)
    self.possible_number_models = int(self.possible_number_models * len_good_time / time_len)

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Prepare two sets of grid-search hyper-parameters for the GLM test.

    Steps performed:
    1. Train a bare-bones GLM, measure its runtime and derive
       self.min_runtime_per_epoch from the iteration count in the model summary.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults.
    3. Build the "bad" set (self.hyper_params_bad / self.final_hyper_params_bad):
       values are generated with the real-valued range multiplied by
       self.alpha_scale, so they may fall outside the legal range. The
       resulting model count is stored in self.possible_number_models.
    4. Build the "good" set (self.hyper_params / self.final_hyper_params):
       values are generated with a lower bound of 0. The resulting model count
       is stored in self.true_correct_model_number.
    5. Write both final sets to their JSON files.

    :return: None
    """
    # a minimal model is enough to expose the full parameter list
    base_model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    base_runtime = pyunit_utils.find_grid_runtime([base_model])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(base_runtime))

    # runtime per iteration; fall back to the whole runtime when the count is zero
    summary = base_model._model_json["output"]["model_summary"]
    iteration_column = summary.col_header.index('number_of_iterations')
    num_iteration = summary.cell_values[0][iteration_column]
    self.min_runtime_per_epoch = (base_runtime if num_iteration == 0
                                  else base_runtime / num_iteration)

    # griddable parameter names together with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridables

    # ---- hyper-parameters that may contain illegal values ----
    generated_bad = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params_bad, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number),
        self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)
    (self.hyper_params_bad, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated_bad

    # scale the lambda values
    if "lambda" in self.hyper_params_bad:
        self.hyper_params_bad["lambda"] = [
            self.lambda_scale * value for value in self.hyper_params_bad["lambda"]]

    # express max_runtime_secs relative to the base model runtime
    time_scale = self.time_scale * base_runtime
    if "max_runtime_secs" in self.hyper_params_bad:
        self.hyper_params_bad["max_runtime_secs"] = [
            time_scale * secs for secs in self.hyper_params_bad["max_runtime_secs"]]

    counted_bad = pyunit_utils.check_and_count_models(
        self.hyper_params_bad, self.params_zero_one, self.params_more_than_zero,
        self.params_more_than_one, self.params_zero_positive, self.max_grid_model)
    self.possible_number_models, self.final_hyper_params_bad = counted_bad

    if ("max_runtime_secs" not in self.final_hyper_params_bad
            and "max_runtime_secs" in self.hyper_params_bad):
        bad_runtimes = self.hyper_params_bad["max_runtime_secs"]
        self.final_hyper_params_bad["max_runtime_secs"] = bad_runtimes
        # only non-negative runtimes count towards the possible models
        self.possible_number_models *= sum(1 for secs in bad_runtimes if secs >= 0)

    # Stratified is illegal for Gaussian GLM
    self.possible_number_models *= self.scale_model

    # ---- hyper-parameters restricted to good values (lower bounds of 0) ----
    generated_good = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, 0,
        random.randint(1, self.max_real_number), self.max_real_val, 0)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated_good

    # same scaling as for the bad set
    for name, factor in (("lambda", self.lambda_scale), ("max_runtime_secs", time_scale)):
        if name in self.hyper_params:
            self.hyper_params[name] = [factor * value for value in self.hyper_params[name]]

    counted_good = pyunit_utils.check_and_count_models(
        self.hyper_params, self.params_zero_one, self.params_more_than_zero,
        self.params_more_than_one, self.params_zero_positive, self.max_grid_model)
    self.true_correct_model_number, self.final_hyper_params = counted_good

    if "max_runtime_secs" not in self.final_hyper_params and "max_runtime_secs" in self.hyper_params:
        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
        self.true_correct_model_number *= len(self.final_hyper_params["max_runtime_secs"])

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename_bad, self.final_hyper_params_bad)
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Build the GBM grid-search hyper-parameters used by the rest of the test.

    The method trains one bare-bones GBM to measure the base runtime and the
    runtime per tree (self.min_runtime_per_tree), collects the griddable
    parameters through pyunit_utils.get_gridables, generates hyper-parameter
    values through pyunit_utils.gen_grid_search and scales max_runtime_secs.
    The model count comes from self.check_and_count_models() and is then
    reduced by the share of negative max_runtime_secs values. The final
    hyper-parameters, with max_runtime_secs added, are written to JSON.

    :return: None
    """
    # bare-bones model: only needed for its parameter list and timing
    gbm = H2OGradientBoostingEstimator(distribution=self.family)
    gbm.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    train_time = pyunit_utils.find_grid_runtime([gbm])  # base model train time
    print("Time taken to build a base barebone model is {0}".format(train_time))

    # the whole runtime is used as the per-tree time when no tree was built
    model_summary = gbm._model_json["output"]["model_summary"]
    num_trees = model_summary.cell_values[0][model_summary.col_header.index('number_of_trees')]
    self.min_runtime_per_tree = train_time if num_trees == 0 else train_time / num_trees

    # griddable parameter names together with their types and defaults
    gridable_info = pyunit_utils.get_gridables(gbm._model_json["parameters"])
    self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridable_info

    # generate hyper-parameter values; per the original notes these may
    # deliberately include values outside the legal range
    search_setup = pyunit_utils.gen_grid_search(
        gbm.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = search_setup

    # express max_runtime_secs relative to the base model runtime
    time_scale = self.time_scale * train_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            time_scale * limit for limit in self.hyper_params["max_runtime_secs"]]

    self.possible_number_models = self.check_and_count_models()

    # max_runtime_secs is always part of the final hyper-parameters
    limits = self.hyper_params["max_runtime_secs"]
    self.final_hyper_params["max_runtime_secs"] = limits

    # negative runtimes will not result in any model being built, so the count
    # is reduced to the fraction of runtimes that are non-negative
    time_len = len(limits)
    len_good_time = sum(1 for limit in limits if limit >= 0)
    self.possible_number_models = int(self.possible_number_models*len_good_time/time_len)

    # write out the hyper-parameters used into json files
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir,
                                             self.json_filename, self.final_hyper_params)
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters for the GLRM test.

    Steps performed:
    1. Train a bare-bones GLRM (k=10, quadratic loss, random gamma_x/gamma_y,
       DEMEAN transform), store its runtime in self.model_run_time and derive
       self.min_runtime_per_iter from the iteration count in the model summary.
    2. Ask pyunit_utils.get_gridables for the griddable parameters, their
       types and their defaults.
    3. Generate hyper-parameter values with pyunit_utils.gen_grid_search and
       scale max_runtime_secs and max_iterations.
    4. Let pyunit_utils.check_and_count_models produce self.final_hyper_params
       and the model count, make sure max_runtime_secs, k and seed are part of
       the final set, and write the final hyper-parameters to JSON.

    :return: None
    """
    # build bare bone model to get all parameters
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=random.uniform(0, 1),
                                           gamma_y=random.uniform(0, 1), transform="DEMEAN")
    model.train(x=self.training1_data.names, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    summary_list = model._model_json["output"]["model_summary"]
    num_iter = summary_list["number_of_iterations"][0]

    # guard against a zero iteration count (as the other algorithm setups do)
    # instead of failing with ZeroDivisionError
    if num_iter == 0:
        self.min_runtime_per_iter = self.model_run_time
    else:
        self.min_runtime_per_iter = self.model_run_time / num_iter

    # grab all gridable parameters and its type
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.get_gridables(model._model_json["parameters"])

    # generate griddable parameter values; per the original notes these may
    # deliberately include values outside the legal range
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number),
                                     self.max_int_val, self.min_int_val,
                                     random.randint(1, self.max_real_number),
                                     self.max_real_val, self.min_real_val)

    # snapshot of the generated parameter names, taken before any scaling
    hyper_params_list = list(self.hyper_params)

    # scale the max_runtime_secs parameters relative to the base model runtime
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in hyper_params_list:
        self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                 in self.hyper_params["max_runtime_secs"]]

    # scale up max_iterations by self.iter_scale
    if "max_iterations" in hyper_params_list:
        self.hyper_params["max_iterations"] = [self.iter_scale * x for x
                                               in self.hyper_params["max_iterations"]]

    # final_hyper_params is the set actually used for the grid search
    [self.possible_number_models, self.final_hyper_params] = \
        pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one,
                                            self.params_more_than_zero, self.params_more_than_one,
                                            self.params_zero_positive, self.max_grid_model)

    # max_runtime_secs must always be in the grid to bound the test run time
    if ("max_runtime_secs" not in self.final_hyper_params) and \
            ("max_runtime_secs" in self.hyper_params):
        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
        # only non-negative runtimes count towards the possible models
        len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
        self.possible_number_models = self.possible_number_models*len_good_time

    if "k" not in self.final_hyper_params:  # must add this one
        self.final_hyper_params["k"] = self.hyper_params["k"]
        # only k values of at least 1 count towards the possible models
        len_good_k = len([x for x in self.final_hyper_params["k"] if (x >= 1)])
        self.possible_number_models = self.possible_number_models*len_good_k

    self.final_hyper_params["seed"] = [self.seed]  # fixed seed makes the test more repeatable

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.final_hyper_params)
def setup_model(self):
    """
    Set up the grid-search hyper-parameters for GLRM that will be used later on:

    1. Train a bare-bones GLRM model and record its run time (self.model_run_time) and its
       run time per iteration (self.min_runtime_per_iter).
    2. Grab all the parameters that are griddable from the model's parameter description.
    3. Generate the grid-search hyper-parameters via pyunit_utils.gen_grid_search.  Parameters listed in
       self.exclude_parameter_lists are passed along so that they can be left out.
    4. Scale max_runtime_secs and max_iterations, select the final subset of hyper-parameters with
       pyunit_utils.check_and_count_models, and make sure max_runtime_secs, k and seed are part of it.
    5. Write the final hyper-parameters out as a json file.

    :return: None
    """
    # build bare bone model to get all parameters
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=random.uniform(0, 1),
                                           gamma_y=random.uniform(0, 1), transform="DEMEAN")
    model.train(x=self.training1_data.names, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # read the iteration count from the first row of the model summary table
    summary_list = model._model_json["output"]["model_summary"]
    num_iter = summary_list.cell_values[0][summary_list.col_header.index('number_of_iterations')]

    # a model that reports zero iterations must not cause a division by zero; fall back to the
    # whole run time as the per-iteration estimate
    if num_iter == 0:
        self.min_runtime_per_iter = self.model_run_time
    else:
        self.min_runtime_per_iter = self.model_run_time / num_iter

    # grab all gridable parameters and its type
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.get_gridables(model._model_json["parameters"])

    # randomly generate griddable parameters including values outside legal range, like setting alpha values to
    # be outside legal range of 0 and 1 and etc
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number),
                                     self.max_int_val, self.min_int_val,
                                     random.randint(1, self.max_real_number),
                                     self.max_real_val, self.min_real_val)

    hyper_params_list = list(self.hyper_params)

    # scale the max_runtime_secs parameters relative to the bare-bones model run time
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in hyper_params_list:
        self.hyper_params["max_runtime_secs"] = [time_scale * x for x in self.hyper_params["max_runtime_secs"]]

    # scale up the max_iterations by self.iter_scale
    if "max_iterations" in hyper_params_list:
        self.hyper_params["max_iterations"] = [self.iter_scale * x for x in self.hyper_params["max_iterations"]]

    # generate a new final_hyper_params which only takes a subset of all griddable parameters while
    # hyper_params take all griddable parameters and generate the grid search hyper-parameters
    [self.possible_number_models, self.final_hyper_params] = \
        pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                            self.params_more_than_one, self.params_zero_positive,
                                            self.max_grid_model)

    # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
    if ("max_runtime_secs" not in self.final_hyper_params) and ("max_runtime_secs" in self.hyper_params):
        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
        # only non-negative run time limits are counted as producing a model
        len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
        self.possible_number_models = self.possible_number_models * len_good_time

    if "k" not in self.final_hyper_params:  # must add this one
        self.final_hyper_params["k"] = self.hyper_params["k"]
        # only k values of at least 1 are counted as producing a model
        len_good_k = len([x for x in self.final_hyper_params["k"] if (x >= 1)])
        self.possible_number_models = self.possible_number_models * len_good_k

    self.final_hyper_params["seed"] = [self.seed]  # added seed to make test more repeatable

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.final_hyper_params)
def setup_model(self):
    """
    Prepare the deeplearning grid-search hyper-parameters used by the rest of the test.

    The method trains a small baseline deeplearning model, records its run time and run time per
    iteration, collects the griddable parameters from it, lets pyunit_utils.gen_grid_search generate
    values for them (skipping what self.exclude_parameter_lists names), rescales or regenerates a
    handful of parameters, picks the final subset with pyunit_utils.check_and_count_models and writes
    that subset to a json file.

    :return: None
    """
    # baseline model; only used to discover parameters and to time a single build
    base_model = H2ODeepLearningEstimator(distribution=self.family, seed=self.seed, nfolds=self.nfolds,
                                          hidden=[10, 10, 10])
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # iteration count is read from the row at index 2 of the scoring history table
    scoring_history = base_model._model_json["output"]["scoring_history"]
    history_row = scoring_history.cell_values[2]
    num_iterations = history_row[scoring_history.col_header.index('iterations')]

    # avoid dividing by zero when no iteration was reported
    self.min_runtime_per_iteration = \
        self.model_run_time if num_iterations == 0 else self.model_run_time / num_iterations

    # names, types and defaults of everything that can be put on the grid
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = gridables

    # random values for the griddable parameters; values outside the legal range may be produced
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = generated

    # multiply selected parameters by a fixed factor so that their values make sense
    time_scale = self.time_scale * self.model_run_time
    scale_factors = (("max_runtime_secs", time_scale),
                     ("epsilon", 1e-4),
                     ("input_dropout_ratio", 0.5),
                     ("hidden_dropout_ratio", 0.5))
    for param_name, factor in scale_factors:
        if param_name in self.hyper_params:
            self.hyper_params[param_name] = [factor * value for value in self.hyper_params[param_name]]

    if "hidden" in self.hyper_params:
        # replace the generated values: draw how many entries to use, then draw each entry
        layer_count = random.randint(1, 3)
        self.hyper_params["hidden"] = [random.randint(1, self.max_int_val) for _ in range(layer_count)]

    if "epochs" in self.hyper_params:
        # replace the generated values with self.max_int_number random integers
        self.hyper_params["epochs"] = [random.randint(self.min_int_val, self.max_int_val)
                                       for _ in range(self.max_int_number)]

    # pick the subset of hyper-parameters that is actually used and count the models
    (self.possible_number_models, self.final_hyper_params) = \
        pyunit_utils.check_and_count_models(self.hyper_params, [], [], [], [], self.max_grid_model)

    # max_runtime_secs always has to be on the grid when it was generated, to bound the test run time
    runtime_key = "max_runtime_secs"
    if runtime_key in self.hyper_params and runtime_key not in self.final_hyper_params:
        self.final_hyper_params[runtime_key] = self.hyper_params[runtime_key]

    # fixed seed to make the test more repeatable
    self.final_hyper_params["seed"] = [self.seed]

    # persist the hyper-parameters that were used
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.final_hyper_params)
def setup_model(self):
    """
    Prepare two sets of GLM grid-search hyper-parameters used by the rest of the test.

    A baseline GLM model is trained to time a single build and to discover the griddable parameters.
    Afterwards two hyper-parameter sets are generated with pyunit_utils.gen_grid_search:

    - self.hyper_params_bad / self.final_hyper_params_bad: generated from the configured integer range
      and a real range scaled by self.alpha_scale, so illegal values may be included.  The matching
      model count is stored in self.possible_number_models.
    - self.hyper_params / self.final_hyper_params: generated with lower bounds of 0.  The matching model
      count is stored in self.true_correct_model_number.

    Both final sets are written to json files.

    :return: None
    """

    def _rescale(params, key, factor):
        # multiply every value stored under key by factor, when key is present
        if key in params:
            params[key] = [factor * value for value in params[key]]

    # baseline model; only used to discover parameters and to time a single build
    base_model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    base_run_time = pyunit_utils.find_grid_runtime([base_model])
    print("Time taken to build a base barebone model is {0}".format(base_run_time))

    # iteration count is read from the first row of the model summary table
    model_summary = base_model._model_json["output"]["model_summary"]
    first_row = model_summary.cell_values[0]
    iteration_count = first_row[model_summary.col_header.index("number_of_iterations")]

    # avoid dividing by zero when no iteration was reported
    self.min_runtime_per_epoch = base_run_time if iteration_count == 0 else base_run_time / iteration_count

    # names, types and defaults of everything that can be put on the grid
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = gridables

    # ---- first set: values that may fall outside the legal range ----
    generated_bad = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params_bad, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
        random.randint(1, self.max_real_number),
        self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)
    (self.hyper_params_bad, self.gridable_parameters, self.gridable_types,
     self.gridable_defaults) = generated_bad

    _rescale(self.hyper_params_bad, "lambda", self.lambda_scale)

    time_scale = self.time_scale * base_run_time
    _rescale(self.hyper_params_bad, "max_runtime_secs", time_scale)

    (self.possible_number_models, self.final_hyper_params_bad) = \
        pyunit_utils.check_and_count_models(self.hyper_params_bad, self.params_zero_one,
                                            self.params_more_than_zero, self.params_more_than_one,
                                            self.params_zero_positive, self.max_grid_model)

    runtime_key = "max_runtime_secs"
    if runtime_key in self.hyper_params_bad and runtime_key not in self.final_hyper_params_bad:
        self.final_hyper_params_bad[runtime_key] = self.hyper_params_bad[runtime_key]
        # only non-negative run time limits are counted as producing a model
        valid_time_count = len([t for t in self.hyper_params_bad[runtime_key] if t >= 0])
        self.possible_number_models = self.possible_number_models * valid_time_count

    # Stratified is illegal for Gaussian GLM
    self.possible_number_models = self.possible_number_models * self.scale_model

    # ---- second set: lower bounds of 0 so that only good values are generated ----
    generated_good = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
        self.gridable_parameters, self.gridable_types, self.gridable_defaults,
        random.randint(1, self.max_int_number), self.max_int_val, 0,
        random.randint(1, self.max_real_number), self.max_real_val, 0)
    (self.hyper_params, self.gridable_parameters, self.gridable_types,
     self.gridable_defaults) = generated_good

    _rescale(self.hyper_params, "lambda", self.lambda_scale)
    _rescale(self.hyper_params, "max_runtime_secs", time_scale)

    (self.true_correct_model_number, self.final_hyper_params) = \
        pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one,
                                            self.params_more_than_zero, self.params_more_than_one,
                                            self.params_zero_positive, self.max_grid_model)

    if runtime_key in self.hyper_params and runtime_key not in self.final_hyper_params:
        self.final_hyper_params[runtime_key] = self.hyper_params[runtime_key]
        # every run time limit of this set is counted
        self.true_correct_model_number = \
            self.true_correct_model_number * len(self.final_hyper_params[runtime_key])

    # persist both hyper-parameter sets
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename_bad,
                                             self.final_hyper_params_bad)
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.final_hyper_params)
def setup_model(self):
    """
    Set up the grid-search hyper-parameters for naivebayes that will be used later on:

    1. Train a bare-bones naivebayes model and record its run time in self.model_run_time.
    2. Grab all the parameters that are griddable from the model's parameter description.
    3. Generate the grid-search hyper-parameters via pyunit_utils.gen_grid_search.  Parameters listed in
       self.exclude_parameter_lists are passed along so that they can be left out.
    4. Scale max_runtime_secs, select the final subset of hyper-parameters with
       pyunit_utils.check_and_count_models and correct the expected model count
       (self.possible_number_models) for max_runtime_secs and min_prob.
    5. Scale laplace and write the final hyper-parameters out as a json file.

    :return: None
    """
    # build bare bone model to get all parameters
    model = H2ONaiveBayesEstimator(nfolds=self.nfolds, compute_metrics=True)
    model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # grab all gridable parameters and its type
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.get_gridables(model._model_json["parameters"])

    # randomly generate griddable parameters including values outside legal range, like setting alpha values to
    # be outside legal range of 0 and 1 and etc
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number),
                                     self.max_int_val, self.min_int_val,
                                     random.randint(1, self.max_real_number),
                                     self.max_real_val, self.min_real_val)

    # scale the max_runtime_secs parameter and others as well to make sure they make sense
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [time_scale * x for x in self.hyper_params["max_runtime_secs"]]

    # generate a new final_hyper_params which only takes a subset of all griddable parameters while
    # hyper_params take all griddable parameters and generate the grid search hyper-parameters
    [self.possible_number_models, self.final_hyper_params] = \
        pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                            self.params_more_than_one, self.params_zero_positive,
                                            self.max_grid_model)

    # snapshot of the keys chosen by check_and_count_models, taken before anything is added below
    final_hyper_params_keys = list(self.final_hyper_params)

    # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
    if ("max_runtime_secs" not in final_hyper_params_keys) and ("max_runtime_secs" in self.hyper_params):
        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
        # only non-negative run time limits are counted as producing a model
        len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
        self.possible_number_models = self.possible_number_models*len_good_time

    # need to check that min_prob >= 1e-10
    if "min_prob" in final_hyper_params_keys:
        # The correction has to be derived from the min_prob values themselves.  The count so far
        # treats non-negative values as good, so it is rescaled by the share of those values that
        # also reach 1e-10.
        min_prob_values = self.final_hyper_params["min_prob"]
        old_len_prob = len([x for x in min_prob_values if (x >= 0)])
        good_len_prob = len([x for x in min_prob_values if (x >= 1e-10)])

        if old_len_prob > 0:
            self.possible_number_models = self.possible_number_models*good_len_prob/old_len_prob
        else:
            self.possible_number_models = 0

    if "laplace" in final_hyper_params_keys:
        # the scaled laplace values replace the ones picked for the final hyper-parameters
        self.final_hyper_params["laplace"] = [self.laplace_scale * x for x in self.hyper_params["laplace"]]

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.final_hyper_params)