def setup_data(self):
    """
    Perform every initialization the dynamic test needs:

    1. draw the random parameters for this run (Gaussian noise std,
       training column count and row count);
    2. generate the synthetic GLM data set, load it into H2O, and record
       the predictor/response column indices.
    """
    # wipe and recreate the sandbox directory used to archive test artifacts
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # Gaussian noise std: a random fraction of the predictor std
    # (std of a uniform on [min_p, max_p] is (max_p - min_p) / sqrt(12))
    predictor_std = math.sqrt((self.max_p_value - self.min_p_value) ** 2 / 12)
    self.noise_std = random.uniform(0, predictor_std)
    self.noise_var = self.noise_std * self.noise_std

    # pick the training set dimensions at random
    self.train_col_count = random.randint(1, self.max_col_count)
    row_ratio = random.uniform(self.min_col_count_ratio, self.max_col_count_ratio)
    self.train_row_count = round(self.train_col_count * row_ratio)

    # DEBUGGING setup_data: uncomment to pin the data size, re-comment once done.
    # self.train_col_count = 3
    # self.train_row_count = 200
    # self.max_real_number = 1
    # self.max_int_number = 1
    # end DEBUGGING

    # widen the data range for the gaussian family
    if 'gaussian' in self.family:
        for attr in ('max_p_value', 'min_p_value', 'max_w_value', 'min_w_value'):
            setattr(self, attr, getattr(self, attr) * 50)

    # generate the real-valued weight vector plus the training data set for GLM
    pyunit_utils.write_syn_floating_point_dataset_glm(
        self.training1_data_file, "", "", self.weight_data_file,
        self.train_row_count, self.train_col_count, self.data_type,
        self.max_p_value, self.min_p_value, self.max_w_value, self.min_w_value,
        self.noise_std, self.family, self.train_row_count, self.train_row_count,
        class_number=self.class_number,
        class_method=[self.class_method, self.class_method, self.test_class_method],
        class_margin=[self.margin, self.margin, self.test_class_margin])

    # load the generated data and record predictor/response column indices
    self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file))
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # for classification, make the response categorical and verify that
    # every response class is actually present in the training data
    if 'binomial' in self.family:
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        if self.training1_data[self.y_index].nlevels()[0] < self.class_number:
            print("Response classes are not represented in training dataset.")
            sys.exit(0)

    # archive the csv files in the sandbox in case the test crashes later
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. randomly choose the distribution family (gaussian, binomial, multinomial) to test.
    3. with the chosen distribution family, generate the appropriate data sets
    4. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly set Gaussian noise standard deviation as a fraction of actual predictor
    # standard deviation (std of a uniform on [min_p, max_p] is (max_p - min_p)/sqrt(12))
    self.noise_std = random.uniform(0, math.sqrt(pow((self.max_p_value - self.min_p_value), 2) / 12))
    self.noise_var = self.noise_std * self.noise_std

    # randomly determine data set size in terms of column and row counts
    self.train_col_count = random.randint(1, self.max_col_count)
    self.train_row_count = round(self.train_col_count *
                                 random.uniform(self.min_col_count_ratio, self.max_col_count_ratio))

    # DEBUGGING setup_data: uncomment to pin the data size while debugging,
    # and re-comment once done (leaving these active defeats the random sizing above).
    # self.train_col_count = 3
    # self.train_row_count = 200
    # self.max_real_number = 1
    # self.max_int_number = 1
    # end DEBUGGING

    # This generates a dataset for regression or classification; it has nothing to do
    # with setting the distribution family in this case.
    # randomly choose which family of GLM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # set class number for classification
    if 'multinomial' in self.family:
        self.class_number = random.randint(2, self.max_class_number)  # randomly set number of classes K

    # generate real value weight vector and training/validation/test data sets for GLM
    pyunit_utils.write_syn_floating_point_dataset_glm(
        self.training1_data_file, self.training2_data_file, self.training3_data_file,
        self.weight_data_file, self.train_row_count, self.train_col_count, 2,
        self.max_p_value, self.min_p_value, self.max_w_value, self.min_w_value,
        self.noise_std, self.family, self.train_row_count, self.train_row_count,
        class_number=self.class_number,
        class_method=['probability', 'probability', 'probability'])

    # preload data sets
    self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file))
    self.training2_data = h2o.import_file(pyunit_utils.locate(self.training2_data_file))
    self.training3_data = h2o.import_file(pyunit_utils.locate(self.training3_data_file))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    if 'multinomial' in self.family:
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

        # check to make sure all response classes are represented, otherwise, quit
        if self.training1_data[self.y_index].nlevels()[0] < self.class_number:
            print("Response classes are not represented in training dataset.")
            sys.exit(0)

        self.training2_data[self.y_index] = self.training2_data[self.y_index].round().asfactor()
        # BUG FIX: originally read from training2_data here, silently overwriting
        # training3's response with training2's values.
        self.training3_data[self.y_index] = self.training3_data[self.y_index].round().asfactor()

    # self.hyper_params["validation_frame"] = [self.training1_data.frame_id, self.training2_data.frame_id,
    #                                          self.training3_data.frame_id]

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. randomly choose the distribution family (gaussian, binomial, multinomial) to test.
    3. with the chosen distribution family, generate the appropriate data sets
    4. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(
        self.current_dir, self.test_name, True)

    # randomly set Gaussian noise standard deviation as a fraction of actual predictor
    # standard deviation (std of a uniform on [min_p, max_p] is (max_p - min_p)/sqrt(12))
    self.noise_std = random.uniform(
        0, math.sqrt(pow((self.max_p_value - self.min_p_value), 2) / 12))
    self.noise_var = self.noise_std * self.noise_std

    # randomly determine data set size in terms of column and row counts
    self.train_col_count = random.randint(1, self.max_col_count)
    self.train_row_count = round(
        self.train_col_count *
        random.uniform(self.min_col_count_ratio, self.max_col_count_ratio))

    # DEBUGGING setup_data: uncomment to pin the data size while debugging,
    # and re-comment once done (leaving these active defeats the random sizing above).
    # self.train_col_count = 3
    # self.train_row_count = 200
    # self.max_real_number = 1
    # self.max_int_number = 1
    # end DEBUGGING

    # This generates a dataset for regression or classification; it has nothing to do
    # with setting the distribution family in this case.
    # randomly choose which family of GLM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # set class number for classification
    if 'multinomial' in self.family:
        self.class_number = random.randint(
            2, self.max_class_number)  # randomly set number of classes K

    # generate real value weight vector and training/validation/test data sets for GLM
    pyunit_utils.write_syn_floating_point_dataset_glm(
        self.training1_data_file, self.training2_data_file,
        self.training3_data_file, self.weight_data_file,
        self.train_row_count, self.train_col_count, 2, self.max_p_value,
        self.min_p_value, self.max_w_value, self.min_w_value, self.noise_std,
        self.family, self.train_row_count, self.train_row_count,
        class_number=self.class_number,
        class_method=['probability', 'probability', 'probability'])

    # preload data sets
    self.training1_data = h2o.import_file(
        pyunit_utils.locate(self.training1_data_file))
    self.training2_data = h2o.import_file(
        pyunit_utils.locate(self.training2_data_file))
    self.training3_data = h2o.import_file(
        pyunit_utils.locate(self.training3_data_file))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    if 'multinomial' in self.family:
        self.training1_data[self.y_index] = self.training1_data[
            self.y_index].round().asfactor()

        # check to make sure all response classes are represented, otherwise, quit
        if self.training1_data[
                self.y_index].nlevels()[0] < self.class_number:
            print(
                "Response classes are not represented in training dataset."
            )
            sys.exit(0)

        self.training2_data[self.y_index] = self.training2_data[
            self.y_index].round().asfactor()
        # BUG FIX: originally read from training2_data here, silently overwriting
        # training3's response with training2's values.
        self.training3_data[self.y_index] = self.training3_data[
            self.y_index].round().asfactor()

    # self.hyper_params["validation_frame"] = [self.training1_data.frame_id, self.training2_data.frame_id,
    #                                          self.training3_data.frame_id]

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir,
                                  ".csv",
                                  action='copy',
                                  new_dir_path=self.sandbox_dir)
def setup_data(self):
    """
    Run every initialization the dynamic test requires:

    1. draw the random values for this run — the Gaussian noise std and the
       training column/row counts;
    2. generate the synthetic GLM data set, import it into H2O, and record
       the predictor/response column indices.
    """
    # recreate a clean sandbox directory for archiving test artifacts
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # noise std is a random fraction of the predictor std; a uniform variable
    # on [min_p, max_p] has std (max_p - min_p) / sqrt(12)
    predictor_std = math.sqrt((self.max_p_value - self.min_p_value) ** 2 / 12)
    self.noise_std = random.uniform(0, predictor_std)
    self.noise_var = self.noise_std * self.noise_std

    # choose the training set dimensions at random
    self.train_col_count = random.randint(1, self.max_col_count)
    row_ratio = random.uniform(self.min_col_count_ratio, self.max_col_count_ratio)
    self.train_row_count = round(self.train_col_count * row_ratio)

    # DEBUGGING setup_data: uncomment to pin the data size, re-comment once done.
    # self.train_col_count = 3
    # self.train_row_count = 200
    # self.max_real_number = 5
    # self.max_int_number = 5
    # end DEBUGGING

    # the gaussian family gets a wider data range
    if 'gaussian' in self.family:
        for attr in ('max_p_value', 'min_p_value', 'max_w_value', 'min_w_value'):
            setattr(self, attr, getattr(self, attr) * 50)

    # generate the real-valued weight vector plus the training data set for GLM
    pyunit_utils.write_syn_floating_point_dataset_glm(
        self.training1_data_file, "", "", self.weight_data_file,
        self.train_row_count, self.train_col_count, self.data_type,
        self.max_p_value, self.min_p_value, self.max_w_value, self.min_w_value,
        self.noise_std, self.family, self.train_row_count, self.train_row_count,
        class_number=self.class_number,
        class_method=[self.class_method, self.class_method, self.test_class_method],
        class_margin=[self.margin, self.margin, self.test_class_margin])

    # import the generated data and record predictor/response column indices
    self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file))
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # for classification, make the response categorical and bail out if any
    # response class failed to appear in the generated training data
    if 'binomial' in self.family:
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        if self.training1_data[self.y_index].nlevels()[0] < self.class_number:
            print("Response classes are not represented in training dataset.")
            sys.exit(0)

    # archive the csv files in the sandbox in case the test crashes later
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)