예제 #1
0
    def fit(self, X, Y):
        """Learn a rule list from the training data and store it on the estimator.

        Parameters
        ----------
        X : attribute data, forwarded to ``Data`` as ``input_data``.
        Y : target data, forwarded to ``Data`` as ``target_data``.

        Returns
        -------
        self : object
            The estimator itself, with ``_rulelist``, ``runtime``,
            ``number_rules`` and ``rule_sets`` populated.
        """
        nominal = is_classifier(self)
        t0 = time()

        dataset = Data(
            input_data=X,
            n_cutpoints=self.n_cutpoints,
            discretization=self.discretization,
            target_data=Y,
            target_model=self.target_model,
            min_support=self.min_support,
        )

        # Classifiers start from a categorical rule list, every other estimator
        # from a Gaussian one; both constructors take the same arguments.
        rulelist_cls = CategoricalRuleList if nominal else GaussianRuleList
        self._rulelist = rulelist_cls(
            dataset,
            self.task,
            self.max_depth,
            self.beam_width,
            self.min_support,
            self.max_rules,
            self.alpha_gain,
        )

        # The search returns the rule list that is kept as the fitted model.
        self._rulelist = greedy_and_beamsearch(dataset, self._rulelist)
        self._rulelist.add_description()

        # Timing covers data preparation, search and description building.
        self.runtime = time() - t0
        self.number_rules = self._rulelist.number_rules
        self.rule_sets = list(map(bitset2indexes, self._rulelist.bitset_rules))

        return self
예제 #2
0
def generate_inputvalues_one_target(constant_parameters):
    """Yield a ``Data`` object with a single numeric target and a matching bit mask.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 5-item iterable unpacked as the input data, the
        number of cut points, the discretization, the target setting and the
        minimum support, all forwarded positionally to ``Data``.

    Yields
    ------
    tuple
        ``(data, mask)`` where ``mask`` is ``bit_mask(data.number_instances)``.
    """
    (features, n_cutpoints, discretization,
     target_option, min_support) = constant_parameters

    # Reseeds NumPy's global RNG; nothing random is drawn in this function.
    np.random.seed(seed=42)

    # Single target column holding 0..99.
    targets = pd.DataFrame(data={"target1": np.arange(100)})

    dataset = Data(features, n_cutpoints, discretization,
                   targets, target_option, min_support)
    yield dataset, bit_mask(dataset.number_instances)
예제 #3
0
def generate_inputvalues_one_target(constant_parameters):
    """Yield a ``Data`` object with a single two-valued string target.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 5-item iterable unpacked as the input data, the
        number of cut points, the discretization, the target setting and the
        minimum support, all forwarded positionally to ``Data``.

    Yields
    ------
    Data
        Built with one target column, ``"target1"``, holding ``"below50"``
        for the first 50 rows and ``"above49"`` for the remaining 50.
    """
    input_input_data, input_n_cutpoints, input_discretization, input_target_data, input_minsupp = constant_parameters
    # targets
    dictoutput = {
        "target1":
        np.array(["below50" if i < 50 else "above49" for i in range(100)])
    }
    input_output_data = pd.DataFrame(data=dictoutput)
    data_class = Data(input_input_data, input_n_cutpoints,
                      input_discretization, input_output_data,
                      input_target_data, input_minsupp)
    # The bit mask that used to be computed here was never used or yielded,
    # so the dead assignment has been dropped.
    yield data_class
예제 #4
0
def generate_input_dataframe_two_target_variancezero(constant_parameters):
    """Yield a 100-row ``Data`` object whose second target is constant.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 4-item iterable unpacked as the number of cut
        points, the discretization, the target setting and the minimum
        support, all forwarded positionally to ``Data``.

    Yields
    ------
    Data
        Built from attributes ``attribute1`` (0..99) and ``attribute2``
        (``"below50"`` / ``"above49"``) and targets ``target1`` (0..99) and
        ``target2`` (all ones, i.e. zero variance).
    """
    n_cutpoints, discretization, target_option, min_support = constant_parameters

    # First half of the rows is labelled "below50", second half "above49".
    halves = np.array(["below50"] * 50 + ["above49"] * 50)
    attributes = pd.DataFrame(data={
        "attribute1": np.arange(100),
        "attribute2": halves,
    })

    targets = pd.DataFrame(data={
        "target1": np.arange(100),
        "target2": np.ones(100),
    })

    yield Data(attributes, n_cutpoints, discretization,
               targets, target_option, min_support)
예제 #5
0
def generate_input_dataframe_one_target(constant_parameters):
    """Yield a 100000-row ``Data`` object with one normally distributed target.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 4-item iterable unpacked as the number of cut
        points, the discretization, the target setting and the minimum
        support, all forwarded positionally to ``Data``.

    Yields
    ------
    Data
        Built from attributes ``attribute1`` (0..99999) and ``attribute2``
        (``"below1000"`` / ``"above999"``) and a single target ``target1``
        drawn from ``np.random.normal(loc=1, scale=1)``.
    """
    n_cutpoints, discretization, target_option, min_support = constant_parameters
    n_rows = 100000

    # The first 1000 rows are labelled "below1000", the rest "above999".
    labels = np.array(["below1000"] * 1000 + ["above999"] * (n_rows - 1000))
    attributes = pd.DataFrame(data={
        "attribute1": np.arange(n_rows),
        "attribute2": labels,
    })

    # Drawn from NumPy's global RNG; no seed is set in this function.
    targets = pd.DataFrame(
        data={"target1": np.random.normal(loc=1, scale=1, size=n_rows)})

    yield Data(attributes, n_cutpoints, discretization,
               targets, target_option, min_support)
예제 #6
0
def generate_inputvalues_two_targets(constant_parameters):
    """Yield a ``Data`` object with two two-valued string targets.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 5-item iterable unpacked as the input data, the
        number of cut points, the discretization, the target setting and the
        minimum support, all forwarded positionally to ``Data``.

    Yields
    ------
    Data
        Built with targets ``target1`` (50 x ``"below50"`` then
        50 x ``"above49"``) and ``target2`` (99 x ``"below99"`` then a single
        ``"above99"``).
    """
    (features, n_cutpoints, discretization,
     target_option, min_support) = constant_parameters

    # Balanced split for target1, a single-row minority class for target2.
    balanced = np.array(["below50"] * 50 + ["above49"] * 50)
    skewed = np.array(["below99"] * 99 + ["above99"])
    targets = pd.DataFrame(data={"target1": balanced, "target2": skewed})

    yield Data(features, n_cutpoints, discretization,
               targets, target_option, min_support)
예제 #7
0
def generate_input_dataframe_two_target_normal(constant_parameters):
    """Yield a 100000-row ``Data`` object with two bimodal normal targets.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 4-item iterable unpacked as the number of cut
        points, the discretization, the target setting and the minimum
        support, all forwarded positionally to ``Data``.

    Yields
    ------
    Data
        Built from attributes ``attribute1`` (0..99999) and ``attribute2``
        (``"below1000"`` / ``"above999"``). Each target concatenates 16666
        draws from one normal distribution with 83334 draws from another.
    """
    n_cutpoints, discretization, target_option, min_support = constant_parameters
    n_first, n_second = 16666, 83334

    # The first 1000 rows are labelled "below1000", the rest "above999".
    labels = np.array(["below1000"] * 1000 + ["above999"] * 99000)
    attributes = pd.DataFrame(data={
        "attribute1": np.arange(100000),
        "attribute2": labels,
    })

    # Draw order matters for the global RNG stream:
    # target1 first/second segment, then target2 first/second segment.
    t1_first = np.random.normal(loc=20, scale=3, size=n_first)
    t1_second = np.random.normal(loc=100, scale=6, size=n_second)
    t2_first = np.random.normal(loc=10, scale=2, size=n_first)
    t2_second = np.random.normal(loc=50, scale=5, size=n_second)

    targets = pd.DataFrame(data={
        "target1": np.concatenate((t1_first, t1_second), axis=None),
        "target2": np.concatenate((t2_first, t2_second), axis=None),
    })

    yield Data(attributes, n_cutpoints, discretization,
               targets, target_option, min_support)
예제 #8
0
    def test_gaussian_twotargets(self, generate_input_dataframe_two_target,
                                 constant_parameters):
        """Check that ``Data`` exposes the expected metadata for two targets.

        Builds a ``Data`` object from the fixture's input/target frames and
        verifies that the frames are stored unchanged and that the attribute,
        target and instance counts and names match the expected values.
        """
        input_input_data, input_output_data = generate_input_dataframe_two_target
        input_n_cutpoints, input_discretization, input_target_data, input_minsupp = constant_parameters

        expected_number_targets = 2
        expected_number_attributes = 2
        expected_number_instances = 100
        expected_attribute_names = {"attribute1", "attribute2"}
        expected_target_names = {"target1", "target2"}

        output_data = Data(input_input_data, input_n_cutpoints,
                           input_discretization, input_output_data,
                           input_target_data, input_minsupp)

        pd.testing.assert_frame_equal(input_input_data, output_data.input_data)
        pd.testing.assert_frame_equal(input_output_data,
                                      output_data.target_data)
        assert expected_number_attributes == output_data.number_attributes
        assert expected_number_attributes == len(output_data.attributes)
        assert expected_number_targets == output_data.number_targets
        assert expected_number_instances == output_data.number_instances
        assert expected_attribute_names == output_data.attribute_names
        assert expected_target_names == output_data.target_names

    # The three placeholders below used to be defined *inside* the test body
    # above, where pytest never collects them. They now live at method level
    # and call pytest.xfail() so each is reported as XFAIL (not implemented)
    # rather than as an unexpected pass.

    def test_name_not_present(self):
        """Placeholder: behaviour when a name is not present (not implemented)."""
        pytest.xfail("placeholder: test not implemented yet")

    def test_category_not_present(self):
        """Placeholder: behaviour when a category is not present (not implemented)."""
        pytest.xfail("placeholder: test not implemented yet")

    def test_receives_series(self):
        """Placeholder: behaviour when a Series is received (not implemented)."""
        pytest.xfail("placeholder: test not implemented yet")
예제 #9
0
    def test_gaussian_onetarget(self, generate_input_dataframe_one_target,
                                constant_parameters):
        """Check that ``Data`` exposes the expected metadata for one target.

        Builds a ``Data`` object from the fixture's input/target frames and
        verifies that the frames are stored unchanged and that the attribute,
        target and instance counts and names match the expected values.
        """
        attributes_df, targets_df = generate_input_dataframe_one_target
        n_cutpoints, discretization, target_option, min_support = constant_parameters

        built = Data(attributes_df, n_cutpoints, discretization,
                     targets_df, target_option, min_support)

        # The frames handed in must come back unchanged.
        pd.testing.assert_frame_equal(attributes_df, built.input_data)
        pd.testing.assert_frame_equal(targets_df, built.target_data)

        # Two attributes, one target, one hundred instances.
        assert 2 == built.number_attributes
        assert 2 == len(built.attributes)
        assert 1 == built.number_targets
        assert 100 == built.number_instances
        assert {"attribute1", "attribute2"} == built.attribute_names
        assert {"target1"} == built.target_names
예제 #10
0
def generate_input_dataframe_two_target(constant_parameters):
    """Yield a ``Data`` object built entirely from ``constant_parameters``.

    Generator that yields exactly once (presumably registered as a pytest
    fixture -- the decorator is not visible here).

    Parameters
    ----------
    constant_parameters : 6-item iterable unpacked as the input data, the
        target data, the number of cut points, the discretization, the target
        setting and the minimum support.

    Yields
    ------
    Data
        Constructed from the unpacked values; the target data is passed as
        the fourth positional argument.
    """
    (attributes, targets, n_cutpoints,
     discretization, target_option, min_support) = constant_parameters

    # ``Data`` takes the target frame after the discretization argument, so
    # the unpacked order is rearranged for the call.
    yield Data(attributes, n_cutpoints, discretization,
               targets, target_option, min_support)