def fit(self, X, Y):
    """Learn a rule list from the training inputs and targets.

    Parameters
    ----------
    X : input (attribute) data; forwarded to ``Data`` as ``input_data``.
    Y : target data; forwarded to ``Data`` as ``target_data``.

    Returns
    -------
    self : object
        This estimator, with ``_rulelist``, ``runtime``, ``number_rules``
        and ``rule_sets`` set.
    """
    is_nominal_target = is_classifier(self)
    start_time = time()

    data = Data(
        input_data=X,
        n_cutpoints=self.n_cutpoints,
        discretization=self.discretization,
        target_data=Y,
        target_model=self.target_model,
        min_support=self.min_support,
    )

    # Classifiers get a categorical rule list, everything else a Gaussian one.
    rulelist_cls = CategoricalRuleList if is_nominal_target else GaussianRuleList
    initial_rulelist = rulelist_cls(
        data,
        self.task,
        self.max_depth,
        self.beam_width,
        self.min_support,
        self.max_rules,
        self.alpha_gain,
    )
    self._rulelist = initial_rulelist

    # The search returns the rule list that is kept as the fitted model.
    self._rulelist = greedy_and_beamsearch(data, self._rulelist)
    self._rulelist.add_description()

    # Runtime spans data preparation, search and description building.
    self.runtime = time() - start_time
    self.number_rules = self._rulelist.number_rules
    self.rule_sets = list(map(bitset2indexes, self._rulelist.bitset_rules))
    return self
def generate_inputvalues_one_target(constant_parameters):
    """Yield a ``Data`` object with one numeric target and a bit mask.

    ``constant_parameters`` must unpack into five values, which are forwarded
    positionally to ``Data`` around a freshly built target frame holding a
    single column ``target1`` with the values 0..99.

    Yields
    ------
    tuple
        ``(data_class, bit_mask(data_class.number_instances))``.
    """
    (
        features,
        n_cutpoints,
        discretization,
        target_setting,
        min_support,
    ) = constant_parameters

    # Seeds NumPy's global RNG before the data is built.
    np.random.seed(seed=42)

    # targets
    target_frame = pd.DataFrame(data={"target1": np.arange(100)})

    data_class = Data(
        features,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )
    statistic_mask = bit_mask(data_class.number_instances)
    yield data_class, statistic_mask
def generate_inputvalues_one_target(constant_parameters):
    """Yield a ``Data`` object with one two-valued string target.

    ``constant_parameters`` must unpack into five values, which are forwarded
    positionally to ``Data`` around a freshly built target frame. The target
    column ``target1`` is "below50" for the first 50 rows and "above49" for
    the remaining 50.

    Yields
    ------
    The constructed ``Data`` instance.
    """
    (
        features,
        n_cutpoints,
        discretization,
        target_setting,
        min_support,
    ) = constant_parameters

    # targets
    labels = np.array(["below50"] * 50 + ["above49"] * 50)
    target_frame = pd.DataFrame(data={"target1": labels})

    data_class = Data(
        features,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )
    # The mask is still computed as in the original, but it is not yielded.
    _unused_mask = bit_mask(data_class.number_instances)
    yield data_class
def generate_input_dataframe_two_target_variancezero(constant_parameters):
    """Yield a 100-row ``Data`` object whose second target is constant.

    Inputs are ``attribute1`` (0..99) and ``attribute2`` ("below50" for the
    first 50 rows, "above49" afterwards). Targets are ``target1`` (0..99) and
    ``target2`` (all ones). ``constant_parameters`` must unpack into four
    values, which are forwarded positionally to ``Data``.

    Yields
    ------
    The constructed ``Data`` instance.
    """
    n_cutpoints, discretization, target_setting, min_support = constant_parameters

    n_rows = 100
    attribute_frame = pd.DataFrame(
        data={
            "attribute1": np.arange(n_rows),
            "attribute2": np.array(["below50"] * 50 + ["above49"] * 50),
        }
    )
    target_frame = pd.DataFrame(
        data={
            "target1": np.arange(n_rows),
            "target2": np.ones(n_rows),
        }
    )

    yield Data(
        attribute_frame,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )
def generate_input_dataframe_one_target(constant_parameters):
    """Yield a 100000-row ``Data`` object with one normally drawn target.

    Inputs are ``attribute1`` (0..99999) and ``attribute2`` ("below1000" for
    the first 1000 rows, "above999" afterwards). The target ``target1`` is
    drawn from ``np.random.normal(loc=1, scale=1)``; no seed is set here.
    ``constant_parameters`` must unpack into four values, which are forwarded
    positionally to ``Data``.

    Yields
    ------
    The constructed ``Data`` instance.
    """
    n_cutpoints, discretization, target_setting, min_support = constant_parameters

    n_rows = 100000
    n_below = 1000
    category_column = np.array(
        ["below1000"] * n_below + ["above999"] * (n_rows - n_below)
    )
    attribute_frame = pd.DataFrame(
        data={"attribute1": np.arange(n_rows), "attribute2": category_column}
    )

    target_values = np.random.normal(loc=1, scale=1, size=n_rows)
    target_frame = pd.DataFrame(data={"target1": target_values})

    yield Data(
        attribute_frame,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )
def generate_inputvalues_two_targets(constant_parameters):
    """Yield a ``Data`` object with two two-valued string targets.

    ``target1`` is "below50" for the first 50 rows and "above49" afterwards.
    ``target2`` is "below99" for the first 99 rows and "above99" for the last
    one. ``constant_parameters`` must unpack into five values, which are
    forwarded positionally to ``Data`` around the target frame.

    Yields
    ------
    The constructed ``Data`` instance.
    """
    (
        features,
        n_cutpoints,
        discretization,
        target_setting,
        min_support,
    ) = constant_parameters

    # targets
    first_target = np.array(["below50"] * 50 + ["above49"] * 50)
    second_target = np.array(["below99"] * 99 + ["above99"])
    target_frame = pd.DataFrame(
        data={"target1": first_target, "target2": second_target}
    )

    yield Data(
        features,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )
def generate_input_dataframe_two_target_normal(constant_parameters):
    """Yield a 100000-row ``Data`` object with two two-segment normal targets.

    Inputs are ``attribute1`` (0..99999) and ``attribute2`` ("below1000" for
    the first 1000 rows, "above999" afterwards). Each target concatenates
    16666 draws from one normal distribution with 83334 draws from another:
    ``target1`` uses (loc=20, scale=3) then (loc=100, scale=6), and
    ``target2`` uses (loc=10, scale=2) then (loc=50, scale=5). No seed is
    set here. ``constant_parameters`` must unpack into four values, which are
    forwarded positionally to ``Data``.

    Yields
    ------
    The constructed ``Data`` instance.
    """
    n_cutpoints, discretization, target_setting, min_support = constant_parameters

    n_rows = 100000
    n_below = 1000
    first_segment = 16666
    second_segment = 83334

    category_column = np.array(
        ["below1000"] * n_below + ["above999"] * (n_rows - n_below)
    )
    attribute_frame = pd.DataFrame(
        data={"attribute1": np.arange(n_rows), "attribute2": category_column}
    )

    # Draw order matters for the global RNG stream: both target1 segments
    # first, then both target2 segments.
    target1_low = np.random.normal(loc=20, scale=3, size=first_segment)
    target1_high = np.random.normal(loc=100, scale=6, size=second_segment)
    target2_low = np.random.normal(loc=10, scale=2, size=first_segment)
    target2_high = np.random.normal(loc=50, scale=5, size=second_segment)

    target_frame = pd.DataFrame(
        data={
            "target1": np.concatenate((target1_low, target1_high), axis=None),
            "target2": np.concatenate((target2_low, target2_high), axis=None),
        }
    )

    yield Data(
        attribute_frame,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )
def test_gaussian_twotargets(self, generate_input_dataframe_two_target, constant_parameters):
    """Check ``Data`` built from two-attribute, two-target frames.

    The fixture value must unpack into an input frame and a target frame.
    ``Data`` must expose both frames unchanged and report 2 attributes,
    2 targets, 100 instances and the expected attribute/target name sets.
    """
    attribute_frame, target_frame = generate_input_dataframe_two_target
    n_cutpoints, discretization, target_setting, min_support = constant_parameters

    expected_number_targets = 2
    expected_number_attributes = 2
    expected_number_instances = 100
    expected_attribute_names = {"attribute1", "attribute2"}
    expected_target_names = {"target1", "target2"}

    output_data = Data(
        attribute_frame,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )

    # The frames must be stored unchanged.
    pd.testing.assert_frame_equal(attribute_frame, output_data.input_data)
    pd.testing.assert_frame_equal(target_frame, output_data.target_data)

    # Counts and names derived by Data.
    assert expected_number_attributes == output_data.number_attributes
    assert expected_number_attributes == len(output_data.attributes)
    assert expected_number_targets == output_data.number_targets
    assert expected_number_instances == output_data.number_instances
    assert expected_attribute_names == output_data.attribute_names
    assert expected_target_names == output_data.target_names


@pytest.mark.xfail
def test_name_not_present(self):
    """Placeholder with no checks yet; marked as an expected failure."""


@pytest.mark.xfail
def test_category_not_present(self):
    """Placeholder with no checks yet; marked as an expected failure."""


@pytest.mark.xfail
def test_receives_series(self):
    """Placeholder with no checks yet; marked as an expected failure."""
def test_gaussian_onetarget(self, generate_input_dataframe_one_target, constant_parameters):
    """Check ``Data`` built from two-attribute, one-target frames.

    The fixture value must unpack into an input frame and a target frame.
    ``Data`` must expose both frames unchanged and report 2 attributes,
    1 target, 100 instances and the expected attribute/target name sets.
    """
    attribute_frame, target_frame = generate_input_dataframe_one_target
    n_cutpoints, discretization, target_setting, min_support = constant_parameters

    expected_number_targets = 1
    expected_number_attributes = 2
    expected_number_instances = 100
    expected_attribute_names = {"attribute1", "attribute2"}
    expected_target_names = {"target1"}

    output_data = Data(
        attribute_frame,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )

    # The frames must be stored unchanged.
    pd.testing.assert_frame_equal(attribute_frame, output_data.input_data)
    pd.testing.assert_frame_equal(target_frame, output_data.target_data)

    # Counts and names derived by Data.
    assert expected_number_attributes == output_data.number_attributes
    assert expected_number_attributes == len(output_data.attributes)
    assert expected_number_targets == output_data.number_targets
    assert expected_number_instances == output_data.number_instances
    assert expected_attribute_names == output_data.attribute_names
    assert expected_target_names == output_data.target_names
def generate_input_dataframe_two_target(constant_parameters):
    """Yield a ``Data`` object built entirely from ``constant_parameters``.

    ``constant_parameters`` must unpack into six values: the input frame,
    the target frame, and four further settings. They are passed to ``Data``
    with the target frame moved to the fourth positional slot.

    Yields
    ------
    The constructed ``Data`` instance.
    """
    (
        attribute_frame,
        target_frame,
        n_cutpoints,
        discretization,
        target_setting,
        min_support,
    ) = constant_parameters

    yield Data(
        attribute_frame,
        n_cutpoints,
        discretization,
        target_frame,
        target_setting,
        min_support,
    )