Example No. 1
    def fit(self, X, Y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : pandas DataFrame with the explanatory variables (named columns).
        Y : pandas DataFrame or Series with the target variable(s).

        Returns
        -------
        self : object
        """
        is_nominal_target = is_classifier(self)
        start_time = time()

        data = Data(input_data=X, n_cutpoints=self.n_cutpoints, discretization=self.discretization,
                    target_data=Y, target_model=self.target_model, min_support=self.min_support)

        if is_nominal_target:
            self._rulelist = CategoricalRuleList(data, self.task, self.max_depth, self.beam_width, self.min_support, self.max_rules,
                                                       self.alpha_gain)
        else:
            self._rulelist = GaussianRuleList(data, self.task, self.max_depth, self.beam_width, self.min_support, self.max_rules,
                                                       self.alpha_gain)
        self._rulelist = greedy_and_beamsearch(data, self._rulelist)
        self._rulelist.add_description()
        self.runtime = time() - start_time
        self.number_rules = self._rulelist.number_rules
        self.rule_sets = [bitset2indexes(bitset) for bitset in self._rulelist.bitset_rules]

        return self
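A minimal usage sketch of the fit flow above, not taken from the source: the concrete estimator name (RuleListRegressor) and its constructor arguments are assumptions for illustration; any subclass of the base class that defines target_model and task would do.

import numpy as np
import pandas as pd

# Hypothetical concrete subclass of the base estimator shown above.
model = RuleListRegressor(max_depth=4, beam_width=50, min_support=0.05, n_cutpoints=5)

rng = np.random.default_rng(0)
X = pd.DataFrame({"x1": rng.random(1000), "x2": rng.random(1000)})
Y = pd.Series(2.0 * X["x1"] + rng.normal(size=1000), name="y")

model.fit(X, Y)            # builds Data, runs greedy_and_beamsearch, stores the rule list
print(model.number_rules)  # number of rules found
print(model.runtime)       # wall-clock fitting time in seconds
print(model)               # __str__ prints the rule list description once fitted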
Example No. 2
    def test_add_rule_2items(self, search_parameters,
                             generate_input_dataframe_two_target_normal,
                             generate_subgroup_2subgroups):
        data = generate_input_dataframe_two_target_normal
        input_target_model, input_max_depth, input_beam_width, input_minsupp, input_max_rules, input_alpha_gain = search_parameters
        subgroup2add1, subgroup2add2 = generate_subgroup_2subgroups
        input_task = "discovery"

        output_ruleset = GaussianRuleList(data, input_task, input_max_depth,
                                          input_beam_width, input_minsupp,
                                          input_max_rules, input_alpha_gain)
        output_ruleset.add_rule(subgroup2add1, data)
        output_ruleset.add_rule(subgroup2add2, data)

        expected_number_instances = data.number_instances
        expected_bitset_uncovered = mpz()
        expected_bitset_covered = bit_mask(100000)
        expected_number_rules = 2
        expected_length_model = universal_code_integers(2) + \
                                universal_code_integers(1) + uniform_combination_code(1, 2) + \
                                universal_code_integers_maximum(1, 2) + uniform_code(10) + \
                                universal_code_integers(1) + uniform_combination_code(1, 2) + \
                                universal_code_integers_maximum(1, 1) + uniform_code(2)

        actual_numberinstances1 = popcount(output_ruleset.subgroups[0].bitarray) + \
                                  popcount(output_ruleset.subgroups[1].bitarray & ~output_ruleset.subgroups[0].bitarray) + \
                                  popcount(output_ruleset.bitset_uncovered)
        actual_numberinstances2 = output_ruleset.support_covered + output_ruleset.support_uncovered
        actual_numberinstances3 = popcount(output_ruleset.bitset_covered) + \
                                  popcount(output_ruleset.bitset_uncovered)
        actual_numberinstances4 = output_ruleset.subgroups[0].usage + output_ruleset.subgroups[1].usage + \
                                  output_ruleset.default_rule_statistics.usage

        assert expected_number_instances == actual_numberinstances1
        assert expected_number_instances == actual_numberinstances2
        assert expected_number_instances == actual_numberinstances3
        assert expected_number_instances == actual_numberinstances4
        assert expected_bitset_uncovered == output_ruleset.bitset_uncovered
        assert expected_bitset_covered == output_ruleset.bitset_covered
        assert expected_number_rules == output_ruleset.number_rules
        assert expected_length_model == pytest.approx(
            output_ruleset.length_model)
Example No. 3
    def test_add_rule_itemnumeric(self, search_parameters,
                                  generate_input_dataframe_two_target_normal,
                                  generate_subgroup_oneitem_numeric):
        data = generate_input_dataframe_two_target_normal
        input_target_model, input_max_depth, input_beam_width, input_minsupp, input_max_rules, input_alpha_gain = search_parameters
        subgroup2add = generate_subgroup_oneitem_numeric
        input_task = "discovery"

        output_ruleset = GaussianRuleList(data, input_task, input_max_depth,
                                          input_beam_width, input_minsupp,
                                          input_max_rules, input_alpha_gain)
        output_ruleset.add_rule(subgroup2add, data)

        expected_number_instances = data.number_instances
        expected_bitset_uncovered = indexes2bitset(
            [i for i in range(expected_number_instances) if i > 16666])
        expected_bitset_covered = indexes2bitset(
            [i for i in range(expected_number_instances) if i < 16666 + 1])
        expected_number_rules = 1
        expected_length_model = universal_code_integers(1) + universal_code_integers(1) + \
                                uniform_combination_code(1, 2) + universal_code_integers_maximum(1, 2) + \
                                uniform_code(10)

        actual_numberinstances1 = popcount(output_ruleset.subgroups[0].bitarray) + \
                                  popcount(output_ruleset.bitset_uncovered)
        actual_numberinstances2 = output_ruleset.support_covered + output_ruleset.support_uncovered
        actual_numberinstances3 = popcount(output_ruleset.bitset_covered) + \
                                  popcount(output_ruleset.bitset_uncovered)
        actual_numberinstances4 = output_ruleset.subgroups[0].usage + \
                                  output_ruleset.default_rule_statistics.usage

        assert expected_number_instances == actual_numberinstances1
        assert expected_number_instances == actual_numberinstances2
        assert expected_number_instances == actual_numberinstances3
        assert expected_number_instances == actual_numberinstances4
        assert expected_bitset_uncovered == output_ruleset.bitset_uncovered
        assert expected_bitset_covered == output_ruleset.bitset_covered
        assert expected_number_rules == output_ruleset.number_rules
        assert expected_length_model == pytest.approx(
            output_ruleset.length_model)
Example No. 4
def make_rulelist(generate_input_dataframe_two_target_normal):
    data = generate_input_dataframe_two_target_normal
    input_target_model = "gaussian"
    input_task = "discovery"
    input_max_depth = 5
    input_beam_width = 10
    input_max_rules = 10
    input_alpha_gain = 1
    input_minsupp = 0
    input_ruleset = GaussianRuleList(data, input_task, input_max_depth,
                                     input_beam_width, input_minsupp,
                                     input_max_rules, input_alpha_gain)
    yield input_ruleset
Example No. 5
    def test_initialization_1target_prediction(
            self, search_parameters, generate_input_dataframe_one_target):
        # one target, prediction task: the default rule length should exceed the lower bound
        data = generate_input_dataframe_one_target
        input_target_model, input_max_depth, input_beam_width, input_minsupp, input_max_rules, input_alpha_gain = search_parameters
        input_task = "prediction"

        expected_statistic_type = GaussianFreeStatistic
        expected_length_data_minimum = 0.5 * 100000 + 0.5 * 100000

        output_ruleset = GaussianRuleList(data, input_task, input_max_depth,
                                          input_beam_width, input_minsupp,
                                          input_max_rules, input_alpha_gain)

        assert isinstance(output_ruleset.default_rule_statistics,
                          expected_statistic_type)
        assert expected_length_data_minimum < output_ruleset.length_defaultrule
Example No. 6
    def test_initialization_2targets_prediction(
            self, search_parameters,
            generate_input_dataframe_two_target_normal):
        data = generate_input_dataframe_two_target_normal
        input_target_model, input_max_depth, input_beam_width, input_minsupp, input_max_rules, input_alpha_gain = search_parameters
        input_task = "prediction"

        expected_statistic_type = GaussianFreeStatistic
        expected_length_data_minimum = 0.5 * 100000 * (log2(
            2 * pi * 1 * exp(1))) + 0.5 * 100000 * (log2(2 * pi * 1 * exp(1)))

        output_ruleset = GaussianRuleList(data, input_task, input_max_depth,
                                          input_beam_width, input_minsupp,
                                          input_max_rules, input_alpha_gain)

        assert isinstance(output_ruleset.default_rule_statistics,
                          expected_statistic_type)
        assert expected_length_data_minimum < output_ruleset.length_defaultrule
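The expected lower bound in this test is the Shannon entropy of a unit-variance Gaussian, 0.5 * n * log2(2 * pi * e * sigma^2) bits per target, summed over the two targets. A small standalone sketch of that computation (an illustration, not code from the source):

from math import exp, log2, pi

def gaussian_entropy_bits(n, variance):
    # Minimum expected code length, in bits, for n values drawn from a
    # Gaussian with the given variance (n times the differential entropy).
    return 0.5 * n * log2(2 * pi * variance * exp(1))

# Two targets, 100000 instances each, unit variance, as in the test above.
lower_bound = 2 * gaussian_entropy_bits(100000, 1.0)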
Example No. 7
    def test_initialization_lengthinfinity(
            self, generate_input_dataframe_two_target_variancezero,
            search_parameters):
        # the length of the datastructure is infinity
        data = generate_input_dataframe_two_target_variancezero
        input_target_model, input_max_depth, input_beam_width, input_minsupp, input_max_rules, input_alpha_gain = search_parameters
        input_task = "discovery"

        expected_statistic_type = GaussianFixedStatistic
        expected_length_data = np.inf
        expected_length_original = np.inf
        expected_length_defaultrule = np.inf

        output_ruleset = GaussianRuleList(data, input_task, input_max_depth,
                                          input_beam_width, input_minsupp,
                                          input_max_rules, input_alpha_gain)

        assert isinstance(output_ruleset.default_rule_statistics,
                          expected_statistic_type)
        assert expected_length_data == output_ruleset.length_data
        assert expected_length_original == output_ruleset.length_original
        assert expected_length_defaultrule == output_ruleset.length_defaultrule
Example No. 8
class BaseRuleList(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for rule lists.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def __init__(self, *, max_depth, beam_width, min_support, n_cutpoints, discretization="static",
                 max_rules=np.inf, alpha_gain=1.0):

        if not isinstance(max_depth, (int, np.integer)) or max_depth < 1:
            raise ValueError("max_depth incorrectly selected, please select a "
                             "positive integer greater than or equal to 1.")

        if not isinstance(beam_width, (int, np.integer)) or beam_width < 1:
            raise ValueError("beam_width incorrectly selected, please select a "
                             "positive integer greater than or equal to 1.")

        if not isinstance(n_cutpoints, (int, np.integer)) or n_cutpoints < 2:
            raise ValueError("n_cutpoints incorrectly selected, please select an "
                             "integer greater than or equal to 2.")

        if discretization not in ["static","dynamic"]:
            raise ValueError("At this moment we only support \"static\" or \"dynamic\" discretizations.")

        if not (max_rules == np.inf or isinstance(max_rules, (int, np.integer))) or max_rules < 0:
            raise ValueError("max_rules incorrectly selected, please select "
                             "zero, a positive integer, or np.inf.")

        if alpha_gain < 0 or alpha_gain > 1:
            raise ValueError("alpha_gain incorrectly selected, please select a "
                             "value between zero and one inclusive.")

        self.alpha_gain = alpha_gain
        self.max_depth = max_depth
        self.beam_width = beam_width
        self.min_support = min_support
        self.n_cutpoints = n_cutpoints
        self.discretization = discretization
        self.number_rules = 0
        self.max_rules = max_rules
        self._rulelist = None

    #TODO:  def __repr__
    def __str__(self):
        text2print = self._rulelist.description if self.number_rules > 0 else "Model not fitted"
        return text2print

    def fit(self, X, Y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : pandas DataFrame with the explanatory variables (named columns).
        Y : pandas DataFrame or Series with the target variable(s).

        Returns
        -------
        self : object
        """
        is_nominal_target = is_classifier(self)
        start_time = time()

        data = Data(input_data=X, n_cutpoints=self.n_cutpoints, discretization=self.discretization,
                    target_data=Y, target_model=self.target_model, min_support=self.min_support)

        if is_nominal_target:
            self._rulelist = CategoricalRuleList(data, self.task, self.max_depth, self.beam_width, self.min_support, self.max_rules,
                                                       self.alpha_gain)
        else:
            self._rulelist = GaussianRuleList(data, self.task, self.max_depth, self.beam_width, self.min_support, self.max_rules,
                                                       self.alpha_gain)
        self._rulelist = greedy_and_beamsearch(data, self._rulelist)
        self._rulelist.add_description()
        self.runtime = time() - start_time
        self.number_rules = self._rulelist.number_rules
        self.rule_sets = [bitset2indexes(bitset) for bitset in self._rulelist.bitset_rules]

        return self


    def predict(self, X):
        """Predict the target variable for the input data X.

        Parameters
        ----------
        X : a numpy array or pandas DataFrame with the variables in the same
            position (column number) as given in the "fit" function.

        Returns
        -------
        y_hat : numpy array with the predicted values according to the fitted
            rule list (obtained using the "fit" function above). y_hat has the
            same length as X.shape[0] (number of rows).
        """
        y_hat = predict_rulelist(X, self)
        return y_hat
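A short follow-up sketch for predict, continuing the hypothetical fit example after Example No. 1 (the model object and class name are assumptions, not from the source):

X_new = pd.DataFrame({"x1": rng.random(10), "x2": rng.random(10)})
y_hat = model.predict(X_new)         # delegates to predict_rulelist(X_new, model)
assert len(y_hat) == X_new.shape[0]  # one prediction per input row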