def test_different_input_sizes(self):
    vector_1 = [2, 0]
    vector_2 = [1, 2, 3]
    with self.assertRaises(AssertionError):
        mutual_information(vector_1, vector_2)
    with self.assertRaises(AssertionError):
        mutual_information(vector_2, vector_1)
Example #2
def jmi(data, target_variable, prev_variables_index, candidate_variable_index,
        **kwargs):
    """
    This estimator computes the Joint Mutual Information criterion.

    Parameters
    ----------
    data : np.ndarray
        Data set matrix of shape (n, p). Columns are variables, rows are
        observations.
    target_variable : np.ndarray
        Vector of target variable values. Must not be a column of data.
    prev_variables_index : list of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of the candidate variable in the data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Joint Mutual Information.
    """

    assert isinstance(data, np.ndarray), \
        "Argument 'data' must be a numpy array"
    assert isinstance(target_variable, np.ndarray), \
        "Argument 'target_variable' must be a numpy array"
    assert isinstance(candidate_variable_index, int), \
        "Argument 'candidate_variable_index' must be an integer"

    assert len(data.shape) == 2, \
        "For 'data' argument use a numpy array of shape (n, p)"
    assert data.shape[0] == len(target_variable), \
        "Number of rows in 'data' must equal 'target_variable' length"
    assert candidate_variable_index < data.shape[1], \
        "Index 'candidate_variable_index' out of range in 'data'"

    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."
    candidate_variable = data[:, candidate_variable_index]
    # Guard against division by zero when no variables were selected yet.
    prev_variables_len = max(len(prev_variables_index), 1)

    redundancy_sum = 0
    for var in prev_variables_index:
        a = mutual_information(data[:, var], candidate_variable)
        b = conditional_mutual_information(data[:, var], candidate_variable,
                                           target_variable)
        redundancy_sum += a - b

    return mutual_information(candidate_variable, target_variable) \
        - redundancy_sum / prev_variables_len
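For reference, the quantity approximated above is J(X_k) = I(X_k; Y) - (1/|S|) * sum_{j in S} [I(X_j; X_k) - I(X_j; X_k | Y)], where S is the set of previously selected variables and Y is the target. A minimal usage sketch follows; the toy data and seed are illustrative assumptions, and jmi plus the mutual_information helpers it calls are assumed importable from this module:

import numpy as np

# Toy discrete data set: 100 observations, 5 candidate variables (assumed).
rng = np.random.default_rng(0)
data = rng.integers(0, 3, size=(100, 5))
target = rng.integers(0, 2, size=100)

# Score candidate variable 4, given that variables 0 and 2 were already
# selected; a larger value means a more attractive candidate.
score = jmi(data, target,
            prev_variables_index=[0, 2],
            candidate_variable_index=4)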
Example #3
def test_theoretical_output(self):
    """
    proper_value is calculated with the R function infotheo::mutinformation(method="emp")
    """
    input_1 = [9, 8, 7, 6, 5, 4, 3, 2, 9]
    input_2 = [1, 1, 1, 1, 0, 0, 0, 0, 0]
    proper_value_1 = 0.5329289
    self.assertAlmostEqual(mutual_information(input_1, input_2),
                           proper_value_1,
                           places=3)

    input_3 = [0, 0, 0, 0, 1, 0, 0, 0, 0]
    input_4 = [1, 1, 1, 1, 0, 0, 0, 0, 0]
    proper_value_2 = 0.07083075
    self.assertAlmostEqual(mutual_information(input_3, input_4),
                           proper_value_2,
                           places=3)
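The reference values above come from the empirical (plug-in) estimator, I(X; Y) = sum_{x,y} p(x, y) * log(p(x, y) / (p(x) * p(y))) in nats, with the probabilities replaced by observed frequencies. A self-contained sketch of that estimator, written here from the definition rather than taken from this project's mutual_information:

import numpy as np

def empirical_mi(x, y):
    # Plug-in mutual information (natural log) from joint frequencies.
    x, y = np.asarray(x), np.asarray(y)
    mi = 0.0
    for xv in np.unique(x):
        for yv in np.unique(y):
            p_xy = np.mean((x == xv) & (y == yv))
            if p_xy > 0:
                mi += p_xy * np.log(
                    p_xy / (np.mean(x == xv) * np.mean(y == yv)))
    return mi

# Reproduces proper_value_1 up to rounding: ~0.5329
empirical_mi([9, 8, 7, 6, 5, 4, 3, 2, 9], [1, 1, 1, 1, 0, 0, 0, 0, 0])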
Example #4
def test_theoretical_value(self):
    integer_matrix = np.random.randint(0, 10, (100, 50))
    diverse_target = np.random.randint(0, 10, (100))
    candidate_index = 1
    self.assertAlmostEqual(mim(integer_matrix, diverse_target,
                               candidate_index),
                           mutual_information(
                               diverse_target,
                               integer_matrix[:, candidate_index]),
                           places=5)
Example #5
def mim(data, target_variable, candidate_variable_index, **kwargs):
    """
    This estimator computes the Mutual Information Maximisation criterion.

    Parameters
    ----------
    data : np.ndarray
        Data set matrix of shape (n, p). Columns are variables, rows are
        observations.
    target_variable : np.ndarray
        Vector of target variable values. Must not be a column of data.
    candidate_variable_index : int
        Index of the candidate variable in the data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Mutual Information Maximisation.
    """

    assert isinstance(data, np.ndarray), \
        "Argument 'data' must be a numpy array"
    assert isinstance(target_variable, np.ndarray), \
        "Argument 'target_variable' must be a numpy array"
    assert isinstance(candidate_variable_index, int), \
        "Argument 'candidate_variable_index' must be an integer"

    assert len(data.shape) == 2, \
        "For 'data' argument use a numpy array of shape (n, p)"
    assert data.shape[0] == len(target_variable), \
        "Number of rows in 'data' must equal 'target_variable' length"
    assert candidate_variable_index < data.shape[1], \
        "Index 'candidate_variable_index' out of range in 'data'"

    candidate_variable = data[:, candidate_variable_index]
    return mutual_information(candidate_variable, target_variable)
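Since MIM scores a candidate purely by its relevancy, J(X_k) = I(X_k; Y), a call to mim is equivalent to computing the mutual information directly, as the test in Example #4 verifies. A quick sketch under the same illustrative toy-data assumption:

import numpy as np

rng = np.random.default_rng(0)
data = rng.integers(0, 10, size=(100, 50))
target = rng.integers(0, 10, size=100)

# Both expressions evaluate the same quantity: I(X_1; Y).
assert np.isclose(mim(data, target, candidate_variable_index=1),
                  mutual_information(data[:, 1], target))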
Example #6
def test_criterion_filter_values(self):
    target = [1, 0, 1, 1, 0, 1, 0, 1]
    a = [1, 0, 0, 1, 0, 1, 0, 1]
    b = [1, 0, 0, 1, 1, 1, 0, 1]
    c = [1, 1, 1, 1, 1, 1, 1, 0]
    d = [1, 0, 0, 1, 1, 0, 0, 1]

    X = np.array([a, b, c, d]).transpose()
    y = np.array(target).transpose()
    costs = [1, 0.5, 0.25, 0.1]
    normalized_costs = list((np.array(costs) - min(costs) + 0.0001) /
                            (max(costs) - min(costs) + 0.0001))

    # MIM
    r = 0
    feature_index, filter_value, criterion_value, cost = fraction_find_best_feature(
        j_criterion_func=mim,
        r=r,
        data=X,
        target_variable=y,
        possible_variables_index=[0, 1, 2, 3],
        costs=costs,
        normalized_costs=normalized_costs)
    self.assertAlmostEqual(mutual_information(y, X[:, feature_index]),
                           criterion_value)

    r = 1.2
    feature_index, filter_value, criterion_value, cost = fraction_find_best_feature(
        j_criterion_func=mim,
        r=r,
        data=X,
        target_variable=y,
        possible_variables_index=[0, 1, 2, 3],
        costs=costs,
        normalized_costs=normalized_costs)
    self.assertAlmostEqual(mutual_information(y, X[:, feature_index]),
                           criterion_value)
    self.assertAlmostEqual(
        mutual_information(y, X[:, feature_index]) /
        normalized_costs[feature_index]**r, filter_value)

    # MIFS
    r = 0
    feature_index, filter_value, criterion_value, cost = fraction_find_best_feature(
        j_criterion_func=mifs,
        r=r,
        data=X,
        target_variable=y,
        possible_variables_index=[1, 2],
        costs=costs,
        normalized_costs=normalized_costs,
        prev_variables_index=[0, 3])
    mifs_value = mutual_information(
        y, X[:, feature_index]) - mutual_information(
            X[:, feature_index], X[:, 0]) - mutual_information(
                X[:, feature_index], X[:, 3])
    self.assertAlmostEqual(mifs_value, criterion_value)

    r = 1
    feature_index, filter_value, criterion_value, cost = fraction_find_best_feature(
        j_criterion_func=mifs,
        r=r,
        data=X,
        target_variable=y,
        possible_variables_index=[1, 2],
        costs=costs,
        normalized_costs=normalized_costs,
        prev_variables_index=[0, 3])
    mifs_value = mutual_information(
        y, X[:, feature_index]) - mutual_information(
            X[:, feature_index], X[:, 0]) - mutual_information(
                X[:, feature_index], X[:, 3])
    m = abs(
        min([
            mutual_information(y, X[:, 1]) -
            mutual_information(X[:, 1], X[:, 0]) -
            mutual_information(X[:, 1], X[:, 3]),
            mutual_information(y, X[:, 2]) -
            mutual_information(X[:, 2], X[:, 0]) -
            mutual_information(X[:, 2], X[:, 3])
        ]))
    self.assertAlmostEqual(mifs_value, criterion_value)
    self.assertAlmostEqual(
        (mifs_value + m) / normalized_costs[feature_index]**r,
        filter_value)
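What this test pins down about fraction_find_best_feature: candidates appear to be ranked by filter_value = (criterion_value + shift) / normalized_cost**r, where the shift is 0 for a non-negative criterion such as MIM and |min(candidate scores)| for criteria that can go negative, such as MIFS. A tiny sketch of that relation; the helper name is hypothetical, not part of the library:

def fraction_filter_value(criterion_value, normalized_cost, r, shift=0.0):
    # Hypothetical helper mirroring the relation asserted above: trade
    # criterion value against feature cost, with an optional shift that
    # keeps the numerator non-negative.
    return (criterion_value + shift) / normalized_cost**r

# With r = 0 the denominator is cost**0 == 1, so costs are ignored and
# the ranking falls back to the raw criterion value.
assert fraction_filter_value(0.3, 0.5, r=0) == 0.3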
Example #7
def test_one_number_input(self):
    self.assertEqual(mutual_information([1], [0]), 0.0)
Example #8
def test_empty_input(self):
    vector_1 = []
    vector_2 = []
    with self.assertRaises(AssertionError):
        mutual_information(vector_1, vector_2)
Example #9
def test_commutative_property(self):
    input_1 = [9, 8, 7, 6, 5, 4, 3, 2, 9]
    input_2 = [1, 1, 1, 1, 0, 0, 0, 0, 0]

    self.assertEqual(mutual_information(input_1, input_2),
                     mutual_information(input_2, input_1))
Example #10
def test_the_same_vectors(self):
    input_1 = [9, 8, 7, 6, 5, 4, 3, 2, 9]
    input_2 = [9, 8, 7, 6, 5, 4, 3, 2, 9]

    self.assertEqual(mutual_information(input_1, input_2),
                     entropy(input_1))
Example #11
def test_nparray_input(self):
    vector_1 = np.array([1, 2, 3, 5, 432, 42, 31234, 342, 34])
    vector_2 = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0])
    self.assertIsInstance(mutual_information(vector_1, vector_2), float)
Example #12
def test_list_input(self):
    vector_1 = [1, 2, 3, 5, 432, 42, 31234, 342, 34]
    vector_2 = [1, 1, 1, 1, 0, 0, 0, 0, 0]
    self.assertIsInstance(mutual_information(vector_1, vector_2), float)
Example #13
def mifs(data, target_variable, prev_variables_index, candidate_variable_index,
         **kwargs):
    """
    This estimator computes the Mutual Information Feature Selection criterion.

    Parameters
    ----------
    data : np.ndarray
        Data set matrix of shape (n, p). Columns are variables, rows are
        observations.
    target_variable : np.ndarray
        Vector of target variable values. Must not be a column of data.
    prev_variables_index : list or set of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of the candidate variable in the data matrix.
    beta : float
        Weight of the redundancy term in the MIFS approximation. The higher
        the beta, the stronger the redundancy penalty.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Mutual Information Feature Selection.
    """
    assert isinstance(data, np.ndarray), \
        "Argument 'data' must be a numpy array"
    assert isinstance(target_variable, np.ndarray), \
        "Argument 'target_variable' must be a numpy array"
    assert isinstance(candidate_variable_index, int), \
        "Argument 'candidate_variable_index' must be an integer"

    assert len(data.shape) == 2, \
        "For 'data' argument use a numpy array of shape (n, p)"
    assert data.shape[0] == len(target_variable), \
        "Number of rows in 'data' must equal 'target_variable' length"
    assert candidate_variable_index < data.shape[1], \
        "Index 'candidate_variable_index' out of range in 'data'"

    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    if kwargs.get('beta') is None:
        beta = 1
        warnings.warn(
            "Parameter `beta` not provided, default value of 1 is selected.",
            Warning)
    else:
        beta = kwargs.pop('beta')

    assert isinstance(beta, (int, float)), \
        "Argument 'beta' must be int or float"

    candidate_variable = data[:, candidate_variable_index]

    redundancy_sum = 0
    for var in prev_variables_index:
        redundancy_sum += mutual_information(data[:, var], candidate_variable)

    return mutual_information(candidate_variable,
                              target_variable) - beta * redundancy_sum
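mifs therefore approximates J(X_k) = I(X_k; Y) - beta * sum_{j in S} I(X_j; X_k). A minimal usage sketch under the same illustrative toy-data assumption; omitting beta falls back to 1 and emits the warning above:

import numpy as np

rng = np.random.default_rng(0)
data = rng.integers(0, 3, size=(100, 5))
target = rng.integers(0, 2, size=100)

# beta weights the redundancy penalty; beta=0 reduces MIFS to MIM.
score = mifs(data, target,
             prev_variables_index=[0, 2],
             candidate_variable_index=4,
             beta=0.5)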