def test_different_input_sizes(self):
    """Vectors of unequal length must be rejected, regardless of order."""
    short_vec = [2, 0]
    long_vec = [1, 2, 3]
    for first, second in ((short_vec, long_vec), (long_vec, short_vec)):
        with self.assertRaises(AssertionError):
            mutual_information(first, second)
def jmi(data, target_variable, prev_variables_index, candidate_variable_index,
        **kwargs):
    """
    This estimator computes the Joint Mutual Information criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable vector. Can not be in data!
    prev_variables_index : list of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of candidate variable in data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Joint Mutual Information.
    """
    assert isinstance(data,
                      np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(
        target_variable,
        np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(
        candidate_variable_index,
        int), "Argument 'candidate_variable_index' must be an integer"
    assert len(
        data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(
        target_variable
    ), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[
        1], "Index 'candidate_variable_index' out of range in 'data'"
    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    candidate_variable = data[:, candidate_variable_index]

    # Guard against division by zero when no variables were selected yet;
    # the redundancy sum is 0 in that case, so the divisor value is moot.
    prev_variables_len = len(prev_variables_index) or 1

    redundancy_sum = 0
    for var in prev_variables_index:
        # I(X_var; X_cand) - I(X_var; X_cand | Y) for each selected variable.
        a = mutual_information(data[:, var], candidate_variable)
        b = conditional_mutual_information(data[:, var], candidate_variable,
                                           target_variable)
        redundancy_sum += a - b

    return mutual_information(
        candidate_variable,
        target_variable) - 1 / prev_variables_len * redundancy_sum
def test_theoretical_output(self):
    """
    Expected values calculated with the R function
    infotheo::mutinformation(method="emp").
    """
    cases = [
        ([9, 8, 7, 6, 5, 4, 3, 2, 9], [1, 1, 1, 1, 0, 0, 0, 0, 0],
         0.5329289),
        ([0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0],
         0.07083075),
    ]
    for left, right, expected in cases:
        self.assertAlmostEqual(mutual_information(left, right),
                               expected,
                               places=3)
def test_theoretical_value(self):
    """MIM criterion must equal the mutual information with the target."""
    feature_matrix = np.random.randint(0, 10, (100, 50))
    target_vector = np.random.randint(0, 10, (100))
    idx = 1
    expected = mutual_information(target_vector, feature_matrix[:, idx])
    self.assertAlmostEqual(mim(feature_matrix, target_vector, idx),
                           expected,
                           places=5)
def mim(data, target_variable, candidate_variable_index, **kwargs):
    """
    This estimator computes the Mutual Information Maximisation criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable vector. Can not be in data!
    candidate_variable_index : int
        Index of candidate variable in data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Mutual Information Maximisation.
    """
    assert isinstance(data,
                      np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(
        target_variable,
        np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(
        candidate_variable_index,
        int), "Argument 'candidate_variable_index' must be an integer"
    assert len(
        data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(
        target_variable
    ), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[
        1], "Index 'candidate_variable_index' out of range in 'data'"

    # MIM ignores previously selected variables: the criterion is simply
    # the mutual information between the candidate and the target.
    candidate_variable = data[:, candidate_variable_index]
    return mutual_information(candidate_variable, target_variable)
def test_criterion_filter_values(self):
    """Validate filter/criterion values from fraction_find_best_feature."""
    mi = mutual_information
    y = np.array([1, 0, 1, 1, 0, 1, 0, 1]).transpose()
    X = np.array([
        [1, 0, 0, 1, 0, 1, 0, 1],
        [1, 0, 0, 1, 1, 1, 0, 1],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 0, 0, 1, 1, 0, 0, 1],
    ]).transpose()
    costs = [1, 0.5, 0.25, 0.1]
    normalized_costs = list((np.array(costs) - min(costs) + 0.0001) /
                            (max(costs) - min(costs) + 0.0001))

    # MIM with r = 0: criterion is plain mutual information with target.
    feature_index, filter_value, criterion_value, cost = \
        fraction_find_best_feature(j_criterion_func=mim,
                                   r=0,
                                   data=X,
                                   target_variable=y,
                                   possible_variables_index=[0, 1, 2, 3],
                                   costs=costs,
                                   normalized_costs=normalized_costs)
    self.assertAlmostEqual(mi(y, X[:, feature_index]), criterion_value)

    # MIM with r = 1.2: filter value also divides by normalized cost ** r.
    r = 1.2
    feature_index, filter_value, criterion_value, cost = \
        fraction_find_best_feature(j_criterion_func=mim,
                                   r=r,
                                   data=X,
                                   target_variable=y,
                                   possible_variables_index=[0, 1, 2, 3],
                                   costs=costs,
                                   normalized_costs=normalized_costs)
    self.assertAlmostEqual(mi(y, X[:, feature_index]), criterion_value)
    self.assertAlmostEqual(
        mi(y, X[:, feature_index]) / normalized_costs[feature_index]**r,
        filter_value)

    # MIFS with r = 0: relevance minus redundancy with already-chosen 0, 3.
    feature_index, filter_value, criterion_value, cost = \
        fraction_find_best_feature(j_criterion_func=mifs,
                                   r=0,
                                   data=X,
                                   target_variable=y,
                                   possible_variables_index=[1, 2],
                                   costs=costs,
                                   normalized_costs=normalized_costs,
                                   prev_variables_index=[0, 3])
    mifs_value = (mi(y, X[:, feature_index]) -
                  mi(X[:, feature_index], X[:, 0]) -
                  mi(X[:, feature_index], X[:, 3]))
    self.assertAlmostEqual(mifs_value, criterion_value)

    # MIFS with r = 1: criterion may be negative, so the filter value is
    # shifted by the absolute value of the minimum candidate criterion.
    r = 1
    feature_index, filter_value, criterion_value, cost = \
        fraction_find_best_feature(j_criterion_func=mifs,
                                   r=r,
                                   data=X,
                                   target_variable=y,
                                   possible_variables_index=[1, 2],
                                   costs=costs,
                                   normalized_costs=normalized_costs,
                                   prev_variables_index=[0, 3])
    mifs_value = (mi(y, X[:, feature_index]) -
                  mi(X[:, feature_index], X[:, 0]) -
                  mi(X[:, feature_index], X[:, 3]))
    m = abs(
        min([
            mi(y, X[:, 1]) - mi(X[:, 1], X[:, 0]) - mi(X[:, 1], X[:, 3]),
            mi(y, X[:, 2]) - mi(X[:, 2], X[:, 0]) - mi(X[:, 2], X[:, 3])
        ]))
    self.assertAlmostEqual(mifs_value, criterion_value)
    self.assertAlmostEqual(
        (mifs_value + m) / normalized_costs[feature_index]**r, filter_value)
def test_one_number_input(self):
    """Single-observation vectors carry zero mutual information."""
    left, right = [1], [0]
    self.assertEqual(mutual_information(left, right), 0.0)
def test_empty_input(self):
    """Empty vectors must be rejected with an AssertionError."""
    with self.assertRaises(AssertionError):
        mutual_information([], [])
def test_commutative_property(self):
    """Mutual information is symmetric: I(X; Y) == I(Y; X)."""
    xs = [9, 8, 7, 6, 5, 4, 3, 2, 9]
    ys = [1, 1, 1, 1, 0, 0, 0, 0, 0]
    self.assertEqual(mutual_information(xs, ys), mutual_information(ys, xs))
def test_the_same_vectors(self):
    """Mutual information of a vector with itself equals its entropy."""
    values = [9, 8, 7, 6, 5, 4, 3, 2, 9]
    duplicate = list(values)
    self.assertEqual(mutual_information(values, duplicate), entropy(values))
def test_nparray_input(self):
    """Numpy-array inputs are accepted and produce a float result."""
    xs = np.array([1, 2, 3, 5, 432, 42, 31234, 342, 34])
    ys = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0])
    self.assertIsInstance(mutual_information(xs, ys), float)
def test_list_input(self):
    """Plain-list inputs are accepted and produce a float result."""
    xs = [1, 2, 3, 5, 432, 42, 31234, 342, 34]
    ys = [1, 1, 1, 1, 0, 0, 0, 0, 0]
    self.assertIsInstance(mutual_information(xs, ys), float)
def mifs(data, target_variable, prev_variables_index,
         candidate_variable_index, **kwargs):
    """
    This estimator computes the Mutual Information Feature Selection
    criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable vector. Can not be in data!
    prev_variables_index : list of ints, set of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of candidate variable in data matrix.
    beta : float, optional (keyword)
        Impact of redundancy segment in MIFS approximation. Higher the
        beta is, higher the impact. Defaults to 1 (with a warning).

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Mutual Information Feature
        Selection.
    """
    assert isinstance(data,
                      np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(
        target_variable,
        np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(
        candidate_variable_index,
        int), "Argument 'candidate_variable_index' must be an integer"
    assert len(
        data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(
        target_variable
    ), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[
        1], "Index 'candidate_variable_index' out of range in 'data'"
    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    # Fall back to beta = 1 when the caller did not supply one, warning so
    # the implicit default is visible to the user.
    if kwargs.get('beta') is None:
        beta = 1
        warnings.warn(
            "Parameter `beta` not provided, default value of 1 is selected.",
            Warning)
    else:
        beta = kwargs.pop('beta')
    assert isinstance(beta, int) or isinstance(
        beta, float), "Argument 'beta' must be int or float"

    candidate_variable = data[:, candidate_variable_index]

    # Redundancy: total mutual information between the candidate and every
    # previously selected variable.
    redundancy_sum = 0
    for var in prev_variables_index:
        redundancy_sum += mutual_information(data[:, var], candidate_variable)

    return mutual_information(candidate_variable,
                              target_variable) - beta * redundancy_sum