import heapq
import math
import time

import numpy as np
import pandas as pd


def conditional_entropy(universe, feature_b, feature_a, display=False):
    """
    Calculate the conditional entropy H(B|A) (or H(B|A1, A2, ...)).

    :param universe: the universe of objects (feature vectors/samples/instances)
    :param feature_b: list, indices of the target features
    :param feature_a: list, indices of the conditioning features
    :param display: if True, print each conditional probability
    :return: float, the conditional entropy
    """
    partitions_a = partition(universe, feature_a)
    partitions_b = partition(universe, feature_b)
    total = 0
    for a in partitions_a:
        inner_total = 0
        length = len(a)  # |A_i|, the size of the conditioning block
        for b in partitions_b:
            a_b = [i for i in a if i in b]
            probability = len(a_b) / length  # P(B_j|A_i) = |A_i ∩ B_j| / |A_i|
            if display:
                print(probability)
            if probability > 0:
                inner_total += probability * math.log2(probability)
        # weight the inner sum by P(A_i) = |A_i| / |U|
        total += inner_total * length / universe.shape[0]
    return -1 * total
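# The entropy measures in this module all rely on a partition helper that is
# defined elsewhere in the repository. The sketch below is *not* the
# repository's implementation, only a minimal version consistent with how it
# is called here: group object indices into the indiscernibility classes of
# objects that agree on the given features.
def partition_sketch(universe, features):
    """Group object indices by their values on `features` (equivalence classes)."""
    blocks = {}
    for index, row in enumerate(universe):
        key = tuple(row[f] for f in features)  # equal keys -> indiscernible objects
        blocks.setdefault(key, []).append(index)
    return list(blocks.values())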
def proximity_of_objects_in_boundary_region_to_mean_positive_region_based_distance(
        universe, features_1, features_2, distance):
    """
    Distance-based proximity of the objects in the boundary region to the
    mean of the positive region. The degenerate cases (empty boundary region,
    empty positive region) are handled explicitly below.

    :param universe: the universe of objects (feature vectors/samples/instances)
    :param features_1: list, a set of features' serial numbers
    :param features_2: list, a set of features' serial numbers
    :param distance: the function used to measure the distance between objects
    :return: float
    """
    partition_2 = partition(universe, features_2)
    boundary = []
    positive = []
    for subset in partition_2:
        boundary.extend(boundary_region_of_sample_subset(universe, subset, features_1))
        positive.extend(positive_region_of_sample_subset(universe, subset, features_1))
    boundary = list(set(boundary))
    if len(boundary) == 0:
        return 1
    if len(positive) == 0:
        return 1 / len(boundary)
    mean = NoiseResistantDependencyMeasure.mean_positive_region(universe, positive, features_1)
    proximity_of_object_in_boundary_from_mean = 0
    for y in boundary:
        proximity_of_object_in_boundary_from_mean += distance(mean, universe[y], features_1)
    return 1 / proximity_of_object_in_boundary_from_mean
def joint_entropy(universe, feature_a, feature_b):
    """
    Calculate the joint entropy of a and b, H(A, B).

    :param universe: the universe of objects (feature vectors/samples/instances)
    :param feature_a: list, features' indices
    :param feature_b: list, features' indices
    :return: float, the joint entropy
    """
    partitions_a = partition(universe, feature_a)
    partitions_b = partition(universe, feature_b)
    total = 0
    for a in partitions_a:
        for b in partitions_b:
            a_b = [i for i in a if i in b]
            probability = len(a_b) / universe.shape[0]  # P(A_i ∩ B_j)
            if probability > 0:
                total += probability * math.log2(probability)
    return -1 * total
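# A hedged consistency check, not part of the original module: for any two
# feature subsets the chain rule H(A, B) = H(A) + H(B|A) holds exactly for
# these partition-based probabilities, so the three entropy functions can be
# cross-validated on a toy table. The 2-column array is invented for the demo.
def _chain_rule_check():
    toy = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [0, 0], [1, 1]])
    lhs = joint_entropy(toy, [0], [1])
    rhs = entropy(toy, [0]) + conditional_entropy(toy, [1], [0])
    assert abs(lhs - rhs) < 1e-9, (lhs, rhs)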
def classical_rough_set_partition_all():
    """Benchmark the two partition implementations on the mushroom data set."""
    data = pd.read_csv("mushroom.csv", header=None)
    data = np.array(data)
    print(data.shape)
    attributes = [i for i in range(data.shape[1])]

    print("partition_by_equal_array")
    start_time = time.time()
    partition_by_equal_array(data, attributes)
    print('The time used: {} seconds'.format(time.time() - start_time))

    print("partition")
    start_time = time.time()
    partition(data, attributes)
    print('The time used: {} seconds'.format(time.time() - start_time))
    print()
def calculate_positive_region(self, attributes):
    """
    Margin-based positive region:
    NT: the nearest instance with the same label as x (the nearest hit)
    NS: the nearest instance whose label differs from x (the nearest miss)
    Steps 4-9 of the referenced algorithm compute the margin of each
    instance x; when the margin cannot be computed, steps 11-18 apply.

    :param attributes: list, features' indices
    :return: list, one positive region per label
    """
    positive_region = []
    if self.distance == euclidean_distance:
        distance_matrix = generate_euclidean_distance_matrix_by_vector(
            self.universe, attributes)
    elif self.distance == standardized_euclidean_distance:
        distance_matrix = generate_euclidean_distance_matrix_by_vector(
            self.universe, attributes, standard=True)
    else:
        distance_matrix = generate_distance_matrix(self.universe, attributes, self.distance)
    for label in self.labels:  # handle each label separately
        label_positive_region = []
        elementary_sets = partition(self.universe, [label])  # partition by this label
        for elementary_set in elementary_sets:
            exclude = [j for j in range(self.universe.shape[0])
                       if j not in elementary_set]
            for x in elementary_set:
                if len(elementary_set) == 1:
                    # x is the only instance of its class: no nearest hit exists
                    margin = 0
                elif len(exclude) == 1:
                    # degenerate case: a single instance outside this class
                    margin = 1
                else:
                    # nearest miss: the smallest distance to a different-class instance
                    nearest_miss = heapq.nsmallest(1, distance_matrix[x][exclude])[0]
                    # nearest hit: nsmallest(2, ...)[1] skips the zero distance
                    # from x to itself inside its own class
                    nearest_hit = heapq.nsmallest(2, distance_matrix[x][elementary_set])[1]
                    margin = nearest_miss - nearest_hit
                if margin > 0:
                    label_positive_region.append(x)
        positive_region.append(label_positive_region)
    return positive_region
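# Hedged, self-contained illustration (the points are invented): the margin
# rule above keeps x in the positive region when its nearest miss is farther
# away than its nearest hit.
def _margin_example():
    x, nearest_hit, nearest_miss = 0.0, 0.1, 1.0   # 1-D positions
    margin = abs(nearest_miss - x) - abs(nearest_hit - x)  # 1.0 - 0.1 = 0.9
    return margin > 0  # True: x would be added to the positive region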
def entropy(universe, feature):
    """
    Calculate the entropy of a feature subset, H(A).

    :param universe: the universe of objects (feature vectors/samples/instances)
    :param feature: list, features' indices
    :return: float, the entropy
    """
    partitions = partition(universe, feature)
    total = 0
    for yi in partitions:
        probability = len(yi) / universe.shape[0]  # blocks are non-empty, so P > 0
        total += probability * math.log2(probability)
    return -1 * total
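# Hedged sketch, not part of the original module: mutual information can be
# derived from the entropy functions above via I(A;B) = H(A) + H(B) - H(A,B),
# which gives a quick sanity check in feature-relevance experiments.
def mutual_information_sketch(universe, feature_a, feature_b):
    return (entropy(universe, feature_a) + entropy(universe, feature_b)
            - joint_entropy(universe, feature_a, feature_b))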
def noisy_dependency_of_feature_subset_d_on_feature_subset_c(universe, feature_subset_c,
                                                             feature_subset_d):
    """
    :param universe: the universe of objects (feature vectors/samples/instances)
    :param feature_subset_c: list, a set of features' serial numbers
    :param feature_subset_d: list, a set of features' serial numbers
    :return: float, the noisy dependency of feature subset d on feature subset c
    """
    partition_d = partition(universe, feature_subset_d)
    total_dependency = 0
    for p in partition_d:
        the_dependency = NoiseResistantDependencyMeasure. \
            proximity_of_boundary_region_to_positive_region_based_portion(
                universe, p, feature_subset_c)
        total_dependency += the_dependency
    return total_dependency
def lower_approximations_of_universe_neighborhood(universe, attributes, labels, delta):
    """
    Get the lower approximation of U/labels under the delta-neighborhood
    relation induced by the given attributes.

    :param universe: the universe of objects (feature vectors/samples/instances)
    :param attributes: list, features' indices
    :param labels: list, labels' indices
    :param delta: float, the neighborhood radius
    :return: list, the lower approximation as a sorted list of objects' indices
    """
    lower_approximations = []
    partition_1 = generate_delta_neighborhood(universe, attributes, delta)
    partition_2 = partition(universe, labels)
    for x in partition_1:
        if set_is_include(x, partition_2):
            # x[0] is the centre of the neighborhood; it belongs to the lower
            # approximation when its whole neighborhood lies inside one decision class
            lower_approximations.append(x[0])
    lower_approximations.sort()
    return lower_approximations
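# Hedged usage sketch (the column layout is an assumption): with conditional
# attributes in columns 0..n-2 and the decision in the last column, the lower
# approximation typically shrinks as the radius delta grows, since larger
# neighborhoods are less likely to be pure.
def _neighborhood_lower_approximation_demo(data):
    conditional = list(range(data.shape[1] - 1))
    decision = [data.shape[1] - 1]
    for delta in (0.1, 0.2, 0.4):
        lower = lower_approximations_of_universe_neighborhood(
            data, conditional, decision, delta)
        print(delta, len(lower))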
def dep_density(self, attributes):
    if len(attributes) == 0:
        return 0
    card_s = 0
    density_neighborhoods = \
        generate_density_neighborhood(self.universe, attributes, distance=self.distance)
    partitions = partition(self.universe, self.decision_features)
    for density_neighborhood in density_neighborhoods:
        for single_partition in partitions:
            if density_neighborhood[0] in single_partition:
                # count the neighbors (excluding the centre itself) that share
                # the centre's decision class
                card_s += len([j for j in density_neighborhood
                               if j in single_partition]) - 1
    dep_s = card_s / self.universe.shape[0]
    return dep_s
def proximity_of_boundary_region_to_positive_region_based_portion(universe, sample_subset,
                                                                  feature_subset):
    """
    A noise-measuring function describing the information carried by the
    boundary region of the approximation of sample_subset under feature_subset.

    :param universe: the universe of objects (feature vectors/samples/instances)
    :param sample_subset: list, a set of objects' serial numbers
    :param feature_subset: list, a set of features' serial numbers
    :return: float, the proximity
    """
    partition_1 = partition(universe, feature_subset)
    total = 0
    for elementary_set in partition_1:
        related_information = NoiseResistantDependencyMeasure.related_information_of_subset_b(
            elementary_set, sample_subset)
        if related_information != 1:
            # a value of 1 presumably marks an elementary set lying fully inside
            # sample_subset (positive region), which is skipped here
            total += related_information
    return total / len(partition_1)
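# Hedged usage sketch: the toy decision table is invented and assumes the
# surrounding module's helpers (partition, the boundary/positive-region
# functions) are importable. Column 0 is a conditional feature, column 1 the
# decision; the noisy dependency of the decision on the feature is the sum of
# the per-class proximities computed above.
def _noisy_dependency_demo():
    toy = np.array([[0, 0], [0, 0], [1, 1], [1, 0]])
    return noisy_dependency_of_feature_subset_d_on_feature_subset_c(toy, [0], [1])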
def main():
    # entropy
    data = pd.read_csv("./../Resources/watermelon2 train.csv", header=None)
    result = entropy(np.array(data), [0])
    print(result)

    # part entropy
    partitions_ = partition(np.array(data), [0])
    print(partitions_)
    for part in partitions_:
        result = part_entropy(np.array(data), part, [6])
        print(result)

    # conditional entropy
    for feature in range(6):
        print(feature, "#")
        conditional_entropy(np.array(data), [feature], [6], True)
    conditional_entropy(np.array(data), [4], [6])

    # conditional mutual information (no example to check)
    # conditional_mutual_information(np.array(data), [], [], [])
    return