def _calculate_information_gain(self, y, y1, y2):
    """Return the information gain of splitting ``y`` into ``y1`` and ``y2``.

    The gain is ``H(y) - p * H(y1) - (1 - p) * H(y2)`` where ``H`` is
    ``calculate_entropy`` and ``p = len(y1) / len(y)``.

    Args:
        y: Labels before the split.
        y1: Labels that went to the first branch.
        y2: Labels that went to the second branch.

    Returns:
        The parent entropy minus the size-weighted entropies of both branches.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    first_weight = len(y1) / len(y)
    parent_entropy = calculate_entropy(y)
    first_term = first_weight * calculate_entropy(y1)
    second_term = (1 - first_weight) * calculate_entropy(y2)
    return parent_entropy - first_term - second_term
def _calculate_information_gain(self, y, y1, y2):
    """Compute the information gain achieved by a binary split of ``y``.

    Args:
        y: Labels of all samples reaching the node.
        y1: Labels of the samples in the first partition.
        y2: Labels of the samples in the second partition.

    Returns:
        ``calculate_entropy(y)`` reduced by the entropy of each partition,
        where ``y1`` is weighted by ``len(y1) / len(y)`` and ``y2`` by the
        complement of that fraction.

    Raises:
        ZeroDivisionError: If ``y`` has length zero.
    """
    fraction_first = len(y1) / len(y)
    fraction_second = 1 - fraction_first
    return (
        calculate_entropy(y)
        - fraction_first * calculate_entropy(y1)
        - fraction_second * calculate_entropy(y2)
    )
def _calculate_information_gain(self, y, y1, y2):
    """Information gain of partitioning ``y`` into ``y1`` and ``y2``.

    Starts from the entropy of the unsplit labels and subtracts the entropy
    of each partition, weighting ``y1`` by its share ``len(y1) / len(y)`` and
    ``y2`` by one minus that share.

    Args:
        y: Labels before splitting.
        y1: Labels in the first partition.
        y2: Labels in the second partition.

    Returns:
        The resulting information gain.

    Raises:
        ZeroDivisionError: If ``len(y)`` is zero.
    """
    share = len(y1) / len(y)

    gain = calculate_entropy(y)
    # Plain rebinding (not ``-=``) so the entropy helper's return value is
    # never modified in place.
    gain = gain - share * calculate_entropy(y1)
    gain = gain - (1 - share) * calculate_entropy(y2)
    return gain
def get_diversity_metrics(self, checkpoint, x_test, y_test,
                          num_samples=10, num_iterations=3):
    """Print entropy and n-gram diversity of repeated decodes of ``x_test``.

    Every test row is repeated ``num_samples`` times and decoded through
    ``self.inference_logits``. The ``num_samples`` decoded sentences of each
    row are tokenised and pooled into one corpus, which is scored with
    ``utils.calculate_entropy`` and ``utils.calculate_ngram_diversity``.
    Scores are averaged over ``len(x_test)`` per iteration, and the mean over
    all iterations is printed.

    Args:
        checkpoint: Checkpoint path handed to ``tf.train.Saver().restore``.
        x_test: Encoder inputs; repeated along axis 0.
        y_test: Decoder targets; repeated along axis 0 and only used to build
            the batches.
        num_samples: Number of decodes pooled per test row.
        num_iterations: Number of full passes whose scores are averaged.

    Returns:
        None. The result is written to stdout.
    """
    repeated_inputs = np.repeat(x_test, num_samples, axis=0)
    repeated_targets = np.repeat(y_test, num_samples, axis=0)

    entropy_runs = []
    distinct1_runs = []
    distinct2_runs = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf.train.Saver().restore(sess, checkpoint)

        for _ in tqdm(range(num_iterations)):
            # Decode the whole repeated test set, batch by batch.
            decoded = []
            batches = utils.get_batches_xy(
                repeated_inputs, repeated_targets, self.batch_size)
            for input_batch, _targets, source_lengths, _target_lengths in batches:
                feed = {
                    self.input_data: input_batch,
                    self.source_sentence_length: source_lengths,
                    self.keep_prob: 1.0,
                    self.word_dropout_keep_prob: 1.0,
                    self.z_temperature: self.z_temp,
                }
                decoded.extend(sess.run(self.inference_logits, feed_dict=feed))

            entropy_sum = 0
            distinct1_sum = 0
            distinct2_sum = 0
            group = []
            # zip() stops at the shorter sequence, so rows without a decoded
            # output are skipped.
            pairs = zip(repeated_inputs, decoded)
            for position, (_source, token_ids) in enumerate(pairs, start=1):
                words = [
                    self.decoder_idx_word[token]
                    for token in token_ids
                    if token not in [self.pad, self.eos]
                ]
                group.append(" ".join(words))

                # A group is complete once num_samples sentences are collected.
                if position % num_samples != 0:
                    continue

                corpus = [
                    token
                    for sentence in group
                    for token in word_tokenize(sentence)
                ]
                entropy_sum += utils.calculate_entropy(corpus)
                diversity = utils.calculate_ngram_diversity(corpus)
                distinct1_sum += diversity[0]
                distinct2_sum += diversity[1]
                group = []

            row_count = len(x_test)
            entropy_runs.append(entropy_sum / row_count)
            distinct1_runs.append(distinct1_sum / row_count)
            distinct2_runs.append(distinct2_sum / row_count)

    print('Entropy = {:>.3f} | Distinct-1 = {:>.3f} | Distinct-2 = {:>.3f}'.format(
        np.mean(entropy_runs),
        np.mean(distinct1_runs),
        np.mean(distinct2_runs)))
def __init__(self, features, labels, num_cls):
    """Create a tree node holding ``features``/``labels`` and summarise it.

    Args:
        features: Feature rows, one per sample (``List[List[any]]``).
        labels: Class label of each row (``List[int]``).
        num_cls: Number of classes; stored unchanged on the node.

    Sets:
        cls_max: The most frequent label. On a tie, the first one in
            ``np.unique``'s sorted order wins. ``None`` when ``labels`` is
            empty.
        entropy: ``Util.calculate_entropy`` of the per-label counts.
        splittable: ``False`` when fewer than two distinct labels are
            present or the rows have no feature columns.
    """
    self.features = np.array(features)
    self.labels = labels
    self.children = []
    self.num_cls = num_cls

    # Count every distinct label in a single pass; ``tolist()`` keeps the
    # counts as plain Python ints for the entropy helper.
    uniq_labels, label_counts = np.unique(labels, return_counts=True)
    branch = label_counts.tolist()

    # Find the most common label in the current node. Defaulting to None
    # keeps the attribute defined even when there are no labels at all.
    self.cls_max = None
    count_max = 0
    for label, count in zip(uniq_labels, branch):
        if count > count_max:
            count_max = count
            self.cls_max = label

    self.entropy = Util.calculate_entropy(branch)

    # A node cannot be split when it is already pure (or empty) or when
    # there are no feature columns left to split on.
    if len(uniq_labels) < 2 or len(self.features[0]) == 0:
        self.splittable = False
    else:
        self.splittable = True

    self.dim_split = None  # the index of the feature to be split
    self.feature_uniq_split = None  # the possible unique values of the feature to be split
def create_node(arr):
    """Build a tree node for ``arr``, splitting on the highest-gain attribute.

    Samples are counted per (attribute, class, level) for levels 1 through 10.
    If ``arr`` contains no benign sample (including the empty case) a leaf
    with result ``InputData.MALIGNANT`` is returned. If it contains no
    non-benign sample a leaf with result ``InputData.BENIGN`` is returned.
    Otherwise the attribute with the largest information gain is chosen and
    one child per level 1..10 is attached. A child that receives no samples
    gets ``result = -1``.

    Args:
        arr: Samples exposing ``type`` and ``get_value(attribute)``.

    Returns:
        The constructed ``Node``.
    """
    def make_key(attribute, label, level):
        # Counter key: attribute value, then class label, then level.
        return attribute.value + str(label) + str(level)

    def make_leaf(label):
        leaf = Node(None)
        leaf.result = label
        return leaf

    # Zero every counter up front so the counting loop can just increment.
    count_map = {}
    for level in range(1, 11):
        for attribute in list(InputType):
            count_map[make_key(attribute, InputData.BENIGN, level)] = 0
            count_map[make_key(attribute, InputData.MALIGNANT, level)] = 0

    benign_total = 0
    other_total = 0
    for sample in arr:
        if sample.type == InputData.BENIGN:
            benign_total += 1
        else:
            other_total += 1
        for attribute in list(InputType):
            key = make_key(attribute, sample.type, sample.get_value(attribute))
            count_map[key] += 1

    # No benign sample at all (this also covers an empty ``arr``).
    if benign_total == 0:
        return make_leaf(InputData.MALIGNANT)
    # Only benign samples.
    if other_total == 0:
        return make_leaf(InputData.BENIGN)

    total = len(arr)
    base_entropy = calculate_entropy(benign_total, other_total)

    # Every attribute starts at the node entropy and loses the contribution
    # of each of its levels.
    gains = {}
    for attribute in list(InputType):
        gains[attribute] = base_entropy
    for level in range(1, 11):
        for attribute in list(InputType):
            level_entropy = calculate_total_entropy(
                count_map[make_key(attribute, InputData.BENIGN, level)],
                count_map[make_key(attribute, InputData.MALIGNANT, level)],
                total)
            gains[attribute] = gains[attribute] - level_entropy

    # Strict ">" keeps the first attribute on ties.
    best_gain = -sys.maxsize - 1
    best_attribute = None
    for attribute, gain in gains.items():
        if gain > best_gain:
            best_gain = gain
            best_attribute = attribute

    node = Node(best_attribute)
    node.arr = arr
    for level in range(1, 11):
        child = Node(best_attribute, level)
        for sample in arr:
            if level == sample.get_value(best_attribute):
                child.add_item(sample)
        if child.length == 0:
            # No sample has this level of the chosen attribute.
            child.result = -1
        node.add_child(child)
    return node
def create_node(arr):
    """Build a tree node for ``arr``, splitting on the highest-gain attribute.

    Samples are counted per (attribute value name, result). If every sample
    is positive (``result == 1``) a leaf with result ``InputData.POSITIVE``
    is returned; if none is, a leaf with result ``InputData.NEGATIVE``. An
    empty ``arr`` matches neither case and falls through to the entropy
    calculation. Otherwise the attribute among Outlook, Temperature, Humidity
    and Wind with the largest information gain is chosen. One child per
    listed value of that attribute is attached, and each sample is handed to
    the child whose value it carries.

    Args:
        arr: Samples exposing ``result``, ``outlook``, ``temperature``,
            ``humidity`` and ``wind``.

    Returns:
        The constructed ``Node``.
    """
    # (enum class, name of the sample attribute that stores its value), in
    # the order used for tie-breaking.
    features = (
        (Outlook, "outlook"),
        (Temperature, "temperature"),
        (Humidity, "humidity"),
        (Wind, "wind"),
    )

    # Zero a counter for every (value name, class) pair.
    count_map = {}
    for enum_cls, _attr_name in features:
        for member in list(enum_cls):
            count_map[str(member.name) + InputData.NEGATIVE] = 0
            count_map[str(member.name) + InputData.POSITIVE] = 0

    positive_count = 0
    negative_count = 0
    for item in arr:
        if item.result == 1:
            positive_count += 1
        else:
            negative_count += 1
        for enum_cls, attr_name in features:
            for member in list(enum_cls):
                if getattr(item, attr_name) == member:
                    key = str(member.name) + str(item.result)
                    count_map[key] += 1

    def make_leaf(label):
        leaf = Node(None)
        leaf.result = label
        return leaf

    if negative_count == 0 and positive_count > 0:
        return make_leaf(InputData.POSITIVE)
    if positive_count == 0 and negative_count > 0:
        return make_leaf(InputData.NEGATIVE)

    total = len(arr)
    entropy_total = calculate_entropy(positive_count, negative_count)

    # Compute every gain first, then pick the best in a second pass.
    gains = []
    for enum_cls, _attr_name in features:
        gain = entropy_total
        for member in list(enum_cls):
            gain -= calculate_total_entropy(
                count_map[str(member.name) + InputData.POSITIVE],
                count_map[str(member.name) + InputData.NEGATIVE],
                total)
        gains.append((enum_cls, gain))

    # Strict ">" keeps the earliest attribute on ties.
    best_gain = -sys.maxsize - 1
    best_feature = None
    for enum_cls, gain in gains:
        if gain > best_gain:
            best_gain = gain
            best_feature = enum_cls

    node = Node(best_feature)
    node.arr = arr

    def branch_on(enum_cls, attr_name, members):
        # Create and attach one child per member, then route each sample to
        # the child whose member it carries.
        children = [Node(enum_cls, member) for member in members]
        for child in children:
            node.add_child(child)
        for item in arr:
            for member, child in zip(members, children):
                if getattr(item, attr_name) == member:
                    child.add_item(item)

    if best_feature == Outlook:
        branch_on(Outlook, "outlook",
                  (Outlook.SUNNY, Outlook.OVERCAST, Outlook.RAIN))
    if best_feature == Temperature:
        branch_on(Temperature, "temperature",
                  (Temperature.HOT, Temperature.MILD, Temperature.COOL))
    if best_feature == Humidity:
        branch_on(Humidity, "humidity",
                  (Humidity.HIGH, Humidity.NORMAL))
    if best_feature == Wind:
        branch_on(Wind, "wind",
                  (Wind.WEAK, Wind.STRONG))
    return node