def compute(self):
    """Build an entropy summary for every frequent item-set.

    For each stored item-set that is not rejected by
    ``self.itemset_formatter`` (e.g. ones already containing a class
    item), compute the class-conditional probabilities and their
    entropy. Item-sets with no support in any class are skipped.

    Returns:
        list of (itemset_string, detail_dict) tuples sorted by ascending
        entropy; detail_dict maps each class name to its probability and
        also carries 'entropy' and 'freq' entries.
    """
    summary = []
    class_names = self.class_dict.keys()
    for itemset_key, freq in self.freq_itemset_dict.itemsets.items():
        itemset = string_2_itemset(itemset_key)
        # Formatter veto: skip item-sets it flags (returns True).
        if self.itemset_formatter(itemset) == True:
            continue
        detail = {}
        entropy = 0
        has_support = False
        for class_name in class_names:
            p = self.lookup_frequency(itemset, class_name) / freq
            detail[class_name] = p
            if p != 0:
                has_support = True
                entropy += -p * math.log2(p)
        # Record only item-sets supported by at least one class.
        if has_support:
            detail['entropy'] = entropy
            detail['freq'] = freq
            summary.append((itemset_2_string(itemset), detail))
    return sorted(summary, key=lambda entry: entry[1]['entropy'])
def findInclusiveItemPairs(self, minsup, exclusive_item_filter):
    """Find pairs of frequent items where one item (almost) always
    co-occurs with the other.

    Runs Apriori to obtain the frequent 1-item-sets and 2-item-sets,
    then marks a pair as "inclusive" when the pair's frequency covers at
    least 99.9% of either single item's frequency.

    Parameters:
        minsup: minimum support threshold handed to the Apriori model.
        exclusive_item_filter: callable; items for which it returns a
            truthy value are excluded from pairing.

    Returns:
        dict mapping the merged pair key (string) to True.
    """
    inclusive_items_dict = {}
    apriori_model = Apriori()

    # Frequent single items.
    L1 = apriori_model.generate_L1(self.data_set, minsup)
    freq_one_item_itemset_dict = L1.get_itemset_dictionary()
    freq_one_item_itemset_dict.ntransactions = self.data_set.size()

    # Frequent item pairs.
    L2 = HashTable()
    apriori_model.generate_Lk(minsup, L1, L2, k=2)
    freq_two_item_itemset_dict = L2.get_itemset_dictionary()
    freq_two_item_itemset_dict.ntransactions = self.data_set.size()

    nitems = len(freq_one_item_itemset_dict.itemsets)
    all_items = list(freq_one_item_itemset_dict.itemsets.keys())
    for i in range(nitems - 1):
        first_item = all_items[i]
        if exclusive_item_filter(first_item):
            continue
        nfirst = freq_one_item_itemset_dict.get_frequency(first_item[0])
        if nfirst == 0:
            # Guard: zero support would make nboth / nfirst divide by
            # zero below; no pair involving this item can qualify.
            continue
        for j in range(i + 1, nitems):
            second_item = all_items[j]
            if exclusive_item_filter(second_item):
                continue
            merge_key = itemset_2_string(
                merge_itemsets(first_item, second_item))
            nboth = freq_two_item_itemset_dict.get_frequency(merge_key)
            if nboth == 0:
                continue
            # BUG FIX: the dictionary object was being *called*
            # (freq_one_item_itemset_dict(second_item[0])) instead of
            # queried with get_frequency(), mirroring nfirst above --
            # the original raised TypeError at runtime.
            nsecond = freq_one_item_itemset_dict.get_frequency(
                second_item[0])
            if nsecond == 0:
                # Guard against division by zero, as for nfirst.
                continue
            if nboth / nfirst >= 0.999 or nboth / nsecond >= 0.999:
                inclusive_items_dict[merge_key] = True
    return inclusive_items_dict
def _complement_condition(self, r1, r2):
    """Return the larger overlap ratio between the LHS item-sets of the
    two rules.

    s is the frequency of the merged left-hand sides; sl and sr are the
    frequencies of each rule's own left-hand side.

    NOTE(review): raises ZeroDivisionError when either LHS has zero
    frequency -- presumably callers only pass rules with support > 0;
    confirm at the call sites.
    """
    combined_lhs = merge_itemsets(r1.left_items, r2.left_items)
    s = self.get_frequency(itemset_2_string(combined_lhs))
    sl = self.get_frequency(r1.lhs_string())
    sr = self.get_frequency(r2.lhs_string())
    return max(s / sl, s / sr)
def enumerate_subsets(self, bit_mask, item_set, position, rule_collection, both_frequency):
    """Recursively enumerate LHS/RHS splits of item_set into rules.

    bit_mask[i] == True places item_set[i] on the rule's left-hand
    side. Branches whose LHS support gives a confidence below
    self.min_conf are pruned (any superset split would be no better).
    Rules accepted by self.rule_formatter (when set) are added to
    rule_collection. both_frequency is the frequency of the full
    item-set. The mask is restored (all-True at this position) before
    returning to the caller.
    """
    n = len(item_set)
    if position >= n:
        # Leaf: materialize the rule encoded by the mask.
        lhs = [item_set[i] for i in range(n) if bit_mask[i] == True]
        rhs = [item_set[i] for i in range(n) if bit_mask[i] != True]
        if len(lhs) > 0 and len(rhs) > 0:
            rule = AssociationRule(lhs, rhs)
            if self.rule_formatter == None \
                    or self.rule_formatter(rule) == True:
                rule_collection.add(rule)
        return

    # Branch 1: keep the current item on the LHS.
    bit_mask[position] = True
    self.enumerate_subsets(bit_mask, item_set, position + 1,
                           rule_collection, both_frequency)

    # Branch 2: move the current item to the RHS, pruning when the
    # resulting confidence cannot reach the threshold.
    bit_mask[position] = False
    lhs_itemset = [item_set[i] for i in range(n) if bit_mask[i] == True]
    lhs_frequency = self.freq_itemset_dict.get_frequency(
        itemset_2_string(lhs_itemset))
    confidence = both_frequency / lhs_frequency if lhs_frequency > 0 else 0
    if confidence >= self.min_conf:
        self.enumerate_subsets(bit_mask, item_set, position + 1,
                               rule_collection, both_frequency)
    # Restore the mask for the caller.
    bit_mask[position] = True
def subsets(self, bits, item_set, k, rule_collection, total_freq):
    # Recursively enumerate every LHS/RHS split of item_set; bits[i] ==
    # True places item_set[i] on the rule's left-hand side. total_freq
    # is the frequency of the full item-set, used for confidence-based
    # pruning against self.min_conf. The shared `bits` mask is mutated
    # in place and restored to True at position k before returning.
    '''
    Run out of items --> create rule and check format criterion
    '''
    if k >= len(item_set):
        left = []
        right = []
        for index in range(len(bits)):
            if bits[index] == True:
                left.append(item_set[index])
            else:
                right.append(item_set[index])
        # A valid rule needs a non-empty LHS and a non-empty RHS.
        if (len(left) > 0 and len(right) > 0):
            rule = AssociationRule(left, right)
            # rule_formatter (if set) may veto the rule.
            if (self.rule_formatter == None
                    or self.rule_formatter(rule) == True):
                rule_collection.add(rule)
        return
    value_domain = [True, False]
    '''
    Include k-th item into LHS
    '''
    for value in value_domain:
        bits[k] = value
        if (value == False):
            # Moving item k to the RHS: recompute the LHS and its
            # support to decide whether this branch can still reach
            # the confidence threshold.
            left_itemset = []
            for index in range(len(bits)):
                if bits[index] == True:
                    left_itemset.append(item_set[index])
            left_value = self.freq_itemset_dict.get_frequency(
                itemset_2_string(left_itemset))
            confident = 0
            if left_value > 0:
                confident = total_freq / left_value
            # Prune this branch (and implicitly all deeper splits of
            # it) when confidence falls below the minimum.
            if confident < self.min_conf:
                bits[k] = True
                continue
            self.subsets(bits, item_set, k + 1, rule_collection,
                         total_freq)
        else:
            self.subsets(bits, item_set, k + 1, rule_collection,
                         total_freq)
    # Leave the mask exactly as the caller saw it.
    bits[k] = True
def generate_rules(self, freq_itemsets_collection, output_file_name):
    """Generate association rules for every eligible frequent item-set
    and stream them to output_file_name.

    The rule buffer is flushed (redundancy removed, appended to file,
    cleared) every 200 processed item-sets and once more at the end;
    a summary of redundant-rule counts is printed.
    """
    total_rules = 0
    remaining_rules = 0
    k = 0
    rule_collection = RulesCollection()

    def flush():
        # Deduplicate the buffered rules, append them to the output
        # file, and reset the buffer, updating both counters.
        nonlocal total_rules, remaining_rules
        total_rules += rule_collection.size()
        rule_collection.remove_redundancy(self.freq_itemset_dict)
        rule_collection.save(output_file_name, True)
        remaining_rules += rule_collection.size()
        rule_collection.clear()

    # Truncate any previous output file.
    with open(output_file_name, 'w') as _:
        print('clear old file...')

    for itemset in freq_itemsets_collection:
        # Single items cannot form a rule; the formatter may veto others.
        if len(itemset) == 1:
            continue
        if self.itemset_formatter is not None and \
                self.itemset_formatter(itemset) == False:
            continue

        k += 1
        if k % 200 == 0:
            print('writing some rule_collection to file: ' + str(k))
            flush()

        # Enumerate every LHS/RHS split of this item-set.
        total_freq = self.freq_itemset_dict.get_frequency(
            itemset_2_string(itemset))
        bits = [True] * len(itemset)
        self.subsets(bits, itemset, 0, rule_collection, total_freq)

    print('writing last rule_collection to file: ' + str(k))
    flush()
    print('Finish for sub frequent item-sets!!!')
    print('Number of redundant rules '
          + str(total_rules - remaining_rules) + '/' + str(total_rules))
def generate_network(self, special_summary, class_name):
    # Build a co-occurrence network from statistically significant
    # item-sets: for every summary entry with p-value <= 0.05, emit all
    # item pairs inside the item-set, mapped to the pair's frequency
    # within class_name.
    # Parameters:
    #   special_summary: iterable of (itemset_string, detail_dict)
    #     entries; detail_dict must contain a 'p-value' key.
    #   class_name: class item used when looking up pair frequency.
    # Returns: dict {pair_key_string: frequency}.
    item_pairs_and_frequency = {}
    for summary_detail in special_summary:
        # Keep only significant entries (p <= 0.05).
        if (summary_detail[1]['p-value']) <= 0.05:
            item_set = string_2_itemset(summary_detail[0])
            for i in range(len(item_set) - 1):
                for j in range(i + 1, len(item_set)):
                    combination = [item_set[i], item_set[j]]
                    combination_key = itemset_2_string(combination)
                    # First occurrence wins; skip pairs already seen.
                    if combination_key in item_pairs_and_frequency:
                        continue
                    item_pairs_and_frequency[
                        combination_key] = self.lookup_frequency(
                            combination, class_name)
    return item_pairs_and_frequency
# NOTE(review): the triple-quote below opens a string/comment block that
# appears to continue past this chunk -- left untouched deliberately.
'''
def generate_rules(self, freq_itemsets_collection, output_file_name):
    """Generate association rules for every eligible frequent item-set
    and stream them to output_file_name.

    The rule buffer is flushed (redundancy removed, appended to file,
    cleared) every 200 processed item-sets and once more at the end.
    """
    total_rules = 0
    remaining_rules = 0
    k = 0
    rule_collection = RulesCollection()
    # FIX: truncate the old output with a context manager instead of a
    # bare open()/close() pair, so the handle is always released and the
    # style matches the sibling generate_rules implementation.
    with open(output_file_name, 'w'):
        pass
    for itemset in freq_itemsets_collection:
        # Single items cannot form a rule; the formatter may veto others.
        if len(itemset) == 1:
            continue
        if self.itemset_formatter is not None and self.itemset_formatter(
                itemset) == False:
            continue
        # Periodic flush of the buffered rules to the output file.
        k += 1
        if k % 200 == 0:
            total_rules += rule_collection.size()
            rule_collection.remove_redundancy(self.freq_itemset_dict)
            rule_collection.save(output_file_name, True)
            remaining_rules += rule_collection.size()
            rule_collection.clear()
        # Enumerate every LHS/RHS split of this item-set.
        both_frequency = self.freq_itemset_dict.get_frequency(
            itemset_2_string(itemset))
        bit_mask = [True] * len(itemset)
        self.enumerate_subsets(bit_mask, itemset, 0, rule_collection,
                               both_frequency)
    # Final flush for whatever is left in the collection.
    total_rules += rule_collection.size()
    rule_collection.remove_redundancy(self.freq_itemset_dict)
    rule_collection.save(output_file_name, True)
    remaining_rules += rule_collection.size()
    rule_collection.clear()
def itemset_string(self):
    """Serialize this object's item-set into its string key form."""
    return itemset_2_string(self.get_itemset())
def rhs_string(self):
    """Return the rule's right-hand side as an itemset key string."""
    rhs_items = self.right_items
    return itemset_2_string(rhs_items)
def lhs_string(self):
    """Return the rule's left-hand side as an itemset key string."""
    lhs_items = self.left_items
    return itemset_2_string(lhs_items)
def lookup_frequency(self, item_set, class_name):
    """Return the frequency of item_set extended with class_name.

    The class item is merged into the item-set and the combined key is
    looked up in the frequent-itemset dictionary; unknown keys yield 0.
    """
    key = itemset_2_string(merge_itemsets(item_set, [class_name]))
    if not self.freq_itemset_dict.exists(key):
        return 0
    return self.freq_itemset_dict.get_frequency(key)