Example #1
    def compute(self):
        statistic_summary = []

        class_list = self.class_dict.keys()
        for itemset_key, freq in self.freq_itemset_dict.itemsets.items():
            # Compute entropy for each item-set that does not contain a class item.
            itemset = string_2_itemset(itemset_key)
            if self.itemset_formatter(itemset):
                continue

            entropy_value = 0
            statistic_detail = {}
            flag = False

            for class_name in class_list:
                p = self.lookup_frequency(itemset, class_name) / freq
                statistic_detail[class_name] = p
                if p != 0:
                    flag = True
                    entropy_value += -p * math.log2(p)

            # Only record the item-set when at least one class has a non-zero probability.
            if flag:
                statistic_detail['entropy'] = entropy_value
                statistic_detail['freq'] = freq

                statistic_summary.append(
                    (itemset_2_string(itemset), statistic_detail))

        return sorted(statistic_summary, key=lambda x: x[1]['entropy'])
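The value accumulated above is the standard Shannon entropy H = -sum(p * log2(p)) over the class distribution of each item-set. A minimal standalone sketch of the same calculation, with made-up probabilities:

import math

def shannon_entropy(probabilities):
    # H = -sum(p * log2(p)), skipping zero probabilities.
    return sum(-p * math.log2(p) for p in probabilities if p > 0)

print(shannon_entropy([0.5, 0.25, 0.25]))  # 1.5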
Example #2
    def generate_rules_for_class(self, general_summary, class_name):
        special_summary = []
        for summary_detail in general_summary:
            if summary_detail[1][class_name] > 0:
                special_summary.append(summary_detail)

                # Compute the p-value with Fisher's exact test on a 2x2
                # contingency table of rule satisfaction vs. class membership.
                item_set = string_2_itemset(summary_detail[0])
                satisfy_rule = self.freq_itemset_dict.get_frequency(
                    summary_detail[0])
                no_satisfy_rule = self.freq_itemset_dict.ntransactions - satisfy_rule

                correct_predict = self.lookup_frequency(item_set, class_name)
                incorrect_predict = satisfy_rule - correct_predict

                belong_to_class = self.freq_itemset_dict.get_frequency(
                    class_name)
                no_rule_belong_to_class = belong_to_class - correct_predict
                contingency_matrix = np.array(
                    [[correct_predict, incorrect_predict],
                     [no_rule_belong_to_class,
                      no_satisfy_rule - no_rule_belong_to_class]])

                _, p_value = stats.fisher_exact(contingency_matrix)
                summary_detail[1]['p-value'] = p_value

        return special_summary
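The p-value comes from scipy.stats.fisher_exact applied to a 2x2 contingency table. A self-contained sketch with made-up counts (rows: rule fires / does not fire; columns: transaction in class / not in class):

import numpy as np
from scipy import stats

table = np.array([[30, 10],
                  [20, 40]])
odds_ratio, p_value = stats.fisher_exact(table)
print(p_value)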
Example #3
    def split(self, nChunk):
        itemsets_names = self.itemsets.keys()
        nItemsets = len(itemsets_names)

        # Distribute the nItemsets frequent item-set keys over nChunk buckets,
        # each holding at most size_of_chunk entries.
        itemset_chunks = [[] for _ in range(nChunk)]
        size_of_chunk = nItemsets // nChunk + 1

        index = 0
        counter = 0

        for itemset_key in itemsets_names:
            if counter == size_of_chunk:
                index += 1
                counter = 0
            itemset_chunks[index].append(string_2_itemset(itemset_key))
            counter += 1

        return itemset_chunks
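The same even-chunking idea can be written with list slicing. A standalone sketch with illustrative names (unlike the method above, it returns fewer lists instead of leaving trailing buckets empty when the division is uneven):

def split_into_chunks(keys, n_chunks):
    # Slice the key list into pieces of at most `size` entries.
    keys = list(keys)
    size = len(keys) // n_chunks + 1
    return [keys[i:i + size] for i in range(0, len(keys), size)]

print(split_into_chunks(['a', 'b', 'c', 'd', 'e'], 2))
# [['a', 'b', 'c'], ['d', 'e']]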
Example #4
    def load_file(self, file_name):
        self.itemsets.clear()

        with open(file_name, "r") as text_file:
            # The first line holds the total number of transactions.
            self.ntransactions = int(text_file.readline())
            for line in text_file:
                # Each remaining line maps an itemset key to its frequency,
                # separated by a colon.
                subStrings = line.split(':')
                itemset_key = subStrings[0].strip()
                frequency = int(subStrings[1].strip())

                self.itemsets[itemset_key] = frequency
                m = len(string_2_itemset(itemset_key))
                if m > self.length_of_max_itemset:
                    self.length_of_max_itemset = m
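Given this parser, the input file would look roughly like the made-up sample below: a transaction count on the first line, then one 'itemset_key : frequency' pair per line (the comma-separated key format matches the keys built in Example #6):

100
bread : 35
bread,butter : 20
butter,milk : 12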
Example #5
    def generate_network(self, special_summary, class_name):
        item_pairs_and_frequency = {}
        for summary_detail in special_summary:
            # Only keep item-sets whose rule is statistically significant.
            if summary_detail[1]['p-value'] <= 0.05:
                item_set = string_2_itemset(summary_detail[0])
                # Record the class frequency of every unordered item pair.
                for i in range(len(item_set) - 1):
                    for j in range(i + 1, len(item_set)):
                        combination = [item_set[i], item_set[j]]
                        combination_key = itemset_2_string(combination)
                        if combination_key in item_pairs_and_frequency:
                            continue
                        item_pairs_and_frequency[
                            combination_key] = self.lookup_frequency(
                                combination, class_name)

        return item_pairs_and_frequency
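The nested index loops enumerate unordered pairs; itertools.combinations expresses the same enumeration directly (a standalone sketch, not the project's API):

from itertools import combinations

item_set = ['a', 'b', 'c']
for pair in combinations(item_set, 2):
    print(pair)
# ('a', 'b'), ('a', 'c'), ('b', 'c')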
Example #6
    @staticmethod
    def generate_Lk_w(min_sup_src, L_k1, C_k_file, k, inclusive_items_dict):
        # Generate candidate item-sets with k items from the (k-1)-item level L_k1.
        with open(C_k_file, 'w') as file_writer:
            for key, hash_item_collection in L_k1.get_items():
                for index in range(hash_item_collection.size() - 1):
                    index_th_item = hash_item_collection.get_item(index)
                    if key == '':
                        new_key = index_th_item.last_item
                    else:
                        new_key = key + ',' + index_th_item.last_item
                    new_hash_collection = HashItemCollection()

                    previous_itemset = string_2_itemset(new_key)
                    for item in hash_item_collection.get_items_from(index + 1):
                        # Skip the candidate if it contains an inclusive pair of items.
                        if Apriori.checkInclusiveItems(previous_itemset,
                                                       item.last_item,
                                                       inclusive_items_dict):
                            continue
                        # Create the new item-set and check its support by
                        # intersecting the transaction-id lists.
                        new_item = HashItem(item.last_item)
                        inter_items = set(index_th_item.tids).intersection(
                            item.tids)
                        if len(inter_items) >= min_sup_src:
                            new_item.add_tids(list(inter_items))
                            new_hash_collection.add_item(new_item)

                    # Write the new item-sets to file if there are any.
                    if new_hash_collection.size() > 0:
                        file_writer.write(new_key)
                        file_writer.write('\n')
                        file_writer.write(new_hash_collection.serialize())
                        file_writer.write('\n')
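The new_key construction above is the classic Apriori join step: two frequent (k-1)-item-sets sharing a common prefix are merged into one k-item candidate. A minimal standalone sketch of that join on plain tuples (illustrative names, no tid-list bookkeeping):

def apriori_join(frequent_k1):
    # frequent_k1: sorted tuples of length k-1.
    candidates = []
    for i in range(len(frequent_k1) - 1):
        for j in range(i + 1, len(frequent_k1)):
            a, b = frequent_k1[i], frequent_k1[j]
            # Join only when the first k-2 items (the shared prefix) match.
            if a[:-1] == b[:-1]:
                candidates.append(a + (b[-1],))
    return candidates

print(apriori_join([('bread',), ('butter',), ('milk',)]))
# [('bread', 'butter'), ('bread', 'milk'), ('butter', 'milk')]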
Example #7
def string_2_rule(s):
    # Parse a rule of the form '<left itemset> > <right itemset>'.
    subStrings = s.split('>')
    left = string_2_itemset(subStrings[0].strip())
    right = string_2_itemset(subStrings[1].strip())
    return AssociationRule(left, right)
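Assuming the comma-separated itemset keys used in the other examples, usage would look roughly like this (a hedged sketch; AssociationRule's exact representation may differ):

rule = string_2_rule('bread,butter > milk')
# left  -> itemset ['bread', 'butter']
# right -> itemset ['milk']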