def test_tree_iter():
    """Round-trip check: transactions inserted into an FPTree come back
    out of iteration with identical contents and multiplicities."""
    tree = FPTree()
    item_count, _ = count_item_frequency_in(test_transactions)
    expected = Counter()
    for txn in test_transactions:
        # sort_transaction orders the transaction in place by frequency
        # before insertion, as the tree requires.
        sort_transaction(txn, item_count)
        tree.insert(txn)
        expected[frozenset(txn)] += 1
    observed = Counter()
    for txn, count in tree:
        observed[frozenset(txn)] += count
    assert expected == observed
def test_tree_iter():
    """Insert every test transaction (wrapped as Items, frequency-sorted)
    into an FPTree and verify that iterating the tree yields the same
    transactions with the same counts.
    """
    tree = FPTree()
    item_count = count_item_frequency_in(test_transactions)
    expected = Counter()
    for transaction in [list(map(Item, t)) for t in test_transactions]:
        sort_transaction(transaction, item_count)
        tree.insert(transaction)
        expected[frozenset(transaction)] += 1
    # Fix: removed unused local `stored_transactions = set()` — it was
    # assigned and never read.
    observed = Counter()
    for (transaction, count) in tree:
        observed[frozenset(transaction)] += count
    assert expected == observed
def tree_global_change(tree, other_item_count):
    """Measure how much `tree` would change if re-sorted by `other_item_count`.

    For each path in the (already sorted) tree, compute the Levenshtein
    distance between the path and its re-sorted form, normalized as
    (distance / len(path)) ** 2, and average over tree.num_transactions.
    """
    assert tree.is_sorted()
    total = 0.0
    # NOTE(review): the per-path count is not used as a weight here —
    # every distinct path contributes equally. Looks intentional, but
    # worth confirming against callers.
    for path, _count in tree:
        resorted = sort_transaction(path, other_item_count)
        dist = levenstein_distance(path, resorted)
        total += (dist ** 2) / (len(path) ** 2)
    return total / tree.num_transactions
def mine_cp_tree_stream(transactions, min_support, sort_interval, window_size):
    # Yields (window_start_index, window_length, patterns)
    #
    # Streams `transactions` through a CP-tree maintained over a sliding
    # window of `window_size` transactions, re-sorting the tree every
    # `sort_interval` transactions, and mines frequent patterns with
    # fp_growth each time a full window boundary is reached.
    tree = FPTree()
    sliding_window = deque()
    # `frequency` is the item-count snapshot used to sort incoming
    # transactions; None until the first tree.sort().
    frequency = None
    num_transactions = 0
    for transaction in transactions:
        num_transactions += 1
        transaction = sort_transaction(map(Item, transaction), frequency)
        tree.insert(transaction)
        sliding_window.append(transaction)
        if len(sliding_window) > window_size:
            # Window overflow: evict the oldest transaction. It must be
            # re-sorted with the current `frequency` so it matches the
            # ordering the tree currently stores it under.
            transaction = sliding_window.popleft()
            transaction = sort_transaction(transaction, frequency)
            tree.remove(transaction, 1)
            assert (len(sliding_window) == window_size)
            assert (tree.num_transactions == window_size)
        if (num_transactions % sort_interval) == 0:
            # Periodic re-sort: reorder the tree by current item counts
            # and refresh the sorting snapshot.
            tree.sort()
            frequency = tree.item_count.copy()
        if (num_transactions % window_size) == 0:
            if (num_transactions % sort_interval) != 0:
                # We won't have sorted due to the previous check, so we
                # need to sort before mining.
                tree.sort()
                frequency = tree.item_count.copy()
            assert (tree.num_transactions == len(sliding_window))
            assert (len(sliding_window) == window_size)
            min_count = min_support * tree.num_transactions
            patterns = fp_growth(tree, min_count, [])
            yield (num_transactions - len(sliding_window), len(sliding_window), patterns)
    else:
        # for/else: this branch always runs once the input is exhausted
        # (the loop has no break).
        # We didn't just mine on the last transaction, we need to mine now,
        # else we'll miss data.
        if (num_transactions % window_size) != 0:
            if (num_transactions % sort_interval) != 0:
                tree.sort()
                frequency = tree.item_count.copy()
            min_count = min_support * tree.num_transactions
            patterns = fp_growth(tree, min_count, [])
            yield (num_transactions - len(sliding_window), len(sliding_window), patterns)
def build_tree(window, item_count):
    """Merge every bucket in `window` into a single FPTree.

    Each bucket's transactions are re-sorted by `item_count` before
    insertion so the resulting tree has a consistent item ordering.

    Returns:
        (tree, avg_path_len): the merged FPTree and the count-weighted
        average transaction (path) length; 0.0 when the window contains
        no transactions.
    """
    path_len_sum = 0
    path_count = 0
    tree = FPTree()
    for bucket in window:
        for (transaction, count) in bucket.tree:
            sorted_transaction = sort_transaction(transaction, item_count)
            path_len_sum += count * len(sorted_transaction)
            path_count += count
            tree.insert(sorted_transaction, count)
    # Fix: guard against an empty window — the original unconditionally
    # divided and raised ZeroDivisionError when path_count == 0.
    avg_path_len = path_len_sum / path_count if path_count else 0.0
    return (tree, avg_path_len)
def append(self, other_bucket):
    """Merge all transactions from `other_bucket`'s tree into ours.

    Each incoming transaction is ordered with our current sorting
    counter before insertion; afterwards the tree is re-sorted and the
    counter refreshed from the merged item counts.
    """
    for path, occurrences in other_bucket.tree:
        ordered = sort_transaction(path, self.sorting_counter)
        self.tree.insert(ordered, occurrences)
    self.tree.sort()  # TODO: Is this necessary?
    self.sorting_counter = self.tree.item_count.copy()
def add(self, transaction):
    """Insert `transaction` into this bucket's tree, ordered by our
    sorting counter."""
    ordered = sort_transaction(transaction, self.sorting_counter)
    self.tree.insert(ordered)