def test_add_nodes(self): ab = charitems.to_binary('ab') cd = charitems.to_binary('cd') bc = charitems.to_binary('bc') G = Graph() assert len(G.components) == 0 # Add first model with nodes to the graph test_model1 = test_helper.init_simple_model() G.add_nodes(ab, test_model1) assert len(G.components) == 1 assert G.components[0].model == test_model1 assert len(G.components[0].model.C) == 1 assert ab in G.components[0].model.C # Added disjoint model test_model2 = test_helper.init_simple_model() G.add_nodes(cd, test_model2) assert len(G.components) == 2 assert test_model2 in [model for model in G.model_iterator()] assert len(G.components[0].model.C) == 1 assert len(G.components[1].model.C) == 1 for model in G.model_iterator(): assert len(model.C) == 1 assert ab in model.C or cd in model.C if ab in model.C: assert not (cd in model.C) elif cd in model.C: assert not (ab in model.C) else: assert False # Add model that will join all existing components test_model3 = test_helper.init_simple_model() G.add_nodes(bc, test_model3) assert len(G.components) == 1 models = [model for model in G.model_iterator()] assert len(models) == 1 assert test_model3 in models assert len(G.components[0].model.C) == 3 assert ab in test_model3.C assert cd in test_model3.C assert bc in test_model3.C
class MTV(object): def __init__(self, D, initial_C=[], k=DEFAULT_K, m=DEFAULT_M, s=DEFAULT_S, z=DEFAULT_Z, v=DEFAULT_V, q=DEFAULT_Q, add_negated=DEFAULT_ADD_NEGATED, greedy=DEFAULT_GREEDY, headers=None): super(MTV, self).__init__() # Mine up to k itemsets self.k = k # Maximum itemset size self.m = m # Support self.s = s # Constraint on max model size self.q = q # If q is set, we will black list singletons from models # having reached the max size self.black_list_singletons = set() # Be verbose self.v = v # Header strings for attributes self.headers = headers # If set to True, MTV will also produce negated patterns self.add_negated = add_negated # If true FindBestItemset will be more greedy self.greedy = greedy # Number of candidate itemsets FindBestItemSet should search for # Will result in a list of top-z highest heuristics self.z = z # Dataset, is it right to always remove empty rows? tmp = [] for i in D: if i != 0: tmp.append(i) self.D = tmp # Singletons self.I = itemsets.singletons(self.D) if self.add_negated: self.D = dataset_with_negations(self.D, self.I) self.I = itemsets.singletons(self.D) # Cached frequency counts in D self.fr_cache = {} # Global summary self.C = list() self.union_of_C = itemsets.union_of_itemsets(self.C) self.BIC_scores = {} self.heuristics = {} # Create a model for holding all singletons # Singletons not used by model in the graph # will be in this model self.singleton_model = Model(self) self.singleton_model.I = self.I.copy() self.singleton_model.iterative_scaling() # Cached queries self.query_cache = {} # List to track history of disjoint components self.independent_components = [] # List to track history of C size self.summary_sizes = [] # List to track how large a search space was searched self.search_space = [] # List to track history of timings of a loop in mtv self.loop_times = [] # Initialize Graph of independent models self.graph = Graph() self.__init_graph(initial_C) self.BIC_scores['initial_score'] = self.score() def mtv(self): """ Run the mtv algorithm """ timer_stopwatch('run') # Added 0 loop_times for seeded itemsets for X in self.C: self.loop_times.append(0) # Run until we have converged while not self.finished(): start = time() X = self.find_best_itemset() if not (self.validate_best_itemset(X)): break self.add_itemset(X) self.loop_times.append(time()-start) if self.v: print 'Found itemset (%.2f secs): %s, BIC-score: %f, model-sizes: %s, searched-nodes: %d' % (timer_stopwatch_time('run'), itemsets.to_index_list(X), self.BIC_scores[X], self.summary_sizes[-1], self.search_space[-1]) def query(self, y): """ Query using models intersected by y """ timer_start('mtv_query') # query intersected models independently mask = y p = 1.0 for model in self.graph.model_iterator(): # Is this an intersected model? if y & model.union_of_C != 0: # get intersection intersection = model.union_of_C & mask # remove from mask mask = intersection ^ mask # query the intersected model p *= model.query(intersection) # disjoint singletons p *= self.singleton_model.query(mask) timer_stop('mtv_query') return p def query_headers(self, itemset_headers): """ Query an itemset by its header names. This method will give a ValueError if the queried headers are not in the headers property of MTV :param itemset_headers: A list of headers :return: model query of the queried itemset """ # itemset for the provided header names, will throw ValueError # if a header name is not in the self.headers property itemset = itemsets.itemset_for_headers(itemset_headers, self.headers) return self.query(itemset) def score(self): total_score = self.singleton_model.score() for model in self.graph.model_iterator(): total_score += model.score() total_score += 0.5 * len(self.C) * log(len(self.D), 2) return total_score def finished(self): """ Return True if the model has converged, or if k is provided, that k itemsets have been found. :return: """ if not (self.k is None): return self.k <= len(self.C) if 1 < len(self.C): # If previous score is lower, the model score has increased # and we should finish. return self.BIC_scores[self.C[-2]] < self.BIC_scores[self.C[-1]] return False def add_itemset(self, X): """ Add an itemset X to C and update MTV. warning: Adding itemsets to C should always be done with this methods, or MTV will be left in an invalid state. :param X: Itemset to be added to C :return: """ heuristic = h(self.fr(X), self.query(X)) # Add X to global summary self.C.append(X) self.union_of_C = itemsets.union_of_itemsets(self.C) self.heuristics[X] = heuristic self.update_graph(X) # Compute score self.BIC_scores[X] = self.score() def __init_graph(self, initial_c): """ Init the graph with itemsets in C :return: """ # Build graph for X in initial_c: self.add_itemset(X) # initialize independent models # and removed singletons from singleton model # for model in self.graph.independent_models(): # model.iterative_scaling() # self.singleton_model.I -= model.I # finally initialize the singleton model # self.singleton_model.iterative_scaling() def update_graph(self, X): """ Updates the graph with a new itemset X. This will always results in a new model being initialized. The new model's C corresponds to a new graph component, that may contain a merge of one or more existing graph components. """ timer_start('Build independent models') new_model, components = self.graph.add_nodes(X, Model(self)) new_model.iterative_scaling() self.update_model_constraints(new_model) # Update the singleton model self.singleton_model.I -= new_model.I self.singleton_model.iterative_scaling() timer_stop('Build independent models') self.graph_stats(components) def cached_itemset_query(self, X): """ Helper function to cache queries. Note this must only be used from e.g. FindBestItemSet, where the model parameters are not altered between cache hits. :param X: Itemset :return: Model estimate of X """ timer_start('Cached query') estimate = 0.0 if X in self.query_cache: estimate = self.query_cache[X] else: estimate = self.query(X) self.query_cache[X] = estimate timer_stop('Cached query') return estimate def find_best_itemset(self): """ Find best itemset in the sample space defined by I. Subject to model parameters z, m, s and the heuristic function h :return: Best itemset Z """ # reset query caches self.query_cache = {} timer_start('Find best itemset') self.search_space.append(0) Z = self.find_best_itemset_rec(0, self.I.copy() - self.black_list_singletons, [(0,0)]) timer_stop('Find best itemset') # Edge case, where we only find singletons not exactly described by the model # We search the top 10 Zs to see if there was a non singleton itemset for z in Z: if not (z in self.I) and z != 0: return z[0] print 'No valid z in Z: ', Z return Z[0][0] def validate_negated_pattern(self, X, y): """ Return true if y unioned with X is a valid itemset when negated attributes are included. The builds on the assumption that only 1 negated attribute can be added to a pattern, which may not be meaningful in all domains. X|y will not be valid if a negated attribute is already in X or if the positive counterpart of y, is already in X :param X: Itemset :param y: Itemset to be unioned with X, if allowed when negated attributes are used :return: True, if y can be unioned with X """ assert self.add_negated # MTV should be setup so half of the attributes # positive positive_attributes = int(len(self.I)/2.) # Check if y is a negated attribute if 2**positive_attributes <= y: # check no other negated attribute is set if X >> positive_attributes != 0: return False # Check if positive counterpart of y is set pos = y >> positive_attributes if pos & X == pos: return False else: # Check y negative counter part is not set pos = y << positive_attributes if pos & X == pos: return False return True def find_best_itemset_rec(self, X, Y, Z, X_length=0, parent_h=0): """ :param X: itemset :param Y: remaining itemsets :param Z: currently best itemset :param s: min support :param m: max itemset size :param X_length: number of items in X. No pretty, but since X is an int, this is the fastest way to know its length :return: Best itemsets Z """ fr_X = self.fr(X) if fr_X < self.s or X in self.C: return Z p_X = self.cached_itemset_query(X) h_X = h(fr_X, p_X) # Greedy approach if self.greedy and h_X < parent_h: return Z if h_X > Z[-1][1] or len(Z) < self.z: # print 'Z improved: ', h_X Z.append((X, h_X)) # Sort by descending heuristic Z.sort(lambda x, y: x[1] < y[1] and 1 or -1) if self.z < len(Z): Z.pop() XY = X | itemsets.union_of_itemsets(Y) fr_XY = self.fr(XY) p_XY = self.cached_itemset_query(XY) b = max(h(fr_X, p_XY), h(fr_XY, p_X)) if Z[0][0] == 0 or b > Z[-1][1]: if self.m == 0 or X_length < self.m: while 0 < len(Y): y = Y.pop() self.search_space[-1] += 1 # If we are mining negated patterns # we have to check that y can be unioned with X if not self.add_negated or self.validate_negated_pattern(X, y): Z = self.find_best_itemset_rec(X | y, Y.copy(), Z, X_length + 1, parent_h=h_X) return Z def fr(self, x): """ :param x: Itemset :return: Frequency of x in D """ if x in self.fr_cache: return self.fr_cache[x] p = 0.0 for xi in self.D: if itemsets.contains(xi, x): p += 1 p = p / len(self.D) assert p <= 1.0 self.fr_cache[x] = p return p def U(self): """ Returns U over all models :return: U """ U = {} for model in self.graph.model_iterator(): U.update(model.U) return U def u0(self): """ Returns u0 over all models :return: u0 """ u0 = self.singleton_model.u0 for model in self.graph.model_iterator(): u0 *= model.u0 return u0 def update_model_constraints(self, newest_model): """ If C_i in some model has grow larger than q, we remove the singletons in the model from the search space :param newest_model: """ if not (self.q is None): # Blacklist model singletons if len(newest_model.C) >= self.q: self.black_list_singletons = self.black_list_singletons.union(newest_model.I) def validate_best_itemset(self, itemset): """ Returns True if an itemset can be added to C. Singletons or the empty set should not be added to C, but this can happen in cases where e.g. thresholds for support or minimum itemset size are too strict. """ if itemset in self.I: print 'X was a singleton! These should not be possible from the heurestic' return False if itemset == 0: print 'Best itemset found was the empty set (0). This could ' \ 'mean the heurestic could not find any itemset ' \ 'not already predicted by the model, or above ' \ 'the provided thresholds. Exiting MTV' return False return True def graph_stats(self, components): """ Log stats on the graph :param components: Current Graph Components """ self.independent_components.append(len(components)) C_sizes = [] for component in components: C_sizes.append(len(component.model.C)) self.summary_sizes.append(C_sizes) counter_max('Independent models', len(components))