def testJoin():
    """Smoke test for Join().

    Builds a collection of five hand-made 3-itemsets over five urls,
    joins them and prints the resulting candidate itemsets.
    """
    large_itemsets = cItemsets.cItemsets(3)
    rules = cAssociationRules()

    eins, zwei, drei, vier, fuenf = [
        urlparse.urlparse(address)
        for address in (
            'http://eins',
            'http://zwei',
            'http://drei',
            'http://vier',
            'http://fuenf',
        )
    ]

    # One entry per itemset to create; each entry lists that itemset's urls.
    url_groups = [
        [eins, zwei, drei],
        [eins, zwei, vier],
        [eins, drei, vier],
        [eins, drei, fuenf],
        [zwei, drei, vier],
    ]

    # Build every itemset first, then register them in the same order.
    prepared = []
    for group in url_groups:
        itemset = cItemset.cItemset()
        itemset.SetUrls(group)
        prepared.append(itemset)

    for itemset in prepared:
        large_itemsets.AddItemset(itemset)

    rules.Join(large_itemsets).Print()
def __init__(self):
    """Initialise the data file names and the empty itemset/rule containers."""
    self.sItemsetsFileName, self.sRulesFileName = (
        'data/itemsets.xml',
        'data/assorules.xml',
    )

    # Only the candidate one-itemsets are stored in a file, hence a
    # collection of size 1.
    self.Itemsets = cItemsets.cItemsets(1)
    self.Rules = cRules.cRules()
def ComputeCandidateOneItemsets(self, lSessions):
    """Collect the candidate one itemsets contained in the given sessions.

    One single-url itemset is created for every click of every session
    and handed to the resulting collection via AddItemset().

    lSessions -- list of sessions
    return -- candidate one itemsets
    """
    candidates = cItemsets.cItemsets(1)

    # Lazily walk all clicks of all sessions, session by session.
    all_clicks = (
        click
        for session in lSessions
        for click in session.GetClicks()
    )
    for click in all_clicks:
        single = cItemset.cItemset()
        single.SetUrls([click.GetUrl()])
        candidates.AddItemset(single)

    return candidates
def Join(self, itemsets):
    """Join large (k-1)-itemsets into candidate k-itemsets.

    Every pair of different itemsets is merged. A merge becomes a
    candidate when the union of both url lists holds exactly k distinct
    urls. Each distinct url combination is emitted only once, in the
    order in which it is first produced.

    itemsets -- Lk-1 large k-1 itemsets
    return -- Ck candidate k itemsets, or None if there is no candidate
    """
    k = itemsets.GetSize() + 1
    candidates = cItemsets.cItemsets(k)

    # Fetch the collection once; it is walked repeatedly below.
    lItems = list(itemsets.GetList())

    lJoined = []   # accepted unions (url -> 1), in discovery order
    seen = set()   # frozenset of urls per accepted union, for fast duplicate checks
    for idx, item_i in enumerate(lItems):
        urls_i = item_i.GetUrls()
        # Merging is symmetric, so only itemsets after item_i need to be
        # visited. An itemset merged with itself gains no url and is skipped.
        for item_j in lItems[idx + 1:]:
            # dict keys eliminate urls occurring in both itemsets
            union = {}
            for url in urls_i:
                union[url] = 1
            for url in item_j.GetUrls():
                union[url] = 1
            if len(union) != k:
                continue
            key = frozenset(union)
            if key not in seen:
                seen.add(key)
                lJoined.append(union)

    for union in lJoined:
        candidate = cItemset.cItemset()
        for url in union:
            candidate.AddUrl(url)
        candidates.AddItemset(candidate)

    # Callers receive None rather than an empty collection.
    if len(candidates.lData) == 0:
        return None
    return candidates