def testJoin():
    """Smoke test for Join: build five 3-itemsets and print the join result."""
    source_sets = cItemsets.cItemsets(3)
    rules = cAssociationRules()

    addresses = (
        'http://eins',
        'http://zwei',
        'http://drei',
        'http://vier',
        'http://fuenf',
    )
    u1, u2, u3, u4, u5 = [urlparse.urlparse(address) for address in addresses]

    # URL combinations, one entry per itemset to be joined.
    url_groups = (
        [u1, u2, u3],
        [u1, u2, u4],
        [u1, u3, u4],
        [u1, u3, u5],
        [u2, u3, u4],
    )

    # Build every itemset first, then register them, in the same order.
    members = []
    for group in url_groups:
        member = cItemset.cItemset()
        member.SetUrls(group)
        members.append(member)

    for member in members:
        source_sets.AddItemset(member)

    rules.Join(source_sets).Print()
def test():
    """Built-in test method for this class.

    Opens '/tmp/foo.xml' with one cItemsets instance, adds two itemsets and
    closes the file. Then opens the same path with a second instance and
    prints its contents (presumably a write/read round trip -- confirm
    against OpenFile/CloseFile).
    """
    items = cItemsets(2)
    items.OpenFile('/tmp/foo.xml')

    urltuple = urlparse.urlparse('http://slashdot.org')
    urltuple2 = urlparse.urlparse('http://harth.org')

    itemset1 = cItemset.cItemset()
    itemset2 = cItemset.cItemset()

    itemset1.SetUrls([urltuple, urltuple])
    itemset1.SetCount(4711)

    # Fix: the second URL pair and count belong to itemset2. The original
    # applied them to itemset1 again, overwriting it and leaving itemset2
    # unconfigured when it was added below.
    itemset2.SetUrls([urltuple2, urltuple2])
    itemset2.SetCount(4712)

    items.AddItemset(itemset1)
    items.AddItemset(itemset2)
    items.CloseFile()

    items2 = cItemsets(2)
    items2.OpenFile('/tmp/foo.xml')
    items2.Print()

    # XXX pruning step is currently disabled:
    #XXXprint 'now prune...'
    #items2.Prune(2)

    items2.Print()
    items2.CloseFile()
def SetElements(self, lEls):
    """Append one itemset per element to the internal list.

    lEls -- elements, each of the form

        <itemset count="74">
            <url>http://slashdot.org</url>
            ...
        </itemset>
    """
    def _from_element(node):
        # Wrap a single element in a fresh cItemset.
        parsed = cItemset.cItemset()
        parsed.SetElement(node)
        return parsed

    for node in lEls:
        self.lData.append(_from_element(node))
def ComputeCandidateOneItemsets(self, lSessions):
    """Build the candidate 1-itemsets from the clicks of all sessions.

    lSessions -- list of sessions
    return -- candidate one itemsets (one single-URL itemset is added
              per click)
    """
    singles = cItemsets.cItemsets(1)

    # Lazily walk every click of every session, in session order.
    all_clicks = (
        click
        for session in lSessions
        for click in session.GetClicks()
    )

    for click in all_clicks:
        single = cItemset.cItemset()
        single.SetUrls([click.GetUrl()])
        singles.AddItemset(single)

    return singles
def Join(self, itemsets):
    """Join large (k-1)-itemsets into candidate k-itemsets.

    Every unordered pair of distinct itemsets is merged. A merged URL set
    becomes a candidate when it holds exactly k distinct URLs, and each
    distinct URL set is emitted only once. An itemset is never joined with
    itself.

    itemsets -- Lk-1 large k-1 itemsets
    return -- Ck candidate k itemsets, or None if there are no candidates
    """
    # Local import: the file's import block is not visible from this block.
    import itertools

    k = itemsets.GetSize() + 1
    candidates = cItemsets.cItemsets(k)

    # Fetch each URL list once instead of once per pair.
    url_lists = [item.GetUrls() for item in itemsets.GetList()]

    seen = set()    # frozensets of URLs already accepted
    joined = []     # accepted URL lists, in first-seen order

    # (i, j) and (j, i) give the same union, so unordered pairs suffice.
    for urls_i, urls_j in itertools.combinations(url_lists, 2):
        merged = []
        members = set()
        for url in itertools.chain(urls_i, urls_j):
            if url not in members:
                members.add(url)
                merged.append(url)

        if len(merged) != k:
            continue

        key = frozenset(members)
        if key not in seen:
            seen.add(key)
            joined.append(merged)

    for urls in joined:
        candidate = cItemset.cItemset()
        for url in urls:
            candidate.AddUrl(url)
        candidates.AddItemset(candidate)

    if not candidates.lData:
        return None
    return candidates