def getTagInfo(self, tag):
    """
    Gather co-occurrence statistics for the documents filed under a tag.

    For every related tag / producer reached through those documents, the
    result records how many of the tag's documents it shares and the size
    of its own entry in self.tddb / self.pddb respectively.

    @param tag: key into self.tddb
    @return: TagInfo(tag, documents, {rtag: (intersect, total)},
             {producer: (intersect, total)}, totalsize)
    """
    docs = self.tddb[tag]

    # group the tag's documents by each tag attached to them
    # (relies on invert_multimap, defined elsewhere, to do the inversion)
    tags_by_doc = ((doc, self.dtdb[doc]) for doc in docs)
    rel = {}
    for rtag, shared in invert_multimap(tags_by_doc).iteritems():
        rel[rtag] = (len(shared), len(self.tddb[rtag]))

    # same again, keyed by the entries self.dppb lists for each document
    prods_by_doc = ((doc, self.dppb[doc]) for doc in docs)
    prod = {}
    for nsid, held in invert_multimap(prods_by_doc).iteritems():
        prod[nsid] = (len(held), len(self.pddb[nsid]))

    return TagInfo(tag, docs, rel, prod, self.totalsize)
def inferRelProds(self, prod):
    """
    Infer a set of related producers for the given producer.

    Candidates are the producers that self.dppb lists for any document in
    the source producer's representative document set (prod.rep_d). Each
    candidate is scored as the number of those documents it is listed for,
    divided by the size of its own entry in self.pddb.

    @param prod: source producer; its rep_d and nsid attributes are read
    @return: {nsid: score} for every candidate except the source itself
    """
    holders = invert_multimap((doc, self.dppb[doc]) for doc in prod.rep_d)

    if holders:
        # if it's not empty, then it must refer back to itself
        del holders[prod.nsid]

    scores = {}
    for nsid, docs in holders.iteritems():
        scores[nsid] = float(len(docs)) / len(self.pddb[nsid])
    return scores
def generatePTables(self):
    """
    Build the combined "ptables" graph and the per-user lookup map, and
    store them on self.ptabgr and self.ptbmap. Returns nothing.

    The graph starts as a copy of self.socgr and gains one vertex per key
    of self.gumap and one per vertex of self.sprdgr. Vertex ids fall into
    three consecutive ranges:

      - [0, base_h):      vertices copied from self.socgr
      - [base_h, base_g): one vertex per key of self.gumap
      - [base_g, ...):    one vertex per vertex of self.sprdgr

    The range offsets are stored as the graph attributes "base_z" (always
    0), "base_h" and "base_g". Every edge added here gets the AAT
    attribute set to 0.5, except the self-loops on the first range, which
    get 1.0.

    self.ptbmap maps each NID of self.socgr to a set holding the
    self.gumap keys built from invert_multimap, plus the NIDs of the
    self.sprdgr vertices reached through self.comm.

    NOTE(review): assumes self.comm[i] corresponds to self.sprdgr.vs[i];
    both loops below rely on that pairing - confirm against the code that
    builds self.comm.
    """
    # vertex ids of the existing socgr vertices, keyed by their NID
    id_u = dict((nsid, vid) for vid, nsid in enumerate(self.socgr.vs[NID]))
    base_h = len(id_u)

    # Labels are always built as lists. The previous zip(*...) form
    # produced tuples when non-empty but fell back to lists when empty, so
    # "lab_h + lab_g" raised TypeError whenever exactly one side was empty.
    lab_h = list(self.gumap.iterkeys())
    id_h = dict((nsid, base_h + i) for i, nsid in enumerate(lab_h))
    base_g = len(id_h) + base_h

    # keep the emptiness guard: the NID attribute is only read when the
    # graph actually has vertices
    lab_g = list(self.sprdgr.vs[NID]) if len(self.sprdgr.vs) > 0 else []
    # only the size of this mapping is used (number of vertices to add)
    id_g = dict((nsid, base_g + i) for i, nsid in enumerate(lab_g))

    ptabgr = self.socgr.copy()
    edges = set()

    # add arcs to self
    edges.update((vid, vid) for vid in xrange(0, base_h))

    # add arcs to indexes
    for nsid, users in self.gumap.iteritems():
        hvid = id_h[nsid]
        edges.update((id_u[user], hvid) for user in users)

    # add arcs to tgraphs: first collect, per user, every candidate
    # tgraph vertex
    phmap = {}
    for i, hvids in enumerate(self.comm):
        gvid = base_g + i
        for hid in self.prodgr.vs.select(hvids)[NID]:
            if hid in id_u:
                # this prodgr vertex is itself a socgr vertex; only the
                # remaining ones are looked up in gumap
                continue
            for user in self.gumap[hid]:
                phmap.setdefault(user, set()).add(gvid)

    # only add some of these, to prevent a user linking to eg. 400 tgraphs
    # (a sample of floor(sqrt(n)) of the n candidates; "sample" is
    # presumably random.sample - confirm against the file's imports)
    for user, gvids in phmap.iteritems():
        pvid = id_u[user]
        for gvid in sample(gvids, int(len(gvids) ** 0.5)):
            edges.add((pvid, gvid))

    ptabgr.add_vertices(len(id_h) + len(id_g))
    eend = len(ptabgr.es)  # edges from this index onwards are the new ones
    ptabgr.add_edges(edges)
    ptabgr.es[eend:][AAT] = [0.5] * len(edges)
    # self-loops are overridden after the blanket 0.5 assignment
    ptabgr.es.select(ptabgr.get_eid(vid, vid) for vid in xrange(0, base_h))[AAT] = [1.0] * base_h
    ptabgr.vs[base_h:][NID] = lab_h + lab_g
    ptabgr["base_z"] = 0
    ptabgr["base_h"] = base_h
    ptabgr["base_g"] = base_g
    self.ptabgr = ptabgr

    # for easy access / human readability
    # seed with every socgr NID so each one gets an entry, even if empty
    # (assumes invert_multimap's second argument is the initial result
    # dict - TODO confirm)
    seed = dict((nsid, []) for nsid in self.socgr.vs[NID])
    ugmap = dict(
        (nsid, set(gnsids))
        for nsid, gnsids in invert_multimap(self.gumap.iteritems(), seed).iteritems()
    )
    for i, hvids in enumerate(self.comm):
        spid = self.sprdgr.vs[i][NID]
        for nsid in self.prodgr.vs.select(hvids)[NID]:
            if nsid in ugmap:
                continue
            for uid in self.gumap[nsid]:
                ugmap[uid].add(spid)
    self.ptbmap = ugmap