示例#1
0
	def getTagInfo(self, tag):
		"""
		Gather summary information about a tag.

		Looks up the documents stored under the tag in self.tddb, then builds
		two maps from those documents:

		- related tags, keyed by each entry found in self.dtdb[doc]
		- producers, keyed by each entry found in self.dppb[doc]

		Each map value is a pair (intersect, total): the number of items that
		invert_multimap grouped under the key, and the size of that key's own
		entry in self.tddb (for tags) or self.pddb (for producers).

		NOTE(review): assumes invert_multimap turns (key, values) pairs into a
		mapping of value -> collection of keys; confirm against its definition.

		@param tag: key into self.tddb
		@return: TagInfo(tag, documents, {rtag: (intersect, total)},
		         {producer: (intersect, total)}, totalsize)
		"""
		docs = self.tddb[tag]

		# related tags: every tag attached to one of this tag's documents
		tags_by_doc = ((doc, self.dtdb[doc]) for doc in docs)
		rel = {}
		for rtag, shared in invert_multimap(tags_by_doc).iteritems():
			rel[rtag] = (len(shared), len(self.tddb[rtag]))

		# producers: every producer listed for one of this tag's documents
		prods_by_doc = ((doc, self.dppb[doc]) for doc in docs)
		prod = {}
		for nsid, held in invert_multimap(prods_by_doc).iteritems():
			prod[nsid] = (len(held), len(self.pddb[nsid]))

		return TagInfo(tag, docs, rel, prod, self.totalsize)
示例#2
0
	def inferRelProds(self, prod):
		"""
		Infer a set of related producers for the given producer.

		Candidate producers are those listed in self.dppb for the documents in
		the source producer's representative set (prod.rep_d). Each candidate
		is scored by the number of documents grouped under it divided by the
		size of its own entry in self.pddb.

		NOTE(review): assumes invert_multimap turns (doc, producers) pairs into
		a mapping of producer -> collection of docs; confirm against its
		definition.

		@param prod: source producer; its rep_d and nsid attributes are read
		@return: {nsid: score} for every candidate except the source producer
		"""
		holders = invert_multimap((doc, self.dppb[doc]) for doc in prod.rep_d)

		# A non-empty result must contain the source producer itself, since it
		# holds its own representative documents; drop that self-reference.
		if holders:
			del holders[prod.nsid]

		scores = {}
		for nsid, shared in holders.iteritems():
			scores[nsid] = float(len(shared)) / len(self.pddb[nsid])
		return scores
示例#3
0
	def generatePTables(self):
		"""
		Build the ptables graph and the per-user lookup map.

		The graph starts as a copy of self.socgr and is extended with one
		vertex per key of self.gumap and one vertex per vertex of self.sprdgr.
		Vertex ids fall into three consecutive ranges:

		- [0, base_h): the vertices copied from self.socgr
		- [base_h, base_g): one vertex per key of self.gumap ("indexes")
		- [base_g, ...): one vertex per vertex of self.sprdgr ("tgraphs")

		Edges added on top of the copied graph (attribute AAT is 0.5 unless
		noted otherwise):

		- a self-loop on every vertex in [0, base_h), with AAT set to 1.0
		- user -> index, for every user listed under that index in self.gumap
		- user -> tgraph, for a random sample of int(sqrt(n)) of the n tgraphs
		  collected for that user from self.comm

		Side effects: stores the graph in self.ptabgr (with graph attributes
		"base_z", "base_h" and "base_g") and stores a map of user NID -> set of
		index / tgraph NIDs in self.ptbmap.

		NOTE(review): relies on self.comm[i] corresponding to self.sprdgr.vs[i],
		and on every NID handled via self.gumap[...] being a key of self.gumap;
		confirm against the code that builds those structures.
		"""
		id_u = dict((nsid, vid) for vid, nsid in enumerate(self.socgr.vs[NID]))

		# Labels are always built as lists. The previous zip(*...) form gave
		# tuples when non-empty but lists when empty, so `lab_h + lab_g` below
		# raised TypeError whenever exactly one of the two was empty.
		base_h = len(id_u)
		lab_h = list(self.gumap.iterkeys())
		id_h = dict((nsid, base_h + vid) for vid, nsid in enumerate(lab_h))

		base_g = len(id_h) + base_h
		# guard kept from the original: NID is only read from a non-empty graph
		lab_g = list(self.sprdgr.vs[NID]) if len(self.sprdgr.vs) > 0 else []
		id_g = dict((nsid, base_g + vid) for vid, nsid in enumerate(lab_g))

		ptabgr = self.socgr.copy()

		edges = set()
		# add arcs to self
		edges.update((vid, vid) for vid in xrange(0, base_h))
		# add arcs to indexes
		for nsid, users in self.gumap.iteritems():
			hvid = id_h[nsid]
			edges.update((id_u[user], hvid) for user in users)
		# add arcs to tgraphs
		phmap = {}
		for i, hvids in enumerate(self.comm):
			gvid = base_g + i
			for hid in self.prodgr.vs.select(hvids)[NID]:
				# NIDs that are socgr vertices are skipped; only the remaining
				# ones are looked up in self.gumap
				if hid in id_u:
					continue
				for user in self.gumap[hid]:
					phmap.setdefault(user, set()).add(gvid)
		# only add some of these, to prevent a user linking to eg. 400 tgraphs
		for user, gvids in phmap.iteritems():
			pvid = id_u[user]
			for gvid in sample(gvids, int(len(gvids)**0.5)):
				edges.add((pvid, gvid))

		ptabgr.add_vertices(len(id_h) + len(id_g))
		# remember where the new edges start so only they receive the 0.5 default
		eend = len(ptabgr.es)
		ptabgr.add_edges(edges)
		ptabgr.es[eend:][AAT] = [0.5] * len(edges)
		# self-loops get full weight, overriding the default assigned above
		ptabgr.es.select(ptabgr.get_eid(vid, vid) for vid in xrange(0, base_h))[AAT] = [1.0] * base_h
		ptabgr.vs[base_h:][NID] = lab_h + lab_g
		ptabgr["base_z"] = 0
		ptabgr["base_h"] = base_h
		ptabgr["base_g"] = base_g
		self.ptabgr = ptabgr

		# for easy access / human readability
		# Seed the inversion with an empty entry per socgr NID.
		# NOTE(review): assumes invert_multimap fills and returns the mapping
		# passed as its second argument; confirm against its definition.
		seed = dict((nsid, []) for nsid in self.socgr.vs[NID])
		ugmap = dict((nsid, set(gnsid)) for nsid, gnsid in
		  invert_multimap(self.gumap.iteritems(), seed).iteritems())
		for i, pvid in enumerate(self.comm):
			spid = self.sprdgr.vs[i][NID]
			for nsid in self.prodgr.vs.select(pvid)[NID]:
				if nsid in ugmap:
					continue
				for uid in self.gumap[nsid]:
					ugmap[uid].add(spid)
		self.ptbmap = ugmap