def scoreDoc(id, k):
    # NOTE(review): this appears to be a stray top-level copy of the closure
    # nested inside inferScores() -- it relies on enclosing-scope names
    # (g, sc_t, self, union_ind, AAT, IN) that are not parameters here.
    # Confirm this fragment is intentional.
    #
    # Estimates P(d) for document <id> at iteration value <k>: the
    # independent union of k*P(t|d)/P(t) over all in-arcs of the document.
    eseq = g.es.select(g.adjacent(id, IN))
    try:
        return union_ind(k*e[AAT]/sc_t[e.source-self.base_t] for e in eseq)
    except IndexError:
        # debug aid: dump the offending arc sources before re-raising
        print list(e.source for e in eseq)
        raise
def createTGraph(self, totalsize, pgdb, display=False, node_attr=None):
    """
    Creates a graph representing this producer as a tgraph.

    @param totalsize: total number of documents in the entire world
    @param pgdb: an open database of {prid:Producer} (for tgraphs)
    @param display: whether to generate for display (adds attributes to
           pretty up the graph)
    @param node_attr: {attr:(tag,prod)} node attributes for graphviz; each
           attribute should be mapped to a (tag,prod) pair that holds the
           attribute value for the respective type of node; this only has
           an effect if <display> is True
    """
    # BUGFIX: the default used to be a mutable dict literal in the
    # signature, shared across all calls; build it per-call instead.
    if node_attr is None:
        node_attr = {
          "style": ("filled", "filled"),
          "fillcolor": ("firebrick1", "limegreen"),
          "shape": ("ellipse", "doublecircle"),
        }

    # estimate total size from producer's own perspective
    # the formula is pulled out of my ass but should give passable results
    # - neighbours are not independent => total lower than this
    # - neighbours are not entire network => total higher than this
    total = union_ind(chain([self.size()], (pgdb[self.docgr.vs[pid]["id"]].size() for pid in self.prange())), totalsize)
    # print "producer %s (%s): total size of network estimated to be %s (actual %s)" % (self.nsid, self.size(), total, totalsize)

    gg = self.docgr.copy()
    del gg.vs[NAA]
    gg["base_t"] = 0
    gg["base_g"] = self.base_p - self.base_t

    # node-attrs for prange
    gg.vs[self.base_p:][NAT] = [pgdb[gg.vs[pid][NID]].size()/float(total) for pid in self.prange()]

    # infer arcs between tags
    mem = [filter(lambda id: id in self.drange(), gg.successors(tid)) for tid in self.trange()]
    edges, arc_a = infer_arcs(mem, total)
    gg.delete_vertices(self.drange())
    gg.add_edges(edges)
    #assert gg.es[-len(edges):][AAT] == [None] * len(edges)
    gg.es[-len(edges):][AAT] = arc_a

    if display:
        gg.vs["label"] = gg.vs[NID]
        del gg.vs[NID]
        for attr, val in node_attr.iteritems():
            # BUGFIX: doc vertices were deleted above, so only tag and
            # producer nodes remain, and <val> is a (tag,prod) pair per the
            # docstring. The old code indexed val[2] and included drange(),
            # which raised IndexError with the default node_attr and could
            # not match the graph's vertex count.
            gg.vs[attr] = [val[0] for i in self.trange()] + [val[1] for i in self.prange()]
    return gg
def selectTagsFromClusters(self, tset_s, tset_t):
    """
    Selects tags from the intersection between each cluster for a source
    tag, and the target tagset. The representatives of the cluster are
    also selected, if the intersection is large enough.

    @param tset_s: source tag-set
    @param tset_t: target tag-set
    @return: (rtags, htags), where rtags = {rtag:[tag]} associates tags on
             the target side to related tags on the source side, and htags
             = {htag:e_attr} associates "high-level" tags (which might not
             exist on the target side) to appropriate arc-attributes.
    """
    #LOG.debug("III enter selectTagsFromClusters: %s %s" % (len(tset_s), len(tset_t)))
    rtags = {}
    htags = {}
    # set gives O(1) intersection tests below
    if not isinstance(tset_t, set):
        tset_t = set(tset_t)

    for tag in tset_s:
        for cluster in self.tcdb[tag]:
            tset_x = tset_t.intersection(cluster)

            # add intersection to rtags
            for rtag in tset_x:
                rtags.setdefault(rtag, []).append(tag)

            # if intersection is big enough, add "representative" tags of
            # this cluster to htags
            if 3*len(tset_x) > len(cluster): # TWEAK
                # on flickr, this is the first 3 tags
                attr = len(tset_x)/float(len(cluster))
                for rtag in cluster[0:3]:
                    htags.setdefault(rtag, []).append(attr)
                    # BUGFIX: the old code did rtags[rtag] = [tag] the first
                    # time <rtag> entered htags, clobbering any association
                    # list already accumulated for <rtag> from earlier
                    # source tags/clusters; append instead.
                    rtags.setdefault(rtag, []).append(tag)

    #LOG.debug("XXX exit selectTagsFromClusters: %s %s" % (len(tset_s), len(tset_t)))
    return rtags, dict((htag, union_ind(attrs)) for htag, attrs in htags.iteritems())
def inferScores(self, init=0.5): """ Infer scores for docs and tags. DOCUMENT more detail """ g = self.docgr # doc-tag weight is P(t|d) # tags and docs are considered as bags of meaning # a producer = union of tags = union of docs # Infer P(t|this) = union_ind(P(t|d) over all d attached to t) # # Justification: roughly, 1 match out of any is satisfactory. We have # no further information so assume P(t|d) independent over d. sc_t = list(union_ind(g.es.select(g.adjacent(id, OUT))[AAT]) for id in self.trange()) # Infer P(d|this) = union_ind(P(d|t) over all t attached to d) # # We assume that P(d|this) = P(d). This is NOT theoretically sound, but # it doesn't matter because this heuristic is only used within this # producer, to rank documents. (In reality, P(d|this) >> P(d).) # # We rewrite P(d|t) in terms of P(t|d); this results in a formula with # P(d) on both sides; we use iterconverge to find a non-zero solution. # # Special case: if there is only 1 tag, its weight is 1.0, and its arc # weight is 1.0, then iteration will always return the inital value. # So we'll arbitrarily choose init=0.5 by default. sc_d = [] def scoreDoc(id, k): eseq = g.es.select(g.adjacent(id, IN)) try: return union_ind(k*e[AAT]/sc_t[e.source-self.base_t] for e in eseq) except IndexError: print list(e.source for e in eseq) raise for id in self.drange(): sc_d.append(iterconverge(partial(scoreDoc, id), (0,1), init, eps=2**-32, maxsteps=0x40)) self.docgr.vs[NAA] = sc_d + sc_t