def load(self, filename):
    """Read a tagged graph file and write its tag frequencies to disk.

    The raw lines of ``filename`` are kept on the instance together with a
    ``Tags`` object built from the same path.  Tag frequencies are written
    to ``<filename>.ranks/all.tags``; the ``<filename>.ranks`` directory is
    created when missing.

    Args:
        filename: path of the tagged graph file to load.

    Raises:
        IOError: if ``filename`` cannot be opened for reading.
    """
    # The file is opened first (as before) so a missing input fails early;
    # the ``with`` block guarantees the handle is released even if Tags or
    # save_tag_freqs raises.
    with open(filename, 'r') as graph_file:
        self.__tags = Tags(filename)
        try:
            os.mkdir(filename + '.ranks')
        except OSError:
            # Best effort: usually the directory already exists.  Any real
            # problem surfaces when all.tags is written just below.
            pass
        self.__tags.save_tag_freqs(filename + '.ranks/all.tags')
        self.__lines = graph_file.readlines()
    self.__filename = filename
def build_tag_graph(self, tagged_graph_path, threshold=None):
    """Build the tag graph of a tagged graph file and save it.

    Nodes are ``(tag, frequency)`` pairs.  Every tag pair that has a
    distance becomes an edge ``(tag_a, tag_b, weight)`` where the weight is
    the distance flipped and normalised into [0, 1]:
    ``(max_dist - dist) / (max_dist - min_dist)``, so the closest pair gets
    weight 1 and the farthest pair weight 0.

    Side effects: writes tag frequencies to ``<path>.tags``, adds the nodes
    and edges to ``self.__graph``, sets the corpus size and saves the graph
    to ``<path>.tags.graph``.

    Args:
        tagged_graph_path: path of the tagged graph file.
        threshold: passed to ``Tags`` as its second positional argument
            (named ``bound_tag_dist`` in the signature quoted by the
            original comment).
    """
    # Tags(filename, bound_tag_dist, calc_tag_distance): the third argument
    # asks for tag distances to be computed.
    tags = Tags(tagged_graph_path, threshold, True)
    tags.save_tag_freqs(tagged_graph_path + '.tags')

    dist_dict = tags.get_tag_dist()
    tag_freq = tags.get_tag_freq()
    tag_set = tags.get_tag_set()
    print('tag set with distance: %d' % len(tag_set))
    print('tag edges with distance: %d' % len(dist_dict))

    nodes = [(node, tag_freq[node]) for node in tag_set]

    # Use opposite distances as weights.
    edges = []
    if dist_dict:  # max()/min() would raise ValueError on an empty dict
        distances = list(dist_dict.values())
        max_dist, min_dist = max(distances), min(distances)
        # float() keeps integer distances from being truncated by
        # Python 2 integer division.
        span = float(max_dist - min_dist)
        for key, dist in dist_dict.items():
            if span:
                weight = (max_dist - dist) / span
            else:
                # All distances are equal: the formula would divide by
                # zero, so every edge gets the same full weight.
                weight = 1.0
            edges.append((key[0], key[1], weight))

    self.__graph.add_nodes_from(nodes)
    self.__graph.add_edges_from(edges)
    self.set_corpus_size(tags.get_corpus_size())
    self.save(tagged_graph_path + '.tags.graph')
class RankerByTags:
    """Select tag-filtered subgraphs of a tagged graph file and rank them.

    The graph file is processed line by line.  A usable line has at least
    three fields separated by a double tab: source node, destination node
    and a '|'-separated tag list (tags are stripped and lower-cased before
    matching).  Shorter lines are ignored.

    One of the ``filter*`` methods selects the current subgraph (nodes and
    edges); the ``save*`` and ``rank`` methods then work on that selection.
    """
    # NOTE(review): the original class header carried the comment
    # "without extension"; what it refers to is not clear from this file.

    def __init__(self):
        """Start with nothing loaded so the 'not loaded' guards can fire."""
        self.__tags = None
        self.__lines = []
        self.__filename = None
        self.__nodes = set()
        self.__edges = set()
        self.__pagerank = []

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def __ensure_ranks_dir(self, filename):
        """Create ``<filename>.ranks`` if possible; never raise OSError."""
        try:
            os.mkdir(filename + '.ranks')
        except OSError:
            # Best effort: usually the directory already exists.  A real
            # problem shows up as soon as a file is written into it.
            pass

    def __parse_edge_line(self, line):
        """Return ``(src, dst, tags)`` for a graph line, or None if too short."""
        cols = line.split('\t\t')
        if len(cols) < 3:
            return None
        return cols[0], cols[1], cols[2].strip().lower().split('|')

    def __require_lines(self):
        """Return the loaded lines or raise when nothing has been loaded."""
        if not self.__lines:
            raise Exception("no tagged graph loaded!")
        return self.__lines

    def __store_subgraph(self, nodes, edges):
        """Post-filter and remember the current node/edge selection."""
        nodes, edges = self.__filter_two_sized_comp(nodes, edges)
        self.__nodes = nodes
        self.__edges = edges

    def __append_log(self, message):
        """Append one message to log.txt in the working directory."""
        # The original opened the file with mode 'aw', which is not a valid
        # mode string, and never closed the handle.
        with open('log.txt', 'a') as log_file:
            log_file.write(message)

    def __filter_two_sized_comp(self, nodes, edges):
        """Return ``nodes`` and ``edges`` unchanged.

        The filtering that used to follow was already disabled by an early
        return; it is preserved in ``__drop_isolated_pairs`` in case it is
        re-enabled.
        """
        return nodes, edges

    def __drop_isolated_pairs(self, nodes, edges):
        """Disabled filter kept for reference; currently not called.

        Removes every edge whose source has no incoming edge and whose
        target has no outgoing edge, together with both of its endpoints.
        Returns the remaining nodes as a set and edges as a list.
        """
        print('Before filtering!!!')
        print('Nodes: %d' % len(nodes))
        print('Edges: %d' % len(edges))
        with_inbound = set(dst for _, dst in edges)
        with_outbound = set(src for src, _ in edges)
        removable_nodes, removable_edges = set(), set()
        for src, dst in edges:
            if src not in with_inbound and dst not in with_outbound:
                removable_nodes.add(src)
                removable_nodes.add(dst)
                removable_edges.add((src, dst))
        nodes = set(nodes).difference(removable_nodes)
        edges = set(edges).difference(removable_edges)
        print('After filtering!!!')
        print('Nodes: %d' % len(nodes))
        print('Edges: %d' % len(edges))
        return nodes, list(edges)

    def __snd_cmp(self, A, B):
        """Three-way comparison (-1, 0, 1) of two sequences by item 1."""
        ret = A[1] - B[1]
        if ret < 0:
            return -1
        elif ret > 0:
            return 1
        return 0

    # ------------------------------------------------------------------
    # loading and tag queries
    # ------------------------------------------------------------------
    def load(self, filename):
        """Read a tagged graph file and write its tag frequencies to disk.

        Keeps the raw lines and a ``Tags`` object built from the same path.
        Tag frequencies are written to ``<filename>.ranks/all.tags``.

        Raises:
            IOError: if ``filename`` cannot be opened for reading.
        """
        # ``with`` releases the handle even if Tags or save_tag_freqs fail.
        with open(filename, 'r') as graph_file:
            self.__tags = Tags(filename)
            self.__ensure_ranks_dir(filename)
            self.__tags.save_tag_freqs(filename + '.ranks/all.tags')
            self.__lines = graph_file.readlines()
        self.__filename = filename

    def best_tag(self, tag):
        """Delegate to ``Tags.best_tag`` of the loaded graph."""
        return self.__tags.best_tag(tag)

    def tag_weight(self, tag):
        """Delegate to ``Tags.tag_weight`` of the loaded graph."""
        return self.__tags.tag_weight(tag)

    # ------------------------------------------------------------------
    # filters: each one replaces the current node/edge selection
    # ------------------------------------------------------------------
    def filter_one_tag(self, tag):
        """Select the edges labelled with ``tag`` and their endpoints.

        Only the lines returned by ``Tags.get_lines(tag)`` are scanned.

        Raises:
            Exception: if there are no lines for the tag / nothing is loaded.
        """
        tag_lines = None
        if self.__tags is not None:
            tag_lines = self.__tags.get_lines(tag)
        if not tag_lines:
            raise Exception("no tagged graph loaded!")
        nodes, edges = set(), set()
        for line in tag_lines:
            parsed = self.__parse_edge_line(line)
            if parsed is None:
                continue
            src, dst, tags = parsed
            if tag in tags:
                nodes.add(src)
                nodes.add(dst)
                edges.add((src, dst))
        self.__store_subgraph(nodes, edges)

    def filter_by_nodes(self, filter_nodes, tags=None):
        """Select the edges whose two endpoints are both in ``filter_nodes``.

        The node selection becomes ``filter_nodes`` itself.  ``tags`` is
        accepted for interface compatibility and is not used.

        Raises:
            Exception: if no graph has been loaded.
        """
        edges = set()
        for line in self.__require_lines():
            parsed = self.__parse_edge_line(line)
            if parsed is None:
                continue
            src, dst, _ = parsed
            if src in filter_nodes and dst in filter_nodes:
                edges.add((src, dst))
        self.__store_subgraph(filter_nodes, edges)

    def filter_by_nodes_and_tag(self, filter_nodes, tag=None):
        """Like ``filter_by_nodes`` but also require ``tag`` on the edge.

        A falsy ``tag`` disables the tag condition.

        Raises:
            Exception: if no graph has been loaded.
        """
        edges = set()
        for line in self.__require_lines():
            parsed = self.__parse_edge_line(line)
            if parsed is None:
                continue
            src, dst, tags = parsed
            if src in filter_nodes and dst in filter_nodes and (not tag or tag in tags):
                edges.add((src, dst))
        self.__store_subgraph(filter_nodes, edges)

    def filter(self, tag_prop):
        """Select the edges whose tag list satisfies a tag proposition.

        Args:
            tag_prop: object with a ``match(tags)`` method that receives the
                list of tags of an edge.

        Raises:
            Exception: if no graph has been loaded.
        """
        nodes, edges = set(), set()
        for line in self.__require_lines():
            parsed = self.__parse_edge_line(line)
            if parsed is None:
                continue
            src, dst, tags = parsed
            if tag_prop.match(tags):
                nodes.add(src)
                nodes.add(dst)
                edges.add((src, dst))
        self.__store_subgraph(nodes, edges)

    def filter_save(self, tag_prop, save_file):
        """Copy the graph lines matching a tag proposition to ``save_file``.

        As in the original code, the current node/edge selection is reset
        to empty sets afterwards.

        Args:
            tag_prop: object with a ``match(tags)`` method.
            save_file: path of the file to (over)write.

        Raises:
            Exception: if no graph has been loaded.
        """
        # Check before opening so a failed call does not leave an empty
        # output file behind.
        lines = self.__require_lines()
        with open(save_file, 'w') as sink:
            for line in lines:
                parsed = self.__parse_edge_line(line)
                # The original referenced ``tags`` without ever assigning
                # it (the assignment was commented out), so it could not run.
                if parsed is not None and tag_prop.match(parsed[2]):
                    sink.write(line)
        self.__nodes = set()
        self.__edges = set()

    def get_nodes(self):
        """Return the nodes of the current selection."""
        return self.__nodes

    # ------------------------------------------------------------------
    # writers for the current selection
    # ------------------------------------------------------------------
    def save(self, outfilename):
        """Write the current selection in the plain-text node/edge layout.

        Layout: the lines '2' and '0', then the node records, a '----'
        separator, then the edge records.
        """
        nodes = self.__nodes
        edges = self.__edges
        print('writing %s, nodes: %d, edges: %d' % (outfilename, len(nodes), len(edges)))
        # NOTE(review): the source reached review with its indentation
        # collapsed; the trailing '2' lines are assumed to belong to each
        # node/edge record (inside the loops) - confirm against a consumer
        # of this format.
        with open(outfilename, 'w') as out:
            out.write('2\n')
            out.write('0\n')
            for node in nodes:
                out.write('%s\n' % node)
                out.write('2\n')
            out.write('----\n')
            for src, dst in edges:
                out.write('%s\n' % src)
                out.write('%s\n' % dst)
                out.write('2\n')

    def save_edges(self, outfilename):
        """Write the current edges as ``'<src id> <dst id>'`` lines.

        Nodes are numbered from 1 in iteration order.  Edges with an
        endpoint that is not among the nodes are skipped.
        """
        nodes = self.__nodes
        edges = self.__edges
        print('Before saving method save_edges!!!')
        print('Nodes: %d' % len(nodes))
        print('Edges: %d' % len(edges))
        print('writing %s' % outfilename)
        # Map nodes to ints, counting from 1.
        node_ids = dict((node, number) for number, node in enumerate(nodes, 1))
        with open(outfilename, 'w') as out:
            for src, dst in edges:
                try:
                    record = '%d %d\n' % (node_ids[src], node_ids[dst])
                except KeyError:
                    continue  # endpoint outside the node selection
                out.write(record)
        print('nodes: %d' % len(nodes))
        print('edges: %d' % len(edges))

    def save_nwb(self, outfilename):
        """Write the current selection with NWB-style section headers.

        Nodes are numbered from 1 in iteration order and listed in
        ascending id order; each node is labelled with its own id.  Edges
        with an endpoint that is not among the nodes are skipped.
        """
        nodes = self.__nodes
        edges = self.__edges
        print('writing %s' % outfilename)
        # Map nodes to ints, counting from 1.
        node_ids = dict((node, number) for number, node in enumerate(nodes, 1))
        with open(outfilename, 'w') as out:
            out.write('*Nodes %d\n' % len(nodes))
            out.write('id*int label*string\n')
            for number in sorted(node_ids.values()):
                out.write(' %d "%d" \n' % (number, number))
            out.write('*DirectedEdges\n')
            out.write('source*int target*int\n')
            for src, dst in edges:
                try:
                    record = ' %d %d\n' % (node_ids[src], node_ids[dst])
                except KeyError:
                    continue  # endpoint outside the node selection
                out.write(record)
        print('nodes: %d' % len(nodes))
        print('edges: %d' % len(edges))

    # ------------------------------------------------------------------
    # ranking
    # ------------------------------------------------------------------
    def rank(self, iterations=50, damping_factor=0.85, accurate=False):
        """Run PageRank on the current selection and keep the result.

        Args:
            iterations: passed as second argument to ``PageRank.ranking``.
            damping_factor: passed to the ``PageRank`` constructor.
            accurate: accepted for interface compatibility; not used.
        """
        # The native implementation is only requested when there are edges.
        use_native = len(self.__edges) > 0
        pagerank = PageRank(list(self.__nodes), self.__edges, use_native, damping_factor)
        # NOTE(review): the meaning of the first argument (-1) is defined by
        # PageRank.ranking, which is not visible here.
        self.__pagerank = pagerank.ranking(-1, iterations)

    def get_rank(self, many=None):
        """Return the stored ranking, or only its first ``many`` entries."""
        if many:
            return self.__pagerank[:many]
        return self.__pagerank

    def saveRank(self, outfilenamerank):
        """Write the stored ranking as ``'<node> <score>'`` lines.

        Each ranking entry must be a 2-item tuple; the score is written
        with 24 decimals.
        """
        print('writing %s' % outfilenamerank)
        with open(outfilenamerank, 'w') as out:
            for entry in self.__pagerank:
                out.write('%s %.24f\n' % entry)

    def all_ranks(self, filename, compute_ranks, compute_mono_rank, top_tags=None):
        """Compute and save per-tag ranks and/or the whole-graph rank.

        Results go to ``<filename>.ranks/``; timings are appended to
        ``log.txt`` in the working directory.  The graph must have been
        loaded with ``load`` beforehand.

        Args:
            filename: path of the loaded graph, used to build output paths.
            compute_ranks: rank one subgraph per top tag when true.
            compute_mono_rank: rank the graph selected by an empty
                ``TagBooleanFormula`` when true.
            top_tags: passed to ``Tags.get_top_tags``.
        """
        tag_freqs = self.__tags.get_top_tags(top_tags)
        self.__ensure_ranks_dir(filename)
        max_iters = 10

        if compute_ranks:
            started = time.time()
            total = len(tag_freqs)
            for index, (tag, _freq) in enumerate(tag_freqs):
                print('%d tags of %d' % (index, total))
                if not tag:
                    continue
                self.filter_one_tag(tag)
                print('compute rank: %s' % tag)
                self.rank(max_iters)
                # Ranks of very long tags are computed but not written.
                if len(tag) < 64:
                    self.saveRank(filename + '.ranks/%s.graph.rank' % tag)
            elapsed = time.time() - started
            self.__append_log(
                'faceted singleton ranks, %f seconds, dataset %s\n' % (elapsed, filename))

        if compute_mono_rank:
            print('computing monolitic rank')
            self.filter(TagBooleanFormula())
            print('compute rank: complete graph')
            started = time.time()
            self.rank(max_iters)
            elapsed = time.time() - started
            self.__append_log(
                'single rank, %f seconds, dataset %s\n' % (elapsed, filename))
            print('saving monolitic rank')
            self.saveRank(filename + '.ranks/.graph.rank')

        print('finish.')
def run(self, filename, top_many_tags=20):
    """Run every enabled offline-versus-X experiment for one dataset.

    Loads the graph, takes the ``top_many_tags`` most frequent tags, gets
    the monolithic and online tag ranks from ``get_tag_ranks`` and, for
    each experiment key present in ``self.__exps_list`` ("online1",
    "online2", "online3", "online4", "mono"), calls ``experiment_details``
    with its own pair of OR/AND result files.

    Args:
        filename: path of the tagged graph file.
        top_many_tags: number of top tags to use; also part of the result
            file names.
    """
    tags = Tags(filename)
    top_tags = tags.get_top_tags(top_many_tags)
    ranker = RankerByTags()
    ranker.load(filename)
    self.__filename = filename
    # Result files carry the offline (gold standard) type in their name.
    filename += "." + self.__offline_type
    top_tags = [entry[0] for entry in top_tags]
    monolitic_tag_ranks, online_tag_ranks = self.get_tag_ranks(ranker, top_tags)

    bool_pairs = [(True, True)]  # ,(False,True),(True,False),(False,False)]
    begin_top_many_users = 1
    end_top_many_users = 10
    # exponential step: 2, 4, 8, 16
    step_top_many_users = 2

    # (key in exps_list, banner title, file label, ranks, extra positional
    # arguments for experiment_details).  "online1" passes no extra
    # argument, exactly as before.
    experiments = [
        ("online1", "ONLINE", "online", online_tag_ranks, ()),
        ("online2", "ONLINE2", "online2", online_tag_ranks, ("2",)),
        # The OR file of this experiment used to be named "online2_...",
        # which overwrote the online2 results.
        ("online3", "ONLINE3", "online3", online_tag_ranks, ("3",)),
        ("online4", "ONLINE4", "online4", online_tag_ranks, ("4",)),
        ("mono", "MONOLITIC", "mono", monolitic_tag_ranks, ("mono",)),
    ]
    for key, title, label, tag_ranks, extra_args in experiments:
        if key not in self.__exps_list:
            continue
        print('')
        print("-------------------------------------------------------------------------")
        print(" OFFLINE VERSUS %s (EXPERIMENT)" % title)
        prefix = filename + ".tags-%d." % top_many_tags + label
        or_filename = prefix + "_vs_offline_or.txt"
        and_filename = prefix + "_vs_offline_and.txt"
        self.experiment_details(
            top_tags,
            tag_ranks,
            ranker,
            bool_pairs,
            or_filename,
            and_filename,
            begin_top_many_users,
            end_top_many_users,
            step_top_many_users,
            *extra_args
        )
def main():
    """Export the whole graph, per-tag subgraphs and pairwise AND/OR subgraphs.

    For the configured dataset this writes ``.edges`` and ``.nwb`` files
    for the unfiltered formula, for each selected tag, and for the AND and
    OR of every pair of selected tags; the pairwise subgraphs are also
    ranked and their ranks saved under ``<filename>.ranks/``.
    """
    # Other datasets this script has been pointed at:
    #   '../data/rpoland--1000.tagged_graph', '../data/rpoland--2000.tagged_graph',
    #   '../data/jcl5m--39370.tagged_graph', '../data/MIX.tagged_graph',
    #   '../data/flickr.tagged_graph', '../data/jcl5m-cuantos.tagged_graph',
    #   '../data/flickr_med.tagged_graph', '../data/yt.tagged_graph',
    #   '../data/fr.tagged_graph', '../data/yt_nd.tagged_graph'
    filename = '../data/fr_nd.tagged_graph'
    top_tags_size = 20

    ranker = RankerByTags()
    ranker.load(filename)

    def make_formula(*tag_groups):
        # One conjunction per group of tags; the groups are OR-ed together.
        formula = TagBooleanFormula()
        for group in tag_groups:
            conjunction = TagBooleanConjunction()
            for tag_name in group:
                conjunction.addAtom(TagBooleanAtom(True, tag_name))
            formula.addTagAnd(conjunction)
        return formula

    def export(formula, labelled=True):
        # Filter by the formula and write the .edges / .nwb pair.
        print(str(formula))
        ranker.filter(formula)
        infix = ''
        if labelled:
            infix = '.' + str(formula)
        ranker.save_edges(filename + infix + '.edges')
        ranker.save_nwb(filename + infix + '.nwb')

    def rank_path(formula):
        return filename + '.ranks/' + str(formula) + '.graph.rank'

    # Whole graph: an empty formula, plain file names.
    export(make_formula(), labelled=False)

    # Graphs of the selected tags, one at a time.  The computed top tags
    # are replaced by a fixed pair (other pair tried: 'music', 'funny').
    tags = Tags(filename)
    top_tags = tags.get_top_tags(top_tags_size)
    top_tags = ['blue', 'flower']
    for single_tag in top_tags:
        export(make_formula([single_tag]))

    # Graphs of the selected tags by pairs.
    tags = Tags(filename)
    top_tags = tags.get_top_tags(top_tags_size)
    top_tags = ['blue', 'flower']
    for position, first_tag in enumerate(top_tags):
        for second_tag in top_tags[position + 1:]:
            # AND: both tags in a single conjunction, ranked with 10 iterations.
            both = make_formula([first_tag, second_tag])
            export(both)
            ranker.rank(10)
            ranker.saveRank(rank_path(both))

            # OR: one conjunction per tag, ranked with the default iterations.
            either = make_formula([first_tag], [second_tag])
            export(either)
            ranker.rank()
            ranker.saveRank(rank_path(either))

    print('finish.')