Пример #1
0
 def load(self, filename):
     """Load a tagged-graph file and cache its lines and tag object.

     Builds a ``Tags`` object from *filename*, makes sure the directory
     ``<filename>.ranks`` exists, asks the ``Tags`` object to save its tag
     frequencies to ``<filename>.ranks/all.tags`` and keeps the raw lines
     of the file for the ``filter*`` methods.

     Raises whatever ``open`` raises when *filename* cannot be read, and
     ``OSError`` when the ``.ranks`` directory is missing and cannot be
     created.
     """
     # Open the input first so a missing file fails before any output
     # directory or file is created; ``with`` closes it on every path.
     with open(filename, 'r') as f:
         self.__tags = Tags(filename)

         ranks_dir = filename + '.ranks'
         try:
             os.mkdir(ranks_dir)
         except OSError:
             # An already existing directory is expected on re-runs;
             # any other failure is real and must not be hidden.
             if not os.path.isdir(ranks_dir):
                 raise

         self.__tags.save_tag_freqs(ranks_dir + '/all.tags')
         self.__lines = f.readlines()
     self.__filename = filename
Пример #2
0
    def build_tag_graph(self, tagged_graph_path, threshold=None):
        """Build a weighted tag graph from a tagged-graph file and save it.

        A ``Tags`` object is created from *tagged_graph_path* (with
        *threshold* and distance calculation switched on). Its tag
        frequencies are saved to ``<path>.tags``; the resulting graph is
        saved to ``<path>.tags.graph``.

        Nodes are ``(tag, frequency)`` pairs. Edges are
        ``(tag_a, tag_b, weight)`` triples where the weight is the distance
        rescaled and inverted to the range [0, 1]: the smallest distance
        gets weight 1, the largest gets weight 0.
        """
        tags = Tags(tagged_graph_path, threshold, True)
        tags.save_tag_freqs(tagged_graph_path + '.tags')

        dist_dict = tags.get_tag_dist()
        tag_freq = tags.get_tag_freq()
        tag_set = tags.get_tag_set()

        print('tag set with distance: %d' % len(tag_set))
        print('tag edges with distance: %d' % len(dist_dict))

        nodes = [(node, tag_freq[node]) for node in tag_set]

        # Use inverted distances as weights.
        edges = []
        if dist_dict:  # max()/min() would raise on an empty mapping
            distances = list(dist_dict.values())
            max_dist, min_dist = max(distances), min(distances)
            # float() forces true division even when distances are ints.
            span = float(max_dist - min_dist)
            for key, val in dist_dict.items():
                if span:
                    weight = (max_dist - val) / span
                else:
                    # All distances are equal, so the rescaling is
                    # undefined; treat every edge as maximally close.
                    weight = 1.0
                edges.append((key[0], key[1], weight))

        self.__graph.add_nodes_from(nodes)
        self.__graph.add_edges_from(edges)

        self.set_corpus_size(tags.get_corpus_size())
        self.save(tagged_graph_path + '.tags.graph')
Пример #3
0
class RankerByTags:

    # without extension 
    def load(self, filename):
        """Load a tagged-graph file and cache its lines and tag object.

        Builds a ``Tags`` object from *filename*, makes sure the directory
        ``<filename>.ranks`` exists, asks the ``Tags`` object to save its
        tag frequencies to ``<filename>.ranks/all.tags`` and keeps the raw
        lines of the file for the ``filter*`` methods.

        Raises whatever ``open`` raises when *filename* cannot be read, and
        ``OSError`` when the ``.ranks`` directory is missing and cannot be
        created.
        """
        # Open the input first so a missing file fails before any output
        # directory or file is created; ``with`` closes it on every path.
        with open(filename, 'r') as f:
            self.__tags = Tags(filename)

            ranks_dir = filename + '.ranks'
            try:
                os.mkdir(ranks_dir)
            except OSError:
                # An already existing directory is expected on re-runs;
                # any other failure is real and must not be hidden.
                if not os.path.isdir(ranks_dir):
                    raise

            self.__tags.save_tag_freqs(ranks_dir + '/all.tags')
            self.__lines = f.readlines()
        self.__filename = filename

    def best_tag(self, tag):
        return self.__tags.best_tag(tag)

    def tag_weight(self, tag):
        return self.__tags.tag_weight(tag)

    def __filter_two_sized_comp(self, nodes, edges):
	 return nodes, edges
         # filter nodes and edges for 2-sized components.
         # code into integers, to avoid use of dictionaries everywhere.
         #print 'mapping nodes to integers'

         print 'Before filtering!!!'
	 print 'Nodes: %d' % len(nodes)
	 print 'Edges: %d' % len(edges)

         
         nodes = list(nodes)
         
         map_node_int = {}
         map_int_node = {}     
         for i in range(len(nodes)):
             node = nodes[i]
             map_node_int[node] = i
             map_int_node[i] = node
    
         outbound = []
         inbound = []
         for i in range(len(nodes)):
             outbound.append([])
             inbound.append([])

         #print 'extracting output neighbors per node'
         for n1, n2 in edges:
             n1 = map_node_int[n1]
             n2 = map_node_int[n2]
             inbound[n2].append(n1)
             outbound[n1].append(n2)

         removable_nodes, removable_edges = [], []
         for n1, n2 in edges:
             n1 = map_node_int[n1]
             n2 = map_node_int[n2]
	     if len(inbound[n1])==0 and len(outbound[n2])==0:
                  removable_nodes.append(map_int_node[n1])
                  removable_nodes.append(map_int_node[n2])
                  removable_edges.append((map_int_node[n1],map_int_node[n2]))
         nodes, edges = set(nodes), set(edges)
         nodes = nodes.difference( set( removable_nodes ) )
         edges = edges.difference( set( removable_edges ) )

	 print 'After filtering!!!'
	 print 'Nodes: %d' % len(nodes)
	 print 'Edges: %d' % len(edges)


         return nodes, list(edges)

    def filter_one_tag(self, tag):
        nodes = set([])
        edges = set([])
        
        tag_lines = self.__tags.get_lines(tag)
        if not tag_lines:
            raise Exception("no tagged graph loaded!")
        
        for line in tag_lines:
            cols = line.split('\t\t')
            if len(cols) < 3:
                continue
            src = cols[0]
            dst = cols[1]
            tags = cols[2].strip().lower().split('|')
            
            if tag in tags:            
                nodes.add(src)
                nodes.add(dst)
                edges.add((src,dst))
                
        nodes,edges = self.__filter_two_sized_comp(nodes, edges)

        self.__nodes = nodes
        self.__edges = edges

    def filter_by_nodes(self, filter_nodes, tags=None):
        nodes = set([])
        edges = set([])
        
        if not self.__lines:
            raise Exception("no tagged graph loaded!")
        
        for line in self.__lines:
            cols = line.split('\t\t')
            if len(cols) < 3:
                continue
            src = cols[0]
            dst = cols[1]
            #tags = cols[2].strip().lower().split('|')
            
            if src in filter_nodes and dst in filter_nodes:
                #nodes.add(src)
                #nodes.add(dst)
                edges.add((src,dst))
                
        nodes = filter_nodes
        nodes,edges = self.__filter_two_sized_comp(nodes, edges)

        self.__nodes = nodes
        self.__edges = edges

    def filter_by_nodes_and_tag(self, filter_nodes, tag=None):
        nodes = set([])
        edges = set([])
        
        if not self.__lines:
            raise Exception("no tagged graph loaded!")
        
        for line in self.__lines:
            cols = line.split('\t\t')
            if len(cols) < 3:
                continue
            src = cols[0]
            dst = cols[1]
            tags = cols[2].strip().lower().split('|')
            
            if src in filter_nodes and dst in filter_nodes and (not tag or tag in tags):
                #nodes.add(src)
                #nodes.add(dst)
                edges.add((src,dst))
                
        nodes = filter_nodes
        nodes,edges = self.__filter_two_sized_comp(nodes, edges)

        self.__nodes = nodes
        self.__edges = edges

    # tag_prop is a tag proposition: an object whose match(tags) decides whether an edge is kept
    def filter(self, tag_prop):
        nodes = set([])
        edges = set([])
        
        if not self.__lines:
            raise Exception("no tagged graph loaded!")
        
        for line in self.__lines:
            cols = line.split('\t\t')
            if len(cols) < 3:
                continue
            src = cols[0]
            dst = cols[1]
            tags = cols[2].strip().lower().split('|')
            
            if tag_prop.match(tags):
                nodes.add(src)
                nodes.add(dst)
                edges.add((src,dst))
                
        nodes,edges = self.__filter_two_sized_comp(nodes, edges)


        self.__nodes = nodes
        self.__edges = edges
     
    # tag_prop is a tag proposition; the loaded lines it matches are written to save_file
    def filter_save(self, tag_prop, save_file):
        nodes = set([])
        edges = set([])
        
        sf = open(save_file, 'w')
        
        if not self.__lines:
            raise Exception("no tagged graph loaded!")
        sa
        for line in self.__lines:
            cols = line.split('\t\t')
            if len(cols) < 3:
                continue
            src = cols[0]
            dst = cols[1]
            #tags = cols[2].strip().lower().split('|')
            
            if tag_prop.match(tags):
                sf.write(line)
                
        self.__nodes = nodes
        self.__edges = edges
     
    def get_nodes(self):
        return self.__nodes
     
    # save graph after filtering
    def save(self, outfilename):
        nodes = self.__nodes
        edges = self.__edges

        print 'writing %s, nodes: %d, edges: %d' % (outfilename, len(nodes), len(edges))
        out = open(outfilename, 'w')
        
        out.write('2\n')
        out.write('0\n')
        for n in nodes:
            out.write('%s\n' % n)
            out.write('2\n')
        out.write('----\n')
        for n1,n2 in edges:
            out.write('%s\n' % n1)
            out.write('%s\n' % n2)
            out.write('2\n')
        
        out.close()
           

    # save the filtered edges as "source target" pairs of 1-based integer node ids
    def save_edges(self, outfilename):
        nodes = self.__nodes
        edges = self.__edges

	print 'Before saving method save_edges!!!'
	print 'Nodes: %d' % len(nodes)
	print 'Edges: %d' % len(edges)

        
        print 'writing %s' % outfilename
        out = open(outfilename, 'w')

        # map nodes to ints
        map = {}
        for i,node in zip(range(len(nodes)), nodes):
            # begin counting in 1
            map[node] = i + 1
            
        for n1,n2 in edges:
	    try:
	            out.write('%d %d\n' % (map[n1], map[n2]))
            except:
		    pass
        out.close()
            
        print 'nodes: %d' % len(nodes)
        print 'edges: %d' % len(edges)

    # save the filtered graph with *Nodes / *DirectedEdges sections (NWB-style layout)
    def save_nwb(self, outfilename):
        nodes = self.__nodes
        edges = self.__edges
        
        print 'writing %s' % outfilename
        out = open(outfilename, 'w')

        # map nodes to ints
        map = {}
        for i,node in zip(range(len(nodes)), nodes):
            # begin counting in 1
            map[node] = i + 1

        out.write('*Nodes        %d\n' % len(nodes))
        out.write('id*int      label*string\n')
        for node, int_val in map.iteritems():
            out.write(' %d "%d"                      \n' % (int_val, int_val))
        out.write('*DirectedEdges\n')
        out.write('source*int      target*int\n')           

        for n1,n2 in edges:
	    try:
                out.write(' %d %d\n' % (map[n1], map[n2]))
            except:
                pass
        
        out.close()
            
        print 'nodes: %d' % len(nodes)
        print 'edges: %d' % len(edges)

    def __snd_cmp(self,A,B):
        ret = A[1] - B[1]
        if ret < 0:
            return -1
        elif ret > 0:
            return 1
        else:
            return 0

    def rank(self, iterations=50, damping_factor=0.85, accurate=False):
        """Run PageRank over the current subgraph and store the result.

        A ``PageRank`` object is built from the node list, the edges, a
        flag that is true only when there is at least one edge, and
        *damping_factor*; its ``ranking(-1, iterations)`` result is kept
        for ``get_rank`` and ``saveRank``. *accurate* is accepted but
        unused.
        """
        edge_pool = self.__edges
        has_edges = len(edge_pool) > 0
        node_list = list(self.__nodes)
        engine = PageRank(node_list, edge_pool, has_edges, damping_factor)
        self.__pagerank = engine.ranking(-1, iterations)


    def get_rank(self, many=None):
        if many:
            return self.__pagerank[:many]
        else:
            return self.__pagerank
        

    def saveRank(self, outfilenamerank):
        pagerank = self.__pagerank
        print 'writing %s' % outfilenamerank
        f = open(outfilenamerank, 'w')

        for t in pagerank:
            f.write( '%s %.24f\n' % t )

        f.close()
        
    def all_ranks(self, filename, compute_ranks, compute_mono_rank, top_tags=None):
        #self.load(filename)               
        tag_freqs = self.__tags.get_top_tags(top_tags)

        # save new subgraph withou tags.
        try:
            os.mkdir(filename + '.ranks')
            pass
        except:
            pass

        if compute_ranks:	
            try:
                os.mkdir(filename + '.ranks')
            except:
                pass
            i = 0
            secs = time.time()            	       
            for tag, tag_freq in tag_freqs:
                print '%d tags of %d' % (i,len(tag_freqs))
                i += 1 	
                if len(tag) > 0:
    
                    #print tag
                    self.filter_one_tag(tag)    
                    
                    # save new subgraph withou tags.

                    outfilename = filename + '.ranks/%s.graph' % tag
                    #self.save(outfilename)        
                    #   	     self.save_edges(outfilename + '.edges')
                
                    # compute rank
                    print 'compute rank: %s' % tag
                    max_iters = 10
                    self.rank(10)
            
                    # save rank
                    outfilenamerank = outfilename + '.rank'
                    if len(tag) < 64:
                        self.saveRank(outfilenamerank)
            secs = time.time() - secs
            open('log.txt','aw').write('faceted singleton ranks, %f seconds, dataset %s\n' % (secs, filename))

            
    
        if compute_mono_rank:
        
            
        
            # compute monolitic rank of the graph
            print 'computing monolitic rank'
            # filter by tag boolean formula
            tag_form = TagBooleanFormula()
            self.filter(tag_form)    
       
            outfilename = filename + '.ranks/.graph' + '.rank'        
            #outfilename = filename + '-.graph.rank'
            # compute rank
            print 'compute rank: complete graph'
            max_iters = 10
            
            secs = time.time()                      
            self.rank(10)
            secs = time.time() - secs
            open('log.txt','aw').write('single rank, %f seconds, dataset %s\n' % (secs, filename))
            
            
            # save rank
            print 'saving monolitic rank'
            self.saveRank(outfilename)
            
        
        print 'finish.'
    def run(self, filename, top_many_tags=20):
        """Run every experiment listed in ``self.__exps_list`` on *filename*.

        Loads the tagged graph, takes the names of the *top_many_tags* top
        tags and obtains monolithic and online tag ranks from
        ``get_tag_ranks``. For each enabled experiment a banner is printed
        and ``experiment_details`` is called with two result file names:
        ``<filename>.<offline_type>.tags-<n>.<label>_vs_offline_or.txt``
        and the matching ``..._and.txt``.

        Recognised experiment keys: ``online1`` (no extra argument),
        ``online2``, ``online3``, ``online4`` and ``mono``, whose extra
        arguments are ``"2"``, ``"3"``, ``"4"`` and ``"mono"``.
        """
        tags = Tags(filename)
        top_tags = tags.get_top_tags(top_many_tags)

        ranker = RankerByTags()
        ranker.load(filename)
        self.__filename = filename

        # Result files carry the offline (gold standard) type in their name.
        filename += "." + self.__offline_type

        top_tags = [entry[0] for entry in top_tags]
        monolitic_tag_ranks, online_tag_ranks = self.get_tag_ranks(ranker, top_tags)

        bool_pairs = [(True, True)]  # ,(False,True),(True,False),(False,False)]

        begin_top_many_users = 1
        end_top_many_users = 10
        # exponential step: 2, 4, 8, 16
        step_top_many_users = 2

        # (experiment key, banner name, file label, tag ranks, extra args)
        experiments = [
            ("online1", "ONLINE", "online", online_tag_ranks, ()),
            ("online2", "ONLINE2", "online2", online_tag_ranks, ("2",)),
            # File label is "online3" for both files, so the OR results
            # no longer overwrite those of the online2 experiment.
            ("online3", "ONLINE3", "online3", online_tag_ranks, ("3",)),
            ("online4", "ONLINE4", "online4", online_tag_ranks, ("4",)),
            ("mono", "MONOLITIC", "mono", monolitic_tag_ranks, ("mono",)),
        ]

        for key, banner, label, tag_ranks, extra in experiments:
            if key not in self.__exps_list:
                continue
            print("")
            print("-------------------------------------------------------------------------")
            print(" OFFLINE VERSUS %s (EXPERIMENT)" % banner)
            or_filename = filename + ".tags-%d.%s_vs_offline_or.txt" % (top_many_tags, label)
            and_filename = filename + ".tags-%d.%s_vs_offline_and.txt" % (top_many_tags, label)
            self.experiment_details(
                top_tags,
                tag_ranks,
                ranker,
                bool_pairs,
                or_filename,
                and_filename,
                begin_top_many_users,
                end_top_many_users,
                step_top_many_users,
                *extra
            )
Пример #5
0
def main():
    """Export and rank tag-filtered subgraphs of one tagged-graph dataset.

    Steps, all on the hard-coded dataset below:
      1. filter with an empty formula and export ``.edges`` / ``.nwb``;
      2. for each tag of interest, export the subgraph of that tag;
      3. for each unordered pair of tags, export and rank both the AND
         and the OR subgraph, saving ranks under ``<filename>.ranks/``.
    """
    filename = '../data/fr_nd.tagged_graph'

    # Tags of interest are fixed here; to use the most frequent tags
    # instead, take the names from Tags(filename).get_top_tags(n).
    top_tags = ['blue', 'flower']

    ranker = RankerByTags()
    ranker.load(filename)

    def build_formula(*tag_groups):
        """Return a formula with one conjunction per group of tags."""
        formula = TagBooleanFormula()
        for group in tag_groups:
            conjunction = TagBooleanConjunction()
            for tag in group:
                conjunction.addAtom(TagBooleanAtom(True, tag))
            formula.addTagAnd(conjunction)
        return formula

    def export_subgraph(tag_form, stem):
        """Filter by tag_form and write <stem>.edges and <stem>.nwb."""
        print(str(tag_form))
        ranker.filter(tag_form)
        ranker.save_edges(stem + '.edges')
        ranker.save_nwb(stem + '.nwb')

    def rank_path(tag_form):
        """Return the rank file path used for tag_form."""
        return filename + '.ranks/' + str(tag_form) + '.graph.rank'

    # 1. Subgraph selected by the empty formula.
    export_subgraph(build_formula(), filename)

    # 2. One subgraph per tag.
    for tag in top_tags:
        tag_form = build_formula([tag])
        export_subgraph(tag_form, filename + '.' + str(tag_form))

    # 3. Subgraphs of tag pairs, ANDed and ORed, each with its ranking.
    for i, tag1 in enumerate(top_tags):
        for tag2 in top_tags[i + 1:]:
            # AND: a single conjunction holding both tags.
            tag_form = build_formula([tag1, tag2])
            export_subgraph(tag_form, filename + '.' + str(tag_form))
            ranker.rank(10)
            ranker.saveRank(rank_path(tag_form))

            # OR: one single-tag conjunction per tag.
            tag_form = build_formula([tag1], [tag2])
            export_subgraph(tag_form, filename + '.' + str(tag_form))
            # NOTE(review): AND ranks use 10 iterations while OR uses
            # rank()'s default -- confirm the difference is intended.
            ranker.rank()
            ranker.saveRank(rank_path(tag_form))

    print('finish.')