Exemplo n.º 1
0
 def __init__(self,lang,xmlsize=None,inputfilename=None,forcedistrust=False,threshold=0,verbose=False):
     """
     Prepare the handler state for parsing one wiki dump.

     lang          -- key used to pick the entry of the module-level i18n table
     xmlsize       -- stored unchanged on the instance
     inputfilename -- stored unchanged; when truthy it must contain 'current'
     forcedistrust -- accepted, but not used by this constructor
     threshold     -- stored unchanged on the instance
     verbose       -- stored unchanged on the instance
     """
     sax.handler.ContentHandler.__init__(self)

     # configuration, kept exactly as received
     self.lang = lang
     self.xmlsize = xmlsize
     self.inputfilename = inputfilename
     self.threshold = threshold
     self.verbose = verbose

     # parsing state
     self.read = False
     self.validdisc = False # valid discussion
     self.count = 0
     self.last_perc_print = ''

     # first three i18n strings of this language, lower-cased so that
     # comparisons against them are case insensitive
     names = i18n[self.lang]
     self.i18n = tuple([names[k].lower() for k in range(3)])

     # collected results: the user names seen, the node and edge lists
     # and the Network object
     self.allusers = set()
     self.nodes = []
     self.edges = []
     self.network = Network()

     if inputfilename:
         assert 'current' in inputfilename
Exemplo n.º 2
0
 def getNetwork(self):
     """
     Build a fresh Network out of self.pages and return it.

     Each (user, authors) pair of self.pages contributes a node for the
     user, a node for every key of authors, and one edge from that key to
     the user whose 'value' is the string form of the mapped number.
     """
     graph = Network()

     for user,authors in self.pages:
         graph.add_node(node(user))

         for editor,edits in authors.iteritems():
             graph.add_node(node(editor))

             # the edge goes from the one who made the edits
             # to the user who received them
             source = node(editor)
             target = node(user)
             weight = pool({'value':str(edits)})
             graph.add_edge(source,target,weight)

     return graph
Exemplo n.º 3
0
def _read_users(path, occupation):
    """Return {user id (int): node label} read from a '::'-separated file.

    The label is built from the first three fields of a line plus the
    occupation name looked up with the fourth field. Blank lines are
    skipped; any other malformed line still raises.
    """
    users = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            fields = line.split('::')
            # NOTE(review): there is no separator between the third field
            # and the occupation name; kept as it was - confirm whether a
            # space was intended.
            label = fields[0] + ' ' + fields[1] + ' ' + fields[2] + occupation[fields[3]]
            users[int(fields[0])] = label
    return users


def _read_ratings(path):
    """Return {user id: {movie id: rating}} read from a '::'-separated file.

    The last field of every line is dropped and the remaining ones are
    converted to int; the first three are (userid, movieid, rating).
    A later line for the same user and movie replaces an earlier one.
    """
    by_user = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            values = [int(field) for field in line.split('::')[:-1]]
            by_user.setdefault(values[0], {})[values[1]] = values[2]
    return by_user


def _agreement(first, second):
    """Return the edge value for two {movie: rating} dicts, None if no common movie.

    The value is 1.0 when the ratings of all common movies are equal,
    otherwise the number of common movies divided by the sum of the
    absolute rating differences (1 / average difference).
    """
    common = [movie for movie in first if movie in second]
    if not common:
        return None
    distance = sum(abs(first[movie] - second[movie]) for movie in common)
    if not distance:
        return 1.0 #the two users completely agree
    # distance == 0 and distance == len(common) give the same value
    return 1.0 * len(common) / distance


def main():
    """Build a user/user agreement graph from the ratings and write 'graph.dot'.

    Reads 'users.dat' and 'ratings.dat' from the current directory. Both
    files must already be extracted there (an earlier comment in this
    script noted that only a .tar.gz archive is kept in the repository).

    Every user becomes a node named by its label. Two users that rated at
    least one common movie are linked by an edge in each direction, both
    carrying the same 'value' (see _agreement). The progress is printed
    as a percentage, one line per user.
    """
    occupation = {
        "0":  "other",
        "1":  "academic/educator",
        "2":  "artist",
        "3":  "clerical/admin",
        "4":  "college/grad student",
        "5":  "customer service",
        "6":  "doctor/health care",
        "7":  "executive/managerial",
        "8":  "farmer",
        "9":  "homemaker",
        "10":  "K-12 student",
        "11":  "lawyer",
        "12":  "programmer",
        "13":  "retired",
        "14":  "sales/marketing",
        "15":  "scientist",
        "16":  "self-employed",
        "17":  "technician/engineer",
        "18":  "tradesman/craftsman",
        "19":  "unemployed",
        "20":  "writer",
        }

    users = _read_users('users.dat', occupation)

    # the ratings are grouped per user once, instead of scanning the whole
    # list again for every pair of users
    ratings = _read_ratings('ratings.dat')

    #build graph
    m = Network()

    for u in users:
        m.add_node(users[u])

    total = len(users)
    for done, u in enumerate(users):
        print('%d %%' % int(100.0 * (done + 1) / total))

        ur = ratings.get(u, {})
        if not ur:
            # a user without ratings shares no movie with anybody
            continue

        for v in users:
            if u < v:
                value = _agreement(ur, ratings.get(v, {}))
                if value is not None:
                    # the edges use the same names as the nodes added
                    # above, so they connect the labelled nodes
                    m.add_edge(users[u], users[v], {'value': str(value)})
                    m.add_edge(users[v], users[u], {'value': str(value)})

    write_dot(m, 'graph.dot')
Exemplo n.º 4
0
class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """
    SAX content handler for the xml of a wiki "current" dump.

    While parsing it keeps the content of the <title> and <text> elements.
    When the title names a user discussion page, the text is handed to
    getCollaborators and every returned (user, number) pair becomes an
    edge from that user to the owner of the page.

    startElement -- called when a tag begins, with its name and attributes
    endElement   -- called when a tag ends, with its name
    characters   -- called with the data found between the tags
    getNetwork   -- returns the Network built so far
    getPyNetwork -- returns a tuple of two lists:
                    1. the names of the users whose page had collaborators
                    2. tuples (who_edit, who_receive_edit, number_of_edits)
    """

    def __init__(self,lang,xmlsize=None,inputfilename=None,forcedistrust=False,threshold=0,verbose=False):
        """
        lang          -- language of the wiki, key of the i18n table
        xmlsize       -- size of the xml, used only for the progress output
        inputfilename -- when given it must contain 'current'
        forcedistrust -- accepted but not used by this class
        threshold     -- minimum number a collaborator needs to get an edge
        verbose       -- print the progress while parsing (needs xmlsize)
        """
        sax.handler.ContentHandler.__init__(self)
        #lang of wikipedia network
        self.lang = lang
        self.read = False
        self.validdisc = False # valid discussion
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose
        #set parse parameter for this language
        self.i18n = i18n[self.lang]
        #made the comparison case insensitive
        self.i18n = (self.i18n[0].lower(), self.i18n[1].lower(),self.i18n[2].lower() )

        self.allusers = set()

        #buffers filled by characters(); created here so that they exist
        #whatever the order of the events is
        self.ltext = u''
        self.ltitle = u''
        self.lusername = u''

        #these three attributes hold the result:
        #the Network object and the nodes/edges as plain lists
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            assert 'current' in inputfilename

    def startElement(self,name,attrs):
        """Select the buffer that characters() has to fill for this tag."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            #a new title starts a new page: forget the previous user too
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            #the content of every other tag is not loaded
            self.read = False

    def endElement(self,name):
        """Process the collected text or title when its tag is closed."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))
            #see documentation of getCollaborators
            collaborators = getCollaborators(self.ltext,self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u,n in collaborators:
                    #only if the number of edit is at least the threshold
                    if n>=self.threshold:

                        try:
                            #an edge already stored for this pair:
                            #its value is added to the new number
                            edge = self.network.get_edge(node(u),node(self.lusername))
                            n += int(edge['value'])
                        except NetworkXError:
                            #presumably raised when there is no such edge yet
                            pass

                        self.network.add_node(node(u))
                        self.network.add_edge(node(u),node(self.lusername),pool({'value':str(n)}))
                        self.edges.append( (u,self.lusername,n) )

        elif name == u'title':

            ### 'Discussion utente:Paolo-da-skio'
            #the title is stripped once, as a whole: the parser may deliver
            #it in several chunks and the spaces between them must be kept
            title = self.ltitle.strip().split('/')[0].split(':')
            #  comparison case insensitive
            title[0] = title[0].lower()

            self.validdisc = False
            #only titles with a namespace and a non empty user name
            if len(title) > 1 and title[1]:
                english = (i18n['en'][0].lower(), i18n['en'][1].lower())

                # discussion page in english or in the language of this wiki
                if title[0] in (self.i18n[0], english[0]):
                    self.lusername = title[1]
                    self.validdisc = True

                # talk page or user page, english names included
                if title[0] in (self.i18n[1], self.i18n[0]) + english:
                    self.allusers.add(title[1])

        #data found after a closing tag belongs to no buffer
        self.read = False

    def characters(self,contents):
        """Append contents to the buffer selected by startElement."""
        if self.read == u'username':
            #never selected by startElement of this class
            self.lusername += contents.strip()
        elif self.read == u'title':
            #kept raw, endElement strips the complete title
            self.ltitle += contents
        elif self.read == u'text':
            # NOTE(review): every chunk is stripped on its own, so the
            # whitespace at the chunk boundaries is lost; left unchanged
            # because getCollaborators may rely on it - confirm.
            self.ltext += contents.strip()

        #print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            #floor division: the value changes once per percent
            perc = 100*self.count//self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%'%(perc,perc*100//88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network built from the pages parsed so far."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple (nodes, edges) of the plain python lists."""
        return (self.nodes,self.edges)