def __init__(self, lang, xmlsize=None, inputfilename=None,
             forcedistrust=False, threshold=0, verbose=False):
    """Set up the handler state for parsing one wiki dump.

    lang          -- key into the module-level ``i18n`` table
    xmlsize       -- size of the input, used only for progress output
    inputfilename -- when given, must contain the substring 'current'
    forcedistrust -- accepted but not used in this method
    threshold     -- stored as ``self.threshold``
    verbose       -- stored as ``self.verbose``
    """
    sax.handler.ContentHandler.__init__(self)

    # values taken directly from the arguments
    self.lang = lang  # lang of wikipedia network
    self.xmlsize = xmlsize
    self.inputfilename = inputfilename
    self.threshold = threshold
    self.verbose = verbose

    # parser state
    self.read = False
    self.validdisc = False  # valid discussion
    self.count = 0
    self.last_perc_print = ''

    # parse parameters for this language; lower-cased so that later
    # comparisons are case insensitive
    self.i18n = i18n[self.lang]
    self.i18n = tuple(self.i18n[k].lower() for k in (0, 1, 2))

    self.allusers = set()

    # the network is kept in three forms:
    #  - self.network as a graph (XDiGraph)
    #  - self.edges / self.nodes as plain lists
    self.network = Network()
    self.edges = []
    self.nodes = []

    assert not inputfilename or 'current' in inputfilename
def __init__(self, lang, xmlsize=None, inputfilename=None,
             forcedistrust=False, threshold=0, verbose=False):
    """Initialise the SAX handler and its bookkeeping attributes.

    ``lang`` selects an entry of the module-level ``i18n`` table; the
    first three items of that entry are stored lower-cased in
    ``self.i18n``. ``forcedistrust`` is accepted but not used here.
    When ``inputfilename`` is given it must contain 'current'.
    """
    sax.handler.ContentHandler.__init__(self)

    # lang of wikipedia network
    self.lang = lang

    # what is currently being read, and whether the current page is a
    # valid discussion
    self.read = False
    self.validdisc = False

    # progress reporting
    self.xmlsize = xmlsize
    self.inputfilename = inputfilename
    self.count = 0
    self.last_perc_print = ''

    self.threshold = threshold
    self.verbose = verbose

    # set parse parameter for this language, lower-cased to make the
    # comparison case insensitive
    self.i18n = i18n[self.lang]
    lowered = [self.i18n[idx].lower() for idx in (0, 1, 2)]
    self.i18n = tuple(lowered)

    self.allusers = set()

    # these three attributes contain the Network:
    # the first as XDiGraph, the second/third as lists of tuples
    self.network = Network()
    self.edges = []
    self.nodes = []

    assert not inputfilename or 'current' in inputfilename
def getNetwork(self):
    """Return a new Network built from ``self.pages``.

    Every item of ``self.pages`` is unpacked as ``(user, authors)``;
    ``authors`` is iterated with ``iteritems()`` as author -> number of
    edits. Each user and author becomes a node, and each author gets an
    edge towards the user whose 'value' is the edit count as a string.
    """
    graph = Network()
    for page_owner, editors in self.pages:
        graph.add_node(node(page_owner))
        for editor, edit_count in editors.iteritems():
            # the editor must exist as a node before the edge is added
            graph.add_node(node(editor))
            # edge goes from the one who did the edit to the user who
            # received it
            graph.add_edge(
                node(editor),
                node(page_owner),
                pool({'value': str(edit_count)}),
            )
    return graph
def getNetwork(self):
    """Build a Network from ``self.pages`` and return it.

    ``self.pages`` yields ``(user, authors)`` pairs, where ``authors``
    maps each author to a number of edits. The result has one node per
    user and per author, and one author -> user edge per mapping entry
    carrying ``{'value': str(number_of_edits)}``.
    """
    result = Network()
    for receiver, contributions in self.pages:
        result.add_node(node(receiver))
        for contributor, edits in contributions.iteritems():
            # add node
            result.add_node(node(contributor))
            # add edge from the contributor who has done the edits to
            # the receiver of those edits
            result.add_edge(
                node(contributor),
                node(receiver),
                pool({'value': str(edits)}),
            )
    return result
class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """SAX handler that builds a user network from a wiki "current" dump.

    Callbacks used by the parser:
      * startElement -- called when a tag begins; selects which element's
        character data is collected (only <title> and <text>).
      * characters   -- called with the data found between tags.
      * endElement   -- called when a tag ends; processes what was
        collected for <title> and <text>.

    Results:
      * getNetwork   -- the Network with the data calculated so far.
      * getPyNetwork -- a tuple of two lists:
          1. list of strings that represent the nodes
          2. list of tuples that represent the edges
             (who_edit, who_receive_edit, numbers_of_edit_of_who_edit)
    """

    def __init__(self, lang, xmlsize=None, inputfilename=None,
                 forcedistrust=False, threshold=0, verbose=False):
        """Initialise the parser state.

        lang          -- key into the module-level ``i18n`` table
        xmlsize       -- size of the input; enables progress output when
                         ``verbose`` is also true
        inputfilename -- when given, must contain the substring 'current'
        forcedistrust -- accepted but not used by this class
        threshold     -- minimum number of edits for an edge to be added
        verbose       -- print progress while parsing
        """
        sax.handler.ContentHandler.__init__(self)

        # lang of wikipedia network
        self.lang = lang

        self.read = False
        self.validdisc = False  # valid discussion
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose

        # set parse parameter for this language
        self.i18n = i18n[self.lang]
        # lower-cased to make the comparison case insensitive
        self.i18n = (self.i18n[0].lower(),
                     self.i18n[1].lower(),
                     self.i18n[2].lower())

        self.allusers = set()

        # these three attributes contain the Network:
        # the first as XDiGraph, the second/third as lists of tuples
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            assert 'current' in inputfilename

    def startElement(self, name, attrs):
        """Start collecting character data for <text> or <title> only."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            # disable loading of contents for every other element
            self.read = False

    def endElement(self, name):
        """Process the data collected for a closing <text> or <title>."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))

            # see documentation of getCollaborators
            collaborators = getCollaborators(self.ltext, self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u, n in collaborators:
                    # only if the number of edits reaches the threshold
                    if n < self.threshold:
                        continue
                    try:
                        edge = self.network.get_edge(
                            node(u), node(self.lusername))
                    except NetworkXError:
                        # no previous edge between the two users
                        pass
                    else:
                        # accumulate onto the edge that already exists
                        n += int(edge['value'])

                    self.network.add_node(node(u))
                    self.network.add_edge(node(u), node(self.lusername),
                                          pool({'value': str(n)}))
                    self.edges.append((u, self.lusername, n))
        elif name == u'title':
            # The title is complete only now: strip it once here instead
            # of per chunk, so whitespace inside a title that the parser
            # delivered in several pieces is preserved.
            self.ltitle = self.ltitle.strip()
            self.read = False

            # e.g. 'Discussion utente:Paolo-da-skio'
            title = self.ltitle.split('/')[0].split(':')
            # comparison case insensitive
            title[0] = title[0].lower()

            self.validdisc = False
            # only titles of the form '<namespace>:<non-empty name>'
            if len(title) > 1 and title[1]:
                namespace, username = title[0], title[1]
                # NOTE(review): index 0 is used as the discussion (talk)
                # namespace and index 1 as the user namespace -- confirm
                # against the i18n table.
                en_talk = i18n['en'][0].lower()
                en_user = i18n['en'][1].lower()

                # discussion page in english or in the language of
                # this wiki
                if namespace in (self.i18n[0], en_talk):
                    self.lusername = username
                    self.validdisc = True

                # talk page or user page, localized or english
                if namespace in (self.i18n[1], self.i18n[0],
                                 en_talk, en_user):
                    self.allusers.add(username)

    def characters(self, contents):
        """Collect character data for the element selected by startElement.

        The parser may deliver one text node in several calls, so title
        data is appended unmodified and stripped in endElement.
        """
        if self.read == u'username':
            # startElement in this class never selects u'username'
            self.lusername += contents.strip()
        elif self.read == u'title':
            self.ltitle += contents
        elif self.read == u'text':
            # NOTE(review): every chunk is stripped before it is appended,
            # so whitespace at chunk boundaries is dropped from the page
            # text. Left unchanged because getCollaborators may rely on
            # it -- confirm.
            self.ltext += contents.strip()

        # print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            # floor division: same result as '/' on Python 2 integers
            perc = 100 * self.count // self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%' % (perc, perc * 100 // 88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network built so far."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple (list of nodes, list of edges)."""
        return (self.nodes, self.edges)
def main():
    """Build a user-agreement graph from 'users.dat' and 'ratings.dat'.

    Both files are read from the current directory and use '::' as the
    field separator. Every pair of users with at least one movie in
    common gets an edge in both directions whose 'value' is
    ``len(common movies) / sum(|rating differences|)`` (1.0 when the sum
    is zero). The graph is written to 'graph.dot'.
    """
    # occupation code -> description
    occupation = {
        "0": "other",
        "1": "academic/educator",
        "2": "artist",
        "3": "clerical/admin",
        "4": "college/grad student",
        "5": "customer service",
        "6": "doctor/health care",
        "7": "executive/managerial",
        "8": "farmer",
        "9": "homemaker",
        "10": "K-12 student",
        "11": "lawyer",
        "12": "programmer",
        "13": "retired",
        "14": "sales/marketing",
        "15": "scientist",
        "16": "self-employed",
        "17": "technician/engineer",
        "18": "tradesman/craftsman",
        "19": "unemployed",
        "20": "writer",
    }

    # read users: user id -> descriptive label
    # NOTE: 'users.dat' is not in the SVN repository, only the .tar.gz
    # archive is; extracting it at the beginning of this script would make
    # more sense. Remove this note if/when fixed.
    users = {}
    with open('users.dat') as users_file:
        for line in users_file:
            fields = line.strip().split('::')
            if fields == ['']:
                continue  # blank line
            # NOTE(review): no separator is inserted between the third
            # field and the occupation text -- confirm this is intended.
            users[int(fields[0])] = (fields[0] + ' ' + fields[1] + ' ' +
                                     fields[2] + occupation[fields[3]])

    # read ratings, indexed once per user: user id -> {movie id: rating}.
    # The last field of every line is dropped. Looking ratings up by key
    # compares ids by value; the previous `x[0] is u` test compared object
    # identity and silently missed ids outside CPython's small-int cache.
    ratings_by_user = {}
    with open('ratings.dat') as ratings_file:
        for line in ratings_file:
            fields = [int(f) for f in line.strip().split('::')[:-1]]
            if not fields:
                continue  # blank line
            ratings_by_user.setdefault(fields[0], {})[fields[1]] = fields[2]

    # build graph
    m = Network()
    for u in users:
        m.add_node(users[u])

    total = len(users)
    for done, u in enumerate(users, 1):
        print("%d %%" % int(100.0 * done / total))
        ur = ratings_by_user.get(u, {})
        for v in users:
            # handle every unordered pair once
            if not u < v:
                continue
            vr = ratings_by_user.get(v, {})

            # common movies
            uvm = [movie for movie in ur if movie in vr]
            if not uvm:
                continue

            # creation of edge
            s = sum(abs(ur[movie] - vr[movie]) for movie in uvm)
            if not s:
                value = 1.0  # users u and v completely agree
            else:
                value = 1.0 * len(uvm) / s  # 1 / avg
            # NOTE(review): nodes above are added by label (users[u]) but
            # edges use the integer ids -- confirm which one is intended.
            m.add_edge(u, v, {'value': str(value)})
            m.add_edge(v, u, {'value': str(value)})

    write_dot(m, 'graph.dot')
def main():
    """Build a user-agreement graph from 'users.dat' and 'ratings.dat'.

    Both files are read from the current directory and use '::' as the
    field separator. Every pair of users with at least one movie in
    common gets an edge in both directions whose 'value' is
    ``len(common movies) / sum(|rating differences|)`` (1.0 when the sum
    is zero). The graph is written to 'graph.dot'.
    """
    # occupation code -> description
    occupation = {
        "0": "other",
        "1": "academic/educator",
        "2": "artist",
        "3": "clerical/admin",
        "4": "college/grad student",
        "5": "customer service",
        "6": "doctor/health care",
        "7": "executive/managerial",
        "8": "farmer",
        "9": "homemaker",
        "10": "K-12 student",
        "11": "lawyer",
        "12": "programmer",
        "13": "retired",
        "14": "sales/marketing",
        "15": "scientist",
        "16": "self-employed",
        "17": "technician/engineer",
        "18": "tradesman/craftsman",
        "19": "unemployed",
        "20": "writer",
    }

    # read users: user id -> descriptive label
    # NOTE: 'users.dat' is not in the SVN repository, only the .tar.gz
    # archive is; extracting it at the beginning of this script would make
    # more sense. Remove this note if/when fixed.
    users = {}
    with open('users.dat') as users_file:
        for line in users_file:
            fields = line.strip().split('::')
            if fields == ['']:
                continue  # blank line
            # NOTE(review): no separator is inserted between the third
            # field and the occupation text -- confirm this is intended.
            users[int(fields[0])] = (fields[0] + ' ' + fields[1] + ' ' +
                                     fields[2] + occupation[fields[3]])

    # read ratings, indexed once per user: user id -> {movie id: rating}.
    # The last field of every line is dropped. Looking ratings up by key
    # compares ids by value; the previous `x[0] is u` test compared object
    # identity and silently missed ids outside CPython's small-int cache.
    ratings_by_user = {}
    with open('ratings.dat') as ratings_file:
        for line in ratings_file:
            fields = [int(f) for f in line.strip().split('::')[:-1]]
            if not fields:
                continue  # blank line
            ratings_by_user.setdefault(fields[0], {})[fields[1]] = fields[2]

    # build graph
    m = Network()
    for u in users:
        m.add_node(users[u])

    total = len(users)
    for done, u in enumerate(users, 1):
        print("%d %%" % int(100.0 * done / total))
        ur = ratings_by_user.get(u, {})
        for v in users:
            # handle every unordered pair once
            if not u < v:
                continue
            vr = ratings_by_user.get(v, {})

            # common movies
            uvm = [movie for movie in ur if movie in vr]
            if not uvm:
                continue

            # creation of edge
            s = sum(abs(ur[movie] - vr[movie]) for movie in uvm)
            if not s:
                value = 1.0  # users u and v completely agree
            else:
                value = 1.0 * len(uvm) / s  # 1 / avg
            # NOTE(review): nodes above are added by label (users[u]) but
            # edges use the integer ids -- confirm which one is intended.
            m.add_edge(u, v, {'value': str(value)})
            m.add_edge(v, u, {'value': str(value)})

    write_dot(m, 'graph.dot')
class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """SAX handler that builds a user network from a wiki "current" dump.

    Callbacks used by the parser:
      * startElement -- called when a tag begins; selects which element's
        character data is collected (only <title> and <text>).
      * characters   -- called with the data found between tags.
      * endElement   -- called when a tag ends; processes what was
        collected for <title> and <text>.

    Results:
      * getNetwork   -- the Network with the data calculated so far.
      * getPyNetwork -- a tuple of two lists:
          1. list of strings that represent the nodes
          2. list of tuples that represent the edges
             (who_edit, who_receive_edit, numbers_of_edit_of_who_edit)
    """

    def __init__(self, lang, xmlsize=None, inputfilename=None,
                 forcedistrust=False, threshold=0, verbose=False):
        """Initialise the parser state.

        lang          -- key into the module-level ``i18n`` table
        xmlsize       -- size of the input; enables progress output when
                         ``verbose`` is also true
        inputfilename -- when given, must contain the substring 'current'
        forcedistrust -- accepted but not used by this class
        threshold     -- minimum number of edits for an edge to be added
        verbose       -- print progress while parsing
        """
        sax.handler.ContentHandler.__init__(self)

        # lang of wikipedia network
        self.lang = lang

        self.read = False
        self.validdisc = False  # valid discussion
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose

        # set parse parameter for this language
        self.i18n = i18n[self.lang]
        # lower-cased to make the comparison case insensitive
        self.i18n = (self.i18n[0].lower(),
                     self.i18n[1].lower(),
                     self.i18n[2].lower())

        self.allusers = set()

        # these three attributes contain the Network:
        # the first as XDiGraph, the second/third as lists of tuples
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            assert 'current' in inputfilename

    def startElement(self, name, attrs):
        """Start collecting character data for <text> or <title> only."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            # disable loading of contents for every other element
            self.read = False

    def endElement(self, name):
        """Process the data collected for a closing <text> or <title>."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))

            # see documentation of getCollaborators
            collaborators = getCollaborators(self.ltext, self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u, n in collaborators:
                    # only if the number of edits reaches the threshold
                    if n < self.threshold:
                        continue
                    try:
                        edge = self.network.get_edge(
                            node(u), node(self.lusername))
                    except NetworkXError:
                        # no previous edge between the two users
                        pass
                    else:
                        # accumulate onto the edge that already exists
                        n += int(edge['value'])

                    self.network.add_node(node(u))
                    self.network.add_edge(node(u), node(self.lusername),
                                          pool({'value': str(n)}))
                    self.edges.append((u, self.lusername, n))
        elif name == u'title':
            # The title is complete only now: strip it once here instead
            # of per chunk, so whitespace inside a title that the parser
            # delivered in several pieces is preserved.
            self.ltitle = self.ltitle.strip()
            self.read = False

            # e.g. 'Discussion utente:Paolo-da-skio'
            title = self.ltitle.split('/')[0].split(':')
            # comparison case insensitive
            title[0] = title[0].lower()

            self.validdisc = False
            # only titles of the form '<namespace>:<non-empty name>'
            if len(title) > 1 and title[1]:
                namespace, username = title[0], title[1]
                # NOTE(review): index 0 is used as the discussion (talk)
                # namespace and index 1 as the user namespace -- confirm
                # against the i18n table.
                en_talk = i18n['en'][0].lower()
                en_user = i18n['en'][1].lower()

                # discussion page in english or in the language of
                # this wiki
                if namespace in (self.i18n[0], en_talk):
                    self.lusername = username
                    self.validdisc = True

                # talk page or user page, localized or english
                if namespace in (self.i18n[1], self.i18n[0],
                                 en_talk, en_user):
                    self.allusers.add(username)

    def characters(self, contents):
        """Collect character data for the element selected by startElement.

        The parser may deliver one text node in several calls, so title
        data is appended unmodified and stripped in endElement.
        """
        if self.read == u'username':
            # startElement in this class never selects u'username'
            self.lusername += contents.strip()
        elif self.read == u'title':
            self.ltitle += contents
        elif self.read == u'text':
            # NOTE(review): every chunk is stripped before it is appended,
            # so whitespace at chunk boundaries is dropped from the page
            # text. Left unchanged because getCollaborators may rely on
            # it -- confirm.
            self.ltext += contents.strip()

        # print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            # floor division: same result as '/' on Python 2 integers
            perc = 100 * self.count // self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%' % (perc, perc * 100 // 88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network built so far."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple (list of nodes, list of edges)."""
        return (self.nodes, self.edges)