class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """SAX handler that builds a user network from a wiki "current" dump.

    Only ``<title>`` and ``<text>`` elements are read:

    * when a ``<title>`` ends, the title is split into namespace and page
      name; if the namespace matches ``i18n[lang][0]`` (or the English
      equivalent) the page is treated as a valid user discussion and the
      page name becomes the current user name;
    * when a ``<text>`` ends on a valid discussion, ``getCollaborators`` is
      run on the page text and one edge per collaborator is added, from the
      collaborator to the owner of the page.

    Results are available through:

    * ``getNetwork()``   -- the ``Network`` object filled while parsing;
    * ``getPyNetwork()`` -- a tuple ``(nodes, edges)`` where ``nodes`` is a
      list of user names and ``edges`` is a list of tuples
      ``(who_edit, who_receive_edit, number_of_edits)``;
    * ``allusers``       -- set of page names seen under the namespaces
      ``i18n[lang][0]``, ``i18n[lang][1]`` and their English equivalents.
    """

    def __init__(self, lang, xmlsize=None, inputfilename=None,
                 forcedistrust=False, threshold=0, verbose=False):
        """Prepare an empty network for the wiki in language ``lang``.

        lang          -- key into the module-level ``i18n`` table.
        xmlsize       -- size of the dump; only used (together with
                         ``verbose``) to print a progress estimate.
        inputfilename -- if given, must contain the substring 'current'.
        forcedistrust -- accepted for interface compatibility; not used by
                         this class.
        threshold     -- collaborators with fewer edits than this on a page
                         are ignored.
        verbose       -- print progress while parsing.
        """
        sax.handler.ContentHandler.__init__(self)

        # language of the wikipedia network
        self.lang = lang
        # name of the element whose character data is being collected,
        # or False when character data must be ignored
        self.read = False
        # True while the current page is a valid user discussion
        self.validdisc = False
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose

        # Buffers filled by characters(); created here so that they exist
        # even before the first <title>/<text> element is seen.
        self.ltext = u''
        self.ltitle = u''
        self.lusername = u''

        # namespace names for this language, lowercased so that the
        # comparisons in endElement are case insensitive
        names = i18n[self.lang]
        self.i18n = (names[0].lower(), names[1].lower(), names[2].lower())

        self.allusers = set()

        # the network is kept in three forms: the Network object itself
        # and plain python lists of nodes and edges
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            # kept as an assert so that callers see the same exception type
            assert 'current' in inputfilename

    def startElement(self, name, attrs):
        """Start collecting character data for <text> and <title> only."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            # a new title means a new page: forget the previous user
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            # disable loading of contents for every other element
            self.read = False

    def endElement(self, name):
        """Process the data collected for a finished <text> or <title>."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))
            # The buffer holds the raw character data; surrounding
            # whitespace is removed once here instead of on every chunk.
            # See the documentation of getCollaborators for the result.
            collaborators = getCollaborators(self.ltext.strip(), self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u, n in collaborators:
                    # only if the number of edits reaches the threshold
                    if n >= self.threshold:
                        try:
                            # an edge already exists: accumulate its value
                            edge = self.network.get_edge(
                                node(u), node(self.lusername))
                            n += int(edge['value'])
                        except NetworkXError:
                            # no previous edge between these two users
                            pass

                        self.network.add_node(node(u))
                        self.network.add_edge(node(u),
                                              node(self.lusername),
                                              pool({'value': str(n)}))
                        self.edges.append((u, self.lusername, n))

        elif name == u'title':
            # e.g. 'Discussion utente:Paolo-da-skio'
            # Drop any sub-page part, then split namespace from page name.
            title = self.ltitle.strip().split('/')[0].split(':')
            # comparison is case insensitive
            title[0] = title[0].lower()
            has_user = len(title) > 1 and bool(title[1])

            # valid discussion: namespace i18n[...][0] in this wiki's
            # language or in English, and a non-empty user name
            if has_user and title[0] in (self.i18n[0],
                                         i18n['en'][0].lower()):
                self.lusername = title[1]
                self.validdisc = True
            else:
                self.validdisc = False

            # remember every user seen under namespaces [0] and [1],
            # again in this wiki's language or in English
            if has_user and title[0] in (self.i18n[1], self.i18n[0],
                                         i18n['en'][0].lower(),
                                         i18n['en'][1].lower()):
                self.allusers.add(title[1])

    def characters(self, contents):
        """Append character data to the buffer selected by startElement.

        A SAX parser may deliver the content of one element in several
        chunks, so title and text chunks are stored untouched; stripping
        each chunk would delete whitespace that falls on a chunk boundary
        (e.g. around an entity reference).  Stripping happens once in
        endElement.
        """
        if self.read == u'username':
            # startElement in this class never selects u'username', so this
            # branch is not reached from here; left as it was.
            self.lusername += contents.strip()
        elif self.read == u'title':
            self.ltitle += contents
        elif self.read == u'text':
            self.ltext += contents

        # print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            # explicit floor division: the value only changes once per
            # percent, so one line is printed per step
            perc = 100 * self.count // self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%' % (perc, perc * 100 // 88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network object filled during parsing."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple (list of nodes, list of edges)."""
        return (self.nodes, self.edges)
class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """SAX handler that builds a user network from a wiki "current" dump.

    Only ``<title>`` and ``<text>`` elements are read:

    * when a ``<title>`` ends, the title is split into namespace and page
      name; if the namespace matches ``i18n[lang][0]`` (or the English
      equivalent) the page is treated as a valid user discussion and the
      page name becomes the current user name;
    * when a ``<text>`` ends on a valid discussion, ``getCollaborators`` is
      run on the page text and one edge per collaborator is added, from the
      collaborator to the owner of the page.

    Results are available through:

    * ``getNetwork()``   -- the ``Network`` object filled while parsing;
    * ``getPyNetwork()`` -- a tuple ``(nodes, edges)`` where ``nodes`` is a
      list of user names and ``edges`` is a list of tuples
      ``(who_edit, who_receive_edit, number_of_edits)``;
    * ``allusers``       -- set of page names seen under the namespaces
      ``i18n[lang][0]``, ``i18n[lang][1]`` and their English equivalents.
    """

    def __init__(self, lang, xmlsize=None, inputfilename=None,
                 forcedistrust=False, threshold=0, verbose=False):
        """Prepare an empty network for the wiki in language ``lang``.

        lang          -- key into the module-level ``i18n`` table.
        xmlsize       -- size of the dump; only used (together with
                         ``verbose``) to print a progress estimate.
        inputfilename -- if given, must contain the substring 'current'.
        forcedistrust -- accepted for interface compatibility; not used by
                         this class.
        threshold     -- collaborators with fewer edits than this on a page
                         are ignored.
        verbose       -- print progress while parsing.
        """
        sax.handler.ContentHandler.__init__(self)

        # language of the wikipedia network
        self.lang = lang
        # name of the element whose character data is being collected,
        # or False when character data must be ignored
        self.read = False
        # True while the current page is a valid user discussion
        self.validdisc = False
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose

        # Buffers filled by characters(); created here so that they exist
        # even before the first <title>/<text> element is seen.
        self.ltext = u''
        self.ltitle = u''
        self.lusername = u''

        # namespace names for this language, lowercased so that the
        # comparisons in endElement are case insensitive
        names = i18n[self.lang]
        self.i18n = (names[0].lower(), names[1].lower(), names[2].lower())

        self.allusers = set()

        # the network is kept in three forms: the Network object itself
        # and plain python lists of nodes and edges
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            # kept as an assert so that callers see the same exception type
            assert 'current' in inputfilename

    def startElement(self, name, attrs):
        """Start collecting character data for <text> and <title> only."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            # a new title means a new page: forget the previous user
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            # disable loading of contents for every other element
            self.read = False

    def endElement(self, name):
        """Process the data collected for a finished <text> or <title>."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))
            # The buffer holds the raw character data; surrounding
            # whitespace is removed once here instead of on every chunk.
            # See the documentation of getCollaborators for the result.
            collaborators = getCollaborators(self.ltext.strip(), self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u, n in collaborators:
                    # only if the number of edits reaches the threshold
                    if n >= self.threshold:
                        try:
                            # an edge already exists: accumulate its value
                            edge = self.network.get_edge(
                                node(u), node(self.lusername))
                            n += int(edge['value'])
                        except NetworkXError:
                            # no previous edge between these two users
                            pass

                        self.network.add_node(node(u))
                        self.network.add_edge(node(u),
                                              node(self.lusername),
                                              pool({'value': str(n)}))
                        self.edges.append((u, self.lusername, n))

        elif name == u'title':
            # e.g. 'Discussion utente:Paolo-da-skio'
            # Drop any sub-page part, then split namespace from page name.
            title = self.ltitle.strip().split('/')[0].split(':')
            # comparison is case insensitive
            title[0] = title[0].lower()
            has_user = len(title) > 1 and bool(title[1])

            # valid discussion: namespace i18n[...][0] in this wiki's
            # language or in English, and a non-empty user name
            if has_user and title[0] in (self.i18n[0],
                                         i18n['en'][0].lower()):
                self.lusername = title[1]
                self.validdisc = True
            else:
                self.validdisc = False

            # remember every user seen under namespaces [0] and [1],
            # again in this wiki's language or in English
            if has_user and title[0] in (self.i18n[1], self.i18n[0],
                                         i18n['en'][0].lower(),
                                         i18n['en'][1].lower()):
                self.allusers.add(title[1])

    def characters(self, contents):
        """Append character data to the buffer selected by startElement.

        A SAX parser may deliver the content of one element in several
        chunks, so title and text chunks are stored untouched; stripping
        each chunk would delete whitespace that falls on a chunk boundary
        (e.g. around an entity reference).  Stripping happens once in
        endElement.
        """
        if self.read == u'username':
            # startElement in this class never selects u'username', so this
            # branch is not reached from here; left as it was.
            self.lusername += contents.strip()
        elif self.read == u'title':
            self.ltitle += contents
        elif self.read == u'text':
            self.ltext += contents

        # print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            # explicit floor division: the value only changes once per
            # percent, so one line is printed per step
            perc = 100 * self.count // self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%' % (perc, perc * 100 // 88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network object filled during parsing."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple (list of nodes, list of edges)."""
        return (self.nodes, self.edges)