Exemplo n.º 1
0
class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """SAX content handler for a wiki "current" XML dump.

    For every ``<title>`` whose namespace prefix equals ``i18n[lang][0]`` or
    ``i18n['en'][0]`` (the talk-page prefixes, judging by the original
    comments) the following ``<text>`` is passed to ``getCollaborators`` and
    the returned ``(user, count)`` pairs are stored as weighted edges
    ``user -> page owner``.

    SAX callbacks:
      * startElement: called when a tag begins, with its name and attributes
      * endElement: called when a tag ends, with its name
      * characters: called (possibly several times) with the data found
        between the start and the end of a tag

    Results:
      * getNetwork: return the Network built so far
      * getPyNetwork: return a tuple of two lists:
          1. list of strings representing the nodes
          2. list of tuples representing the edges
             (who_edit, who_receive_edit, numbers_of_edit_of_who_edit)
    """

    def __init__(self,
                 lang,
                 xmlsize=None,
                 inputfilename=None,
                 forcedistrust=False,
                 threshold=0,
                 verbose=False):
        """Initialise the parser state.

        lang          -- key into the module-level ``i18n`` table
        xmlsize       -- size of the input, used only for the progress output
        inputfilename -- if given, must contain the substring 'current'
        forcedistrust -- accepted for interface compatibility; not used here
        threshold     -- minimum count a collaborator needs to become an edge
        verbose       -- print progress while parsing (needs ``xmlsize``)
        """
        sax.handler.ContentHandler.__init__(self)
        # lang of wikipedia network
        self.lang = lang
        # name of the element whose character data is being buffered, or False
        self.read = False
        self.validdisc = False  # valid discussion
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        # '' never equals an integer percentage, so the first value is printed
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose
        # set parse parameter for this language
        self.i18n = i18n[self.lang]
        # make the comparison case insensitive
        self.i18n = (self.i18n[0].lower(), self.i18n[1].lower(),
                     self.i18n[2].lower())

        self.allusers = set()

        # these three attributes contain the network,
        # the first as XDiGraph
        # the second/third as list of tuple
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            # NOTE: kept as an assert so callers see the same exception type;
            # be aware that it is skipped when Python runs with -O.
            assert 'current' in inputfilename

    def startElement(self, name, attrs):
        """Start buffering for ``text``/``title``; ignore every other tag."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            # disable loading of contents
            self.read = False

    def endElement(self, name):
        """Consume the buffered data of a just-closed ``text`` or ``title``."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))
            # The buffer holds the raw character data; trim it once here
            # instead of trimming every SAX chunk (which dropped whitespace
            # inside the text).  See documentation of getCollaborators.
            collaborators = getCollaborators(self.ltext.strip(), self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u, n in collaborators:
                    # only if the number of edits reaches the threshold
                    if n >= self.threshold:

                        try:
                            # an edge already exists: accumulate its value
                            edge = self.network.get_edge(
                                node(u), node(self.lusername))
                            n += int(edge['value'])
                        except NetworkXError:
                            # treated as "no such edge yet"
                            pass

                        self.network.add_node(node(u))
                        self.network.add_edge(node(u), node(self.lusername),
                                              pool({'value': str(n)}))
                        self.edges.append((u, self.lusername, n))

        elif name == u'title':

            # e.g. 'Discussion utente:Paolo-da-skio'
            # Drop any sub-page part, then split namespace from page name.
            title = self.ltitle.strip().split('/')[0].split(':')
            # comparison case insensitive
            title[0] = title[0].lower()

            # if the discussion is in english or in the language of this
            # wiki, and the name of the user is not ''
            if (len(title) > 1) and (
                (title[0] == self.i18n[0]) or
                (title[0] == i18n['en'][0].lower())) and title[1]:
                self.lusername = title[1]
                self.validdisc = True
            else:
                self.validdisc = False

            # True if it is a talk page or user page (local or english)
            if len(title) > 1 and title[0] in (
                    self.i18n[1], self.i18n[0], i18n['en'][0].lower(),
                    i18n['en'][1].lower()) and title[1]:
                self.allusers.add(title[1])

        # The element is closed: stop buffering until the next start tag.
        self.read = False

    def characters(self, contents):
        """Append a chunk of character data to the active buffer.

        The parser may split the data of one element over several calls, so
        ``title`` and ``text`` chunks are stored untouched and trimmed only
        when the element ends.
        """
        if self.read == u'username':
            # never enabled by this class' startElement; left unchanged
            self.lusername += contents.strip()
        elif self.read == u'title':
            self.ltitle += contents
        elif self.read == u'text':
            self.ltext += contents

        # print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            # floor division keeps the percentage an integer on Python 3 too,
            # so a line is printed only when the percentage really changes
            perc = 100 * self.count // self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%' % (perc, perc * 100 // 88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network object built while parsing."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple ``(nodes, edges)`` of plain Python lists."""
        return (self.nodes, self.edges)
Exemplo n.º 2
0
class WikiCurrentContentHandler(sax.handler.ContentHandler):
    """SAX content handler for a wiki "current" XML dump.

    For every ``<title>`` whose namespace prefix equals ``i18n[lang][0]`` or
    ``i18n['en'][0]`` (the talk-page prefixes, judging by the original
    comments) the following ``<text>`` is passed to ``getCollaborators`` and
    the returned ``(user, count)`` pairs are stored as weighted edges
    ``user -> page owner``.

    SAX callbacks:
      * startElement: called when a tag begins, with its name and attributes
      * endElement: called when a tag ends, with its name
      * characters: called (possibly several times) with the data found
        between the start and the end of a tag

    Results:
      * getNetwork: return the Network built so far
      * getPyNetwork: return a tuple of two lists:
          1. list of strings representing the nodes
          2. list of tuples representing the edges
             (who_edit, who_receive_edit, numbers_of_edit_of_who_edit)
    """

    def __init__(self, lang, xmlsize=None, inputfilename=None,
                 forcedistrust=False, threshold=0, verbose=False):
        """Initialise the parser state.

        lang          -- key into the module-level ``i18n`` table
        xmlsize       -- size of the input, used only for the progress output
        inputfilename -- if given, must contain the substring 'current'
        forcedistrust -- accepted for interface compatibility; not used here
        threshold     -- minimum count a collaborator needs to become an edge
        verbose       -- print progress while parsing (needs ``xmlsize``)
        """
        sax.handler.ContentHandler.__init__(self)
        # lang of wikipedia network
        self.lang = lang
        # name of the element whose character data is being buffered, or False
        self.read = False
        self.validdisc = False  # valid discussion
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.count = 0
        # '' never equals an integer percentage, so the first value is printed
        self.last_perc_print = ''
        self.threshold = threshold
        self.verbose = verbose
        # set parse parameter for this language
        self.i18n = i18n[self.lang]
        # make the comparison case insensitive
        self.i18n = (self.i18n[0].lower(), self.i18n[1].lower(),
                     self.i18n[2].lower())

        self.allusers = set()

        # these three attributes contain the network,
        # the first as XDiGraph
        # the second/third as list of tuple
        self.network = Network()
        self.edges = []
        self.nodes = []

        if inputfilename:
            # NOTE: kept as an assert so callers see the same exception type;
            # be aware that it is skipped when Python runs with -O.
            assert 'current' in inputfilename

    def startElement(self, name, attrs):
        """Start buffering for ``text``/``title``; ignore every other tag."""
        if name == u'text':
            self.read = u'text'
            self.ltext = u''
        elif name == u'title':
            self.read = u'title'
            self.ltitle = u''
            self.lusername = u''
        else:
            # disable loading of contents
            self.read = False

    def endElement(self, name):
        """Consume the buffered data of a just-closed ``text`` or ``title``."""
        if name == u'text' and self.validdisc:
            self.network.add_node(node(self.lusername))
            # The buffer holds the raw character data; trim it once here
            # instead of trimming every SAX chunk (which dropped whitespace
            # inside the text).  See documentation of getCollaborators.
            collaborators = getCollaborators(self.ltext.strip(), self.lang)
            if collaborators:
                self.nodes.append(self.lusername)
                for u, n in collaborators:
                    # only if the number of edits reaches the threshold
                    if n >= self.threshold:

                        try:
                            # an edge already exists: accumulate its value
                            edge = self.network.get_edge(
                                node(u), node(self.lusername))
                            n += int(edge['value'])
                        except NetworkXError:
                            # treated as "no such edge yet"
                            pass

                        self.network.add_node(node(u))
                        self.network.add_edge(node(u), node(self.lusername),
                                              pool({'value': str(n)}))
                        self.edges.append((u, self.lusername, n))

        elif name == u'title':

            # e.g. 'Discussion utente:Paolo-da-skio'
            # Drop any sub-page part, then split namespace from page name.
            title = self.ltitle.strip().split('/')[0].split(':')
            # comparison case insensitive
            title[0] = title[0].lower()

            # if the discussion is in english or in the language of this
            # wiki, and the name of the user is not ''
            if (len(title) > 1) and (
                (title[0] == self.i18n[0]) or
                (title[0] == i18n['en'][0].lower())) and title[1]:
                self.lusername = title[1]
                self.validdisc = True
            else:
                self.validdisc = False

            # True if it is a talk page or user page (local or english)
            if len(title) > 1 and title[0] in (
                    self.i18n[1], self.i18n[0], i18n['en'][0].lower(),
                    i18n['en'][1].lower()) and title[1]:
                self.allusers.add(title[1])

        # The element is closed: stop buffering until the next start tag.
        self.read = False

    def characters(self, contents):
        """Append a chunk of character data to the active buffer.

        The parser may split the data of one element over several calls, so
        ``title`` and ``text`` chunks are stored untouched and trimmed only
        when the element ends.
        """
        if self.read == u'username':
            # never enabled by this class' startElement; left unchanged
            self.lusername += contents.strip()
        elif self.read == u'title':
            self.ltitle += contents
        elif self.read == u'text':
            self.ltext += contents

        # print an approximation of the percentage of computation
        if self.xmlsize and self.verbose:
            self.count += len(contents)
            # floor division keeps the percentage an integer on Python 3 too,
            # so a line is printed only when the percentage really changes
            perc = 100 * self.count // self.xmlsize
            if perc != self.last_perc_print:
                print('>%d%% ~%d%%' % (perc, perc * 100 // 88))
                self.last_perc_print = perc

    def getNetwork(self):
        """Return the Network object built while parsing."""
        return self.network

    def getPyNetwork(self):
        """Return the tuple ``(nodes, edges)`` of plain Python lists."""
        return (self.nodes, self.edges)