def main():
    langs = i18n.keys()
    savepath = path.join(os.environ['HOME'],'.wikixml2graph')
    mkpath(savepath)

    name = '%.4d-%.2d-%.2d' % tuple(time.gmtime()[:3])

    for lang in langs:
        print lang
        users,bots,busers = get_list_users(lang,force=True)
        print len(users),len(bots),len(busers)

        assert save({'lang':lang},(users,bots,busers),path.join(savepath,name))
# Example 2 (示例#2): auto-formatted version of the block above.
def main():
    langs = i18n.keys()
    savepath = path.join(os.environ['HOME'], '.wikixml2graph')
    mkpath(savepath)

    name = '%.4d-%.2d-%.2d' % tuple(time.gmtime()[:3])

    for lang in langs:
        print lang
        users, bots, busers = get_list_users(lang, force=True)
        print len(users), len(bots), len(busers)

        assert save({'lang': lang}, (users, bots, busers),
                    path.join(savepath, name))
def get_list_users(lang,cachepath=None,force=False,verbose=False):
    '''
    Return users, bots and blocked users lists
     - cachepath is a directory
    '''
    url = 'http://%s.wikipedia.org/w/index.php?title=Special:ListUsers&limit=5000' % lang
    if not cachepath:
        cachepath = os.path.join(os.environ['HOME'],'.wikixml2graph','listusers.c2')
    else:
        assert not cachepath.endswith('.c2')
        cachepath = os.path.join(cachepath,'listusers.c2')

    re_user = re.compile('title="%s:[^"]+">([^<]+)'%i18n[lang][1])
    re_bot = re.compile('title="%s:[^"]+">([^<]+)</a>(.*?)</li>'%i18n[lang][1])

    # title="Utente:!! Roberto Valentino !! (pagina inesistente)">!! Roberto Valentino !!</a></li>

    MONTHS = 60*60*24*30
    WEEKS = 60*60*24*7

    users = []
    bots = []
    ll = 1
    pageurl = url
    count = 0
    if verbose:
        print 'Number of users read:'

    while ll:
        if verbose:
            print count

        page = load({'url':pageurl},cachepath)
        if page: t,page = page

        if not page or force or time.time()-t>2*WEEKS or not re.findall(re_user,page):
            page = getpage(pageurl)
            save({'url':pageurl},(time.time(),page),cachepath,version=3)
        newusers = re.findall(re_user,page)
        bots += [x[0] for x in re.findall(re_bot,page) if i18n[lang][2] in x[1]]
        if newusers:
            pageurl = url + '&' + urllib.urlencode({'offset':newusers[-1]})
            #print pageurl
        ll = len(newusers)
        count += ll
        users += newusers

    # get IPBlockList
    url = 'http://%s.wikipedia.org/wiki/Special:IPBlockList?limit=5000' % lang
    re_rows = re.compile('<li>(.*?)</li>')
    re_offset = re.compile('offset=(\d{14})\D+')
    busers = [] #blocked users
    pageurl = url
    
    if verbose:
        print 'Number of blocked users read:'

    while pageurl:
        if verbose:
            print len(busers)

        page = load({'url':pageurl},cachepath)
        if page: t,page = page

        if not page or force or time.time()-t>2*WEEKS or not re.findall(re_rows,page):
            page = getpage(pageurl)
            save({'url':pageurl},(time.time(),page),cachepath)

        for row in re.findall(re_rows,page):
            users12 = re.findall(re_user,row)
            if len(users12)>=2:
                # a user blocked another user
                busers.append(users12[1])

        try:
            newpageurl = url + '&offset=' + min(re.findall(re_offset,page))
            if newpageurl == pageurl:
                pageurl = None
            else:
                pageurl = newpageurl
            #print 'New url:',pageurl
        except ValueError:
            pageurl = None

    return users,bots,busers
# Example 4 (示例#4): auto-formatted version of the block above.
def get_list_users(lang, cachepath=None, force=False, verbose=False):
    '''
    Return users, bots and blocked users lists
     - cachepath is a directory
    '''
    url = 'http://%s.wikipedia.org/w/index.php?title=Special:ListUsers&limit=5000' % lang
    if not cachepath:
        cachepath = os.path.join(os.environ['HOME'], '.wikixml2graph',
                                 'listusers.c2')
    else:
        assert not cachepath.endswith('.c2')
        cachepath = os.path.join(cachepath, 'listusers.c2')

    re_user = re.compile('title="%s:[^"]+">([^<]+)' % i18n[lang][1])
    re_bot = re.compile('title="%s:[^"]+">([^<]+)</a>(.*?)</li>' %
                        i18n[lang][1])

    # title="Utente:!! Roberto Valentino !! (pagina inesistente)">!! Roberto Valentino !!</a></li>

    MONTHS = 60 * 60 * 24 * 30
    WEEKS = 60 * 60 * 24 * 7

    users = []
    bots = []
    ll = 1
    pageurl = url
    count = 0
    if verbose:
        print 'Number of users read:'

    while ll:
        if verbose:
            print count

        page = load({'url': pageurl}, cachepath)
        if page: t, page = page

        if not page or force or time.time() - t > 2 * WEEKS or not re.findall(
                re_user, page):
            page = getpage(pageurl)
            save({'url': pageurl}, (time.time(), page), cachepath, version=3)
        newusers = re.findall(re_user, page)
        bots += [
            x[0] for x in re.findall(re_bot, page) if i18n[lang][2] in x[1]
        ]
        if newusers:
            pageurl = url + '&' + urllib.urlencode({'offset': newusers[-1]})
            #print pageurl
        ll = len(newusers)
        count += ll
        users += newusers

    # get IPBlockList
    url = 'http://%s.wikipedia.org/wiki/Special:IPBlockList?limit=5000' % lang
    re_rows = re.compile('<li>(.*?)</li>')
    re_offset = re.compile('offset=(\d{14})\D+')
    busers = []  #blocked users
    pageurl = url

    if verbose:
        print 'Number of blocked users read:'

    while pageurl:
        if verbose:
            print len(busers)

        page = load({'url': pageurl}, cachepath)
        if page: t, page = page

        if not page or force or time.time() - t > 2 * WEEKS or not re.findall(
                re_rows, page):
            page = getpage(pageurl)
            save({'url': pageurl}, (time.time(), page), cachepath)

        for row in re.findall(re_rows, page):
            users12 = re.findall(re_user, row)
            if len(users12) >= 2:
                # a user blocked another user
                busers.append(users12[1])

        try:
            newpageurl = url + '&offset=' + min(re.findall(re_offset, page))
            if newpageurl == pageurl:
                pageurl = None
            else:
                pageurl = newpageurl
            #print 'New url:',pageurl
        except ValueError:
            pageurl = None

    return users, bots, busers