def mwCurateImageURL(config={}, url=''):
    """ Returns an absolute URL for an image, adding the domain if missing """

    if 'mwindex' in config and config['mwindex']:
        # keep only scheme (http or https) and domain, dropping everything after the first / past the domain
        domainalone = config['mwindex'].split(
            '://')[0] + '://' + config['mwindex'].split('://')[1].split('/')[0]
    elif 'mwapi' in config and config['mwapi']:
        domainalone = config['mwapi'].split(
            '://')[0] + '://' + config['mwapi'].split('://')[1].split('/')[0]
    else:
        sys.stderr.write('ERROR: neither index nor API URL is available')
        sys.exit()

    if url.startswith('//'):  # Orain wikifarm returns URLs starting with //
        url = '%s:%s' % (domainalone.split('://')[0], url)
    # is it a relative URL?
    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):
        if url[0] == '/':  # the slash is added again below
            url = url[1:]
        # concat http(s) + domain + relative url
        url = '%s/%s' % (domainalone, url)
    url = wikiteam.undoHTMLEntities(text=url)
    # url = urllib.unquote(url)  # do not use unquote with url, it breaks some
    # URLs with odd chars
    url = re.sub(' ', '_', url)

    return url
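# A minimal usage sketch for mwCurateImageURL. The domain and the 'mwindex'
# value below are illustrative assumptions, not values taken from a real wiki.
def _exampleCurateImageURL():
    cfg = {'mwindex': 'https://wiki.example.org/index.php'}
    # Protocol-relative URL: the scheme is copied from the index URL
    print(mwCurateImageURL(config=cfg, url='//wiki.example.org/images/a/ab/Foo bar.png'))
    # -> https://wiki.example.org/images/a/ab/Foo_bar.png
    # Relative URL: scheme and domain are prepended, spaces become underscores
    print(mwCurateImageURL(config=cfg, url='/images/a/ab/Foo bar.png'))
    # -> https://wiki.example.org/images/a/ab/Foo_bar.png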
def mwResumePreviousDump(config={}):
    imagenames = []
    sys.stderr.write('Resuming previous dump process...')
    if config['xml']:
        pagetitles = mwReadPageTitles(config=config)
        lasttitle = ''
        try:
            lasttitles = wikiteam.reverseReadline('%s/%s-%s-titles.txt' % (
                config['path'], domain2prefix(config=config), config['date']))
            lasttitle = lasttitles.next()
            if lasttitle == '':
                lasttitle = lasttitles.next()
        except:
            pass  # probably the file does not exist
        if lasttitle == '--END--':
            # title list is complete
            sys.stderr.write('Title list was completed in the previous session')
        else:
            sys.stderr.write('Title list is incomplete. Reloading...')
            # do not resume; reload to avoid inconsistencies (deleted pages and so on)
            pagetitles = mwGetPageTitles(config=config)
            wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
        # checking the XML dump
        xmliscomplete = False
        lastxmltitle = None
        try:
            f = wikiteam.reverseReadline(
                '%s/%s-%s-%s.xml' % (config['path'],
                                     domain2prefix(config=config),
                                     config['date'],
                                     config['curonly'] and 'current' or 'history'),
            )
            for l in f:
                if l == '</mediawiki>':
                    # xml dump is complete
                    xmliscomplete = True
                    break
                xmltitle = re.search(r'<title>([^<]+)</title>', l)
                if xmltitle:
                    lastxmltitle = wikiteam.undoHTMLEntities(text=xmltitle.group(1))
                    break
        except:
            pass  # probably the file does not exist
        if xmliscomplete:
            sys.stderr.write('XML dump was completed in the previous session')
        elif lastxmltitle:
            # resuming...
            sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle))
            pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
            mwGenerateXMLDump(
                config=config, pagetitles=pagetitles, start=lastxmltitle)
        else:
            # corrupt? only has the XML header?
            sys.stderr.write('XML is corrupt? Regenerating...')
            pagetitles = mwReadPageTitles(config=config)
            mwGenerateXMLDump(config=config, pagetitles=pagetitles)

    if config['images']:
        # load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' % (
                config['path'], domain2prefix(config=config), config['date']), 'r')
            raw = f.read().strip()
            lines = raw.split('\n')
            for l in lines:
                if re.search(r'\t', l):
                    imagenames.append(l.split('\t'))
            lastimage = lines[-1]
            f.close()
        except:
            pass  # probably the file does not exist
        if lastimage == '--END--':
            sys.stderr.write('Image list was completed in the previous session')
        else:
            sys.stderr.write('Image list is incomplete. Reloading...')
            # do not resume; reload to avoid inconsistencies (deleted images and so on)
            imagenames = mwGetImageNames(config=config)
            saveImageNames(config=config, imagenames=imagenames)
        # checking the images directory
        listdir = []
        try:
            listdir = [n.decode('utf-8') for n in os.listdir('%s/images' % (config['path']))]
        except:
            pass  # probably the directory does not exist
        listdir.sort()
        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, url, uploader in imagenames:
            lastfilename2 = lastfilename
            # always keep the complete filename, not the truncated one
            lastfilename = filename
            filename2 = filename
            # FIXME: 'other' is not passed to this function; the filename limit
            # should probably come from the caller's config/other dict
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in listdir:
                complete = False
                break
            c += 1
        sys.stderr.write('%d images were found in the directory from a previous session' % (c))
        if complete:
            # image dump is complete
            sys.stderr.write('Image dump was completed in the previous session')
        else:
            # resume from the previous image, which may be corrupted (or missing
            # its .desc) because the previous session was aborted (Ctrl-C or similar)
            mwGenerateImageDump(
                config=config, imagenames=imagenames, start=lastfilename2)

    if config['logs']:
        # fix
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)
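# A sketch of how mwResumePreviousDump is meant to be driven. The config keys
# and sample values below are assumptions for illustration; in the real tool the
# config dict is built elsewhere, and keys such as the index/API URLs are omitted here.
def _exampleResumePreviousDump():
    cfg = {
        'path': 'dumps/wikiexampleorg',  # directory holding the partial dump
        'date': '20160101',              # date tag used in the dump filenames
        'curonly': False,                # full-history dump
        'xml': True,                     # resume the XML dump
        'images': True,                  # resume the image dump
        'logs': False,
        'retries': 5,
    }
    # The resume logic inspects '<prefix>-<date>-titles.txt',
    # '<prefix>-<date>-history.xml' (or -current.xml) and
    # '<prefix>-<date>-images.txt', using the '--END--' sentinel and the closing
    # </mediawiki> tag to decide whether each part is already complete.
    mwResumePreviousDump(config=cfg)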
def mwGetPageTitlesScraper(config={}):
    """ Scrape the list of page titles from Special:Allpages """
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    for namespace in namespaces:
        sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia for Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)
                    # do not put urllib.quote in fr or to
                    # FIXME: does this regexp miss some subpages, or does r_title
                    # fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, which sometimes appears
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # avoid reloading duplicate subpage links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after the junk has been removed
                    sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages' % (
                        name,
                        len(raw2),
                        len(re.findall(r_suballpages, raw2)),
                        len(re.findall(r_title, raw2))))
                wikiteam.delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write(' %d titles retrieved in the namespace %d\n' % (c, namespace))
    return pagetitles
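# Illustrative call to the Special:Allpages scraper. The index URL and the
# 'delay' key are assumptions for the sketch; in the real tool they come from
# the user-supplied config.
def _exampleGetPageTitlesScraper():
    cfg = {'index': 'https://wiki.example.org/index.php', 'delay': 0}
    pagetitles = mwGetPageTitlesScraper(config=cfg)
    # Titles come back entity-decoded and deduplicated, with Special: pages excluded
    for t in pagetitles[:10]:
        sys.stderr.write('%s\n' % (t))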
def mwGetImageNamesScraper(config={}):
    """ Retrieve the file list: filename, url, uploader """
    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&'
    imagenames = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # a limit of 5000 overloads some servers, but it is needed for sites with
        # no "next" links, like this one:
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data = {
            'title': 'Special:Imagelist',
            'limit': limit,
            'offset': offset}
        raw = wikiteam.getURL(url=config['index'], data=data)
        # handleStatusCode(r)
        wikiteam.delay(config=config)  # delicate wiki
        if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit))
                limit = limit / 10
                continue
            elif retries > 0:  # waste the retries, then exit
                retries -= 1
                sys.stderr.write('Retrying...')
                continue
            else:
                sys.stderr.write('No more retries, exit...')
                break

        raw = mwCleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
            r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns the most results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iterate over the image results
        for i in m:
            url = i.group('url')
            url = mwCurateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = wikiteam.undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = wikiteam.undoHTMLEntities(text=uploader)
            uploader = urllib.unquote(uploader)
            imagenames.append([filename, url, uploader])

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid an infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with an offset
            else:
                offset = ''
        else:
            offset = ''

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image')
    else:
        sys.stderr.write(' Found %d images' % (len(imagenames)))

    imagenames.sort()
    return imagenames
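# Sketch of what mwGetImageNamesScraper returns and how the resume logic above
# expects it on disk: one tab-separated "filename<TAB>url<TAB>uploader" line per
# image, terminated by the '--END--' sentinel. The output file name and config
# values are illustrative assumptions; the real tool writes this list via its
# own save helper.
def _exampleSaveImageNames():
    cfg = {'index': 'https://wiki.example.org/index.php', 'retries': 5, 'delay': 0}
    imagenames = mwGetImageNamesScraper(config=cfg)
    f = open('example-images.txt', 'w')
    for filename, url, uploader in imagenames:
        f.write('%s\t%s\t%s\n' % (filename, url, uploader))
    f.write('--END--')
    f.close()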