def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['api']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, do not overwrite')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(url=config['api'], data={
                'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                'sinumberingroup': 1,
                'format': 'json'})
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                    'format': 'json'})
            # MediaWiki 1.8-1.10
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces',
                    'format': 'json'})
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))

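# Usage sketch (hedged): mwSaveSiteInfo only needs an API endpoint, an output
# directory that already exists, and whatever wikiteam.delay() reads from
# config. The keys and values below are illustrative assumptions, not the
# canonical config schema built by the wikiteam front end.
def _exampleSaveSiteInfo():
    config = {
        'api': 'https://example.org/w/api.php',  # hypothetical API endpoint
        'path': 'dumps/examplewiki',             # hypothetical, pre-created output dir
        'delay': 0,                              # assumed key used by wikiteam.delay()
    }
    mwSaveSiteInfo(config=config)  # writes dumps/examplewiki/siteinfo.json
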
def mwGetIndex(config={}):
    """ Returns index.php for a MediaWiki wiki, if available """

    if config['mwapi']:
        mwapi = config['mwapi']
    else:
        mwapi = mwGetAPI(config=config)

    index = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(
        r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
    if m:
        index = m[0]
    else:
        m = re.findall(
            r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
        if m:
            index = m[0]

    if index:
        if index.startswith('/'):
            index = '/'.join(mwapi.split('/')[:-1]) + '/' + index.split('/')[-1]
    else:
        if mwapi:
            if len(re.findall(r'/index\.php5\?', html)) > len(re.findall(r'/index\.php\?', html)):
                index = '/'.join(mwapi.split('/')[:-1]) + '/index.php5'
            else:
                index = '/'.join(mwapi.split('/')[:-1]) + '/index.php'
    return index

def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespace names and ids """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {
            'action': 'query',
            'meta': 'siteinfo',
            'siprop': 'namespaces',
            'format': 'json'}
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # Skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames

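# Usage sketch (hedged): 'namespaces' accepts either 'all' or a list of
# numeric ids; the endpoint and delay keys below are illustrative assumptions.
def _exampleGetNamespaces():
    config = {
        'mwapi': 'https://example.org/w/api.php',  # hypothetical API endpoint
        'namespaces': ['all'],                     # or e.g. [0, 4, 6]
        'delay': 0,                                # assumed key used by wikiteam.delay()
    }
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    # namespaces -> e.g. [0, 1, 2, ...]
    # namespacenames -> e.g. {0: '', 1: 'Talk', 2: 'User', ...}
    return namespaces, namespacenames
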
def mwGetNamespacesScraper(config={}):
    """ Hackishly gets the list of namespace names and ids from the dropdown
    in the HTML of Special:AllPages. Called when no API is available """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Allpages'})
        wikiteam.delay(config=config)

        # [^>]*? to include selected="selected"
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))  # was std.stderr, a typo
    return namespaces, namespacenames

def mwGetXMLPageCore(config={}, data={}):
    """ Returns an XML string containing data['limit'] revisions (or current only), ending in </mediawiki>.
    If retrieving data['limit'] revisions fails, returns the current-only version.
    If all attempts fail, returns an empty string """

    xml = ''
    cretries = 0
    maxseconds = 100  # max seconds to wait in a single sleep
    maxretries = config['retries']  # x retries and exit
    increment = 20  # increment seconds every retry

    while not re.search(r'</mediawiki>', xml):
        if cretries > 0 and cretries < maxretries:
            wait = min(increment * cretries, maxseconds)  # incremental backoff, capped at maxseconds
            sys.stderr.write(' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (cretries, data['pages'], wait))
            time.sleep(wait)
            # reducing server load requesting smaller chunks (if curonly then
            # limit = 1 from mother function)
            if data['limit'] > 1:
                data['limit'] = data['limit'] // 2  # half
        if cretries >= maxretries:
            sys.stderr.write(' We have retried %d times\n' % (cretries))
            sys.stderr.write(' MediaWiki error for "%s", probably network error...' % (data['pages']))
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # data['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # mwGetXMLPageCore
            if not config['curonly'] and 'curonly' not in data:
                sys.stderr.write(' Trying to save only the last revision for this page...\n')
                data['curonly'] = 1
                logerror(
                    config=config,
                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (data['pages']))
                return mwGetXMLPageCore(config=config, data=data)
            else:
                sys.stderr.write(' Saving in error log, skipping...\n')
                logerror(
                    config=config,
                    text='Error while retrieving last revision of "%s". Skipping.\n' % (data['pages']))
                raise ExportAbortedError(config['index'])
                return ''  # empty xml

        # FIXME HANDLE HTTP Errors HERE
        try:
            r = wikiteam.getURL(url=config['index'], data=data)
            #handleStatusCode(r)
            #r = fixBOM(r)
            xml = fixBOM(r)
        except Exception:
            sys.stderr.write(' Connection error\n')
            xml = ''
        cretries += 1

    return xml

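# Behaviour sketch (hedged, illustrative helper only): with increment=20 and
# maxseconds=100 as above, the wait grows 20, 40, 60, 80, 100, 100, ... while
# the requested revision chunk is halved on every failed attempt.
def _exampleBackoffTrace(increment=20, maxseconds=100, limit=500, attempts=6):
    for cretries in range(1, attempts + 1):
        wait = min(increment * cretries, maxseconds)
        if limit > 1:
            limit = limit // 2
        print('attempt %d: wait %d s, next limit %d' % (cretries, wait, limit))
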
def mwSaveIndexPHP(config={}):
    """ Save index.php as .html, to preserve license details available at the bottom of the page """

    if os.path.exists('%s/index.html' % (config['path'])):
        sys.stderr.write('index.html exists, do not overwrite')
    else:
        sys.stderr.write('Downloading index.php (Main Page) as index.html')
        raw = wikiteam.getURL(url=config['index'], data={})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/index.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)

def mwSaveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extension details """

    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        sys.stderr.write('Special:Version.html exists, do not overwrite')
    else:
        sys.stderr.write('Downloading Special:Version with extensions and other related info')
        raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Version'})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)

def mwGetAPI(config={}):
    """ Returns the API URL for a MediaWiki wiki, if available """

    api = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(
        r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
        html)
    if m:
        api = m[0]
    if api.startswith('//'):  # gentoo wiki and others: protocol-relative EditURI
        api = config['wiki'].split('//')[0] + api  # was "url.split(...)", an undefined name
    return api

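# Usage sketch (hedged): a typical discovery flow, filling in the API and
# index.php entry points from the wiki's main URL. The config keys and URL
# are assumptions based on what mwGetAPI/mwGetIndex read.
def _exampleDiscoverEndpoints():
    config = {
        'wiki': 'https://example.org/wiki/Main_Page',  # hypothetical wiki URL
        'mwapi': '',
    }
    config['mwapi'] = mwGetAPI(config=config)
    config['index'] = mwGetIndex(config=config)
    return config
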
def mwGetPageTitlesScraper(config={}):
    """ Scrape the list of page titles from Special:Allpages """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    for namespace in namespaces:
        sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia for Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')

                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)
                    # do not put urllib.quote in fr or to
                    # FIXME: does this regexp miss some entries, or does
                    # r_title fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, sometimes happens
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reloading duplicate subpage links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after junk was removed
                    sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages\n' % (
                        name,
                        len(raw2),
                        len(re.findall(r_suballpages, raw2)),
                        len(re.findall(r_title, raw2))))
                wikiteam.delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write(' %d titles retrieved in the namespace %d\n' % (c, namespace))
    return pagetitles

def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles (generator) """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write(' Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write(' Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}

            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write("Connection error: %s\n" % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            #wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']
            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)

            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                pagetitles.append(page['title'])  # track titles so the dupe check below works
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

            wikiteam.delay(config=config)
        sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))

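# Usage sketch (hedged): mwGetPageTitlesAPI is a generator, so titles stream
# out one by one. The config keys shown are assumptions based on what the
# function and its helpers read ('mwapi', 'namespaces', 'exnamespaces',
# 'retries', plus whatever wikiteam.delay() expects).
def _exampleListTitles():
    config = {
        'mwapi': 'https://example.org/w/api.php',  # hypothetical endpoint
        'namespaces': ['all'],
        'exnamespaces': [],
        'retries': 5,
        'delay': 0,
    }
    for pagetitle in mwGetPageTitlesAPI(config=config):
        print(pagetitle)
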
def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """

    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        #handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    # to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
                else:
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query.
            # In this case use allpages (in namespace 6) as a generator for imageinfo.
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            #handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image')
    else:
        sys.stderr.write(' Found %d images' % (len(imagenames)))
    return imagenames

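# Usage sketch (hedged): each entry is a [filename, url, uploader] triple.
# Note that in the modern-API branch the filename is ascii-encoded bytes
# (the issue8136 workaround above), while the old-API branch returns str.
# The config keys below are assumptions.
def _exampleListImages():
    config = {
        'mwapi': 'https://example.org/w/api.php',  # hypothetical endpoint
        'delay': 0,                                # assumed key used by wikiteam.delay()
    }
    for filename, url, uploader in mwGetImageNamesAPI(config=config):
        print(filename, url, uploader)
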
def mwGetImageNamesScraper(config={}):
    """ Retrieve file list: filename, url, uploader """

    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&'
    imagenames = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # 5000 overloads some servers, but it is needed for sites like this one
        # with no "next" links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data = {
            'title': 'Special:Imagelist',
            'limit': limit,
            'offset': offset}
        raw = wikiteam.getURL(url=config['index'], data=data)
        #handleStatusCode(r)
        wikiteam.delay(config=config)

        # delicate wiki
        if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit))
                limit = limit // 10  # integer division (was "/", a float under Python 3)
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                sys.stderr.write('Retrying...')
                continue
            else:
                sys.stderr.write('No more retries, exit...')
                break

        raw = mwCleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
            r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns more results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iterate over the image results
        for i in m:
            url = i.group('url')
            url = mwCurateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = wikiteam.undoHTMLEntities(text=filename)
            filename = urllib.parse.unquote(filename)  # was urllib.unquote (Python 2)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = wikiteam.undoHTMLEntities(text=uploader)
            uploader = urllib.parse.unquote(uploader)  # was urllib.unquote (Python 2)
            imagenames.append([filename, url, uploader])

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with offset
            else:
                offset = ''
        else:
            offset = ''

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image')
    else:
        sys.stderr.write(' Found %d images' % (len(imagenames)))
    imagenames.sort()
    return imagenames
