def test_fetch(self):
    """Test that the data parameter and the body parameter produce the same result."""
    r_data = http.fetch(uri=self.get_httpbin_url('/post'), method='POST',
                        data={'fish&chips': 'delicious'})
    r_body = http.fetch(uri=self.get_httpbin_url('/post'), method='POST',
                        body={'fish&chips': 'delicious'})
    self.assertDictEqual(json.loads(r_data.content),
                         json.loads(r_body.content))
def test_follow_redirects(self):
    """Test that 301 redirects are followed correctly."""
    # The following will redirect from ' ' -> '_', and maybe to https://
    r = http.fetch(uri='http://en.wikipedia.org/wiki/Main%20Page')
    self.assertEqual(r.status, 200)
    self.assertIsNotNone(r.data.history)
    self.assertIn('//en.wikipedia.org/wiki/Main_Page', r.data.url)

    r = http.fetch(uri='http://www.gandi.eu')
    self.assertEqual(r.status, 200)
    self.assertEqual(r.data.url, 'http://www.gandi.net')
def _parse_post_117(self):
    """Parse 1.17+ siteinfo data."""
    response = fetch(self.api + '?action=query&meta=siteinfo&format=json')
    check_response(response)
    # remove leading newlines and Byte Order Mark (BOM), see T128992
    content = response.text.strip().lstrip('\uFEFF')
    info = json.loads(content)
    self.private_wiki = ('error' in info
                         and info['error']['code'] == 'readapidenied')
    if self.private_wiki:
        # user-config.py is not loaded because PYWIKIBOT_NO_USER_CONFIG
        # is set to '2' by generate_family_file.py.
        # Prepare a temporary config for login.
        username = pywikibot.input(
            'Private wiki detected. Login is required.\n'
            'Please enter your username?')
        config.usernames['temporary_family'] = {'temporary_code': username}
        # Setup a dummy family so that we can create a site object
        fam = pywikibot.Family()
        fam.name = 'temporary_family'
        fam.scriptpath = lambda code: self.api[:-8]  # without /api.php
        fam.langs = {'temporary_code': self.server}
        site = pywikibot.Site('temporary_code', fam)
        site.version = lambda: str(self.version)
        # Now the site object is able to log in
        info = site.siteinfo
    else:
        info = info['query']['general']

    self.version = MediaWikiVersion.from_generator(info['generator'])
    if self.version < MediaWikiVersion('1.17'):
        return

    self.server = urljoin(self.fromurl, info['server'])
    for item in ['scriptpath', 'articlepath', 'lang']:
        setattr(self, item, info[item])
def getDataFromHost(self, queryStr):
    """
    Go and fetch a query from the host's API.

    @rtype: dict
    """
    url = self.getUrl(queryStr)

    try:
        resp = http.fetch(url)
    except Exception:
        pywikibot.warning(u"Failed to retrieve %s" % url)
        raise

    data = resp.content
    if not data:
        pywikibot.warning('No data received for %s' % url)
        raise pywikibot.ServerError('No data received for %s' % url)

    try:
        data = json.loads(data)
    except ValueError:
        pywikibot.warning(
            'Data received for %s but no JSON could be decoded: %r'
            % (url, data))
        raise pywikibot.ServerError(
            'Data received for %s but no JSON could be decoded: %r'
            % (url, data))
    return data
def langs(self):
    """Build interwikimap."""
    response = fetch(
        self.api
        + "?action=query&meta=siteinfo&siprop=interwikimap"
          "&sifilteriw=local&format=json")
    iw = json.loads(response.content)
    if "error" in iw:
        raise RuntimeError("%s - %s" % (iw["error"]["code"],
                                        iw["error"]["info"]))
    return [wiki for wiki in iw["query"]["interwikimap"]
            if "language" in wiki]
def _ocr_callback(self, cmd_uri, parser_func=None):
    """OCR callback function.

    @return: tuple (error, text [error description in case of error]).
    """
    def id(x):
        return x

    if not cmd_uri:
        raise ValueError('Parameter cmd_uri is mandatory.')

    if parser_func is None:
        parser_func = id

    if not callable(parser_func):
        raise TypeError('Keyword parser_func must be callable.')

    # A wrong link fails with an exception.
    try:
        response = http.fetch(cmd_uri, charset='utf-8')
    except Exception as e:
        pywikibot.error('Querying %s: %s' % (cmd_uri, e))
        return (True, e)

    data = json.loads(response.content)

    assert 'error' in data, 'Error from phe-tools: %s' % data
    assert data['error'] in [0, 1], 'Error from phe-tools: %s' % data

    error = bool(data['error'])
    if error:
        pywikibot.error('Querying %s: %s' % (cmd_uri, data['text']))
        return (error, data['text'])
    else:
        return (error, parser_func(data['text']))
def github_svn_rev2hash(tag, rev):
    """Convert a Subversion revision to a Git hash using Github.

    @param tag: name of the Subversion repo on Github
    @param rev: Subversion revision identifier
    @return: the git hash and the commit date
    @rtype: tuple
    """
    from io import StringIO
    import xml.dom.minidom

    from pywikibot.comms import http

    uri = 'https://github.com/wikimedia/%s/!svn/vcc/default' % tag
    request = http.fetch(uri=uri, method='PROPFIND',
                         body="<?xml version='1.0' encoding='utf-8'?>"
                              "<propfind xmlns=\"DAV:\"><allprop/></propfind>",
                         headers={'label': str(rev),
                                  'user-agent': 'SVN/1.7.5 {pwb}'})
    data = request.content

    dom = xml.dom.minidom.parse(StringIO(data))
    hsh = dom.getElementsByTagName("C:git-commit")[0].firstChild.nodeValue
    date = dom.getElementsByTagName("S:date")[0].firstChild.nodeValue
    date = time.strptime(date[:19], '%Y-%m-%dT%H:%M:%S')
    return hsh, date
def getWebCitationURL(url, timestamp=None):
    """Return archived URL by Web Citation.

    See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
    for more details.

    @param url: url to search an archived version for
    @param timestamp: requested archive date. The version closest to that
        moment is returned. Format: YYYYMMDDhhmmss or part thereof.
    """
    uri = u'http://www.webcitation.org/query?'

    query = {'returnxml': 'true',
             'url': url}

    if timestamp is not None:
        query['date'] = timestamp

    uri = uri + urlencode(query)
    xmltext = http.fetch(uri).content
    if "success" in xmltext:
        data = ET.fromstring(xmltext)
        return data.find('.//webcite_url').text
    else:
        return None
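# Hedged usage sketch for the WebCite helper above (not part of the original
# module): it assumes getWebCitationURL is importable from its defining
# module, and the target URL and timestamp are illustrative values only.
archived = getWebCitationURL('https://www.example.org/', timestamp='20150101')
if archived:
    print('WebCite snapshot: ' + archived)
else:
    print('No WebCite snapshot found.')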
def test_no_params(self):
    """Test fetch method with no parameters."""
    r = http.fetch(uri=self.get_httpbin_url('/get'), params={})
    self.assertEqual(r.status, 200)

    content = json.loads(r.content)
    self.assertDictEqual(content['args'], {})
def test_https_cert_error(self):
    """Test if http.fetch respects disable_ssl_certificate_validation."""
    self.assertRaisesRegex(
        pywikibot.FatalServerError,
        self.CERT_VERIFY_FAILED_RE,
        http.fetch,
        uri='https://testssl-expire-r2i2.disig.sk/index.en.html')
    http.session.close()  # clear the connection

    with warnings.catch_warnings(record=True) as warning_log:
        response = http.fetch(
            uri='https://testssl-expire-r2i2.disig.sk/index.en.html',
            disable_ssl_certificate_validation=True)
    r = response.content
    self.assertIsInstance(r, unicode)
    self.assertTrue(re.search(r'<title>.*</title>', r))
    http.session.close()  # clear the connection

    # Verify that it now fails again
    self.assertRaisesRegex(
        pywikibot.FatalServerError,
        self.CERT_VERIFY_FAILED_RE,
        http.fetch,
        uri='https://testssl-expire-r2i2.disig.sk/index.en.html')
    http.session.close()  # clear the connection

    # Verify that the warning occurred
    self.assertIn('InsecureRequestWarning',
                  [w.category.__name__ for w in warning_log])
def getOpenStreetMap(latitude, longitude):
    """
    Get the result from https://nominatim.openstreetmap.org/reverse .

    @rtype: list of strings
    """
    result = []
    gotInfo = False
    parameters = urlencode({'lat': latitude, 'lon': longitude,
                            'accept-language': 'en'})
    while not gotInfo:
        try:
            page = fetch(
                'https://nominatim.openstreetmap.org/reverse?format=xml&%s'
                % parameters)
            et = xml.etree.ElementTree.fromstring(page.content)
            gotInfo = True
        except IOError:
            pywikibot.output(u'Got an IOError, let\'s try again')
            time.sleep(30)
        except socket.timeout:
            pywikibot.output(u'Got a timeout, let\'s try again')
            time.sleep(30)

    validParts = [u'hamlet', u'village', u'city', u'county', u'country']
    invalidParts = [u'path', u'road', u'suburb', u'state', u'country_code']
    addressparts = et.find('addressparts')

    for addresspart in addressparts.getchildren():
        if addresspart.tag in validParts:
            result.append(addresspart.text)
        elif addresspart.tag in invalidParts:
            pywikibot.output(u'Dropping %s, %s'
                             % (addresspart.tag, addresspart.text))
        else:
            pywikibot.warning('%s, %s is not in addressparts lists'
                              % (addresspart.tag, addresspart.text))
    return result
def getInternetArchiveURL(url, timestamp=None):
    """Return archived URL by Internet Archive.

    See [[:mw:Archived Pages]] and https://archive.org/help/wayback_api.php
    for more details.

    @param url: url to search an archived version for
    @param timestamp: requested archive date. The version closest to that
        moment is returned. Format: YYYYMMDDhhmmss or part thereof.
    """
    import json
    uri = u'https://archive.org/wayback/available?'

    query = {'url': url}

    if timestamp is not None:
        query['timestamp'] = timestamp

    uri = uri + urlencode(query)
    jsontext = http.fetch(uri).content
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
def test_follow_redirects(self):
    """Test that 301 redirects are followed correctly after an exception."""
    # to be effective, this exception should be raised in httplib2
    self.assertRaises(Exception, http.fetch, uri='invalid://url')

    # The following will redirect from ' ' -> '_', and maybe to https://
    r = http.fetch(uri='http://en.wikipedia.org/wiki/Main%20Page')
    self.assertEqual(r.status, 200)
    self.assertIn('//en.wikipedia.org/wiki/Main_Page',
                  r.response_headers['content-location'])

    r = http.fetch(uri='http://www.gandi.eu')
    self.assertEqual(r.status, 200)
    self.assertEqual(r.response_headers['content-location'],
                     'http://www.gandi.net')
def get_image_from_image_page(imagePage):
    """Get the image object to work on, based on an imagePage object."""
    imageBuffer = None
    imageURL = imagePage.fileUrl()
    imageURLopener = http.fetch(imageURL)
    imageBuffer = io.BytesIO(imageURLopener.raw[:])
    image = Image.open(imageBuffer)
    return image
def test_fetch(self):
    """Test http.fetch using http://www.wikipedia.org/."""
    r = http.fetch('http://www.wikipedia.org/')
    self.assertIsInstance(r, threadedhttp.HttpRequest)
    self.assertEqual(r.status, 200)
    self.assertIn('<html lang="mul"', r.content)
    self.assertIsInstance(r.content, unicode)
    self.assertIsInstance(r.raw, bytes)
def test_tools_path(self):
    """Test tools path."""
    if '?' in tool:
        self.skipTest('"{0}" is a regex!'.format(tool))
    path = 'http://tools.wmflabs.org/%s?user=%s' % (tool, 'xqt')
    request = fetch(path)
    self.assertIn(request.status, (200, 207),
                  'Http response status {0} for "{1}"'
                  ''.format(request.data.status_code, tool))
def test_follow_redirects(self):
    """Test that 301 redirects are followed correctly after an exception."""
    # It doesn't matter what exception is raised here, provided it
    # occurs within the httplib2 request method.
    self.assertRaises(KeyError, http.fetch, uri='invalid://url')

    # The following will redirect from ' ' -> '_', and maybe to https://
    r = http.fetch(uri='http://en.wikipedia.org/wiki/Main%20Page')
    self.assertEqual(r.status, 200)
    self.assertIn('//en.wikipedia.org/wiki/Main_Page',
                  r.response_headers['content-location'])

    r = http.fetch(uri='http://www.gandi.eu')
    self.assertEqual(r.status, 200)
    self.assertEqual(r.response_headers['content-location'],
                     'http://www.gandi.net')
def downloadPhoto(photoUrl=''):
    """
    Download the photo and store it in an io.BytesIO object.

    TODO: Add exception handling
    """
    imageFile = fetch(photoUrl).raw
    return io.BytesIO(imageFile)
def downloadPhoto(self):
    """
    Download the photo and store it in an io.BytesIO object.

    TODO: Add exception handling
    """
    if not self.contents:
        imageFile = fetch(self.URL).raw
        self.contents = io.BytesIO(imageFile)
    return self.contents
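# Both downloadPhoto variants above leave "Add exception handling" as a TODO.
# A minimal sketch of one possible approach (not the project's actual
# solution): retry on the same transient errors the neighbouring helpers
# already catch. The helper name, retry count and delay are illustrative.
import io
import socket
import time

from pywikibot.comms.http import fetch


def fetch_photo_with_retries(photo_url, retries=3, delay=30):
    """Return an io.BytesIO with the photo, retrying on transient errors."""
    for attempt in range(retries):
        try:
            return io.BytesIO(fetch(photo_url).raw)
        except (IOError, socket.timeout):
            if attempt == retries - 1:
                raise
            time.sleep(delay)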
def getFlinfoDescription(photo_id=0):
    """
    Get the description from http://wikipedia.ramselehof.de/flinfo.php.

    TODO: Add exception handling, try a couple of times
    """
    parameters = urlencode({'id': photo_id, 'raw': 'on'})
    return fetch(
        'http://wikipedia.ramselehof.de/flinfo.php?%s' % parameters).content
def setUpClass(cls): """ Set up the test class. Prevent tests running if the host is down. """ super(CheckHostnameMixin, cls).setUpClass() if not hasattr(cls, 'sites'): return for key, data in cls.sites.items(): if 'hostname' not in data: raise Exception('%s: hostname not defined for %s' % (cls.__name__, key)) hostname = data['hostname'] if hostname in cls._checked_hostnames: if isinstance(cls._checked_hostnames[hostname], Exception): raise unittest.SkipTest( '%s: hostname %s failed (cached): %s' % (cls.__name__, hostname, cls._checked_hostnames[hostname])) elif cls._checked_hostnames[hostname] is False: raise unittest.SkipTest('%s: hostname %s failed (cached)' % (cls.__name__, hostname)) else: continue e = None try: if '://' not in hostname: hostname = 'http://' + hostname r = http.fetch(uri=hostname, default_error_handling=False) if r.exception: e = r.exception else: if r.status not in [200, 301, 302, 303, 307, 308]: raise ServerError('HTTP status: %d' % r.status) r.content # default decode may raise exception except Exception as e2: pywikibot.error('%s: accessing %s caused exception:' % (cls.__name__, hostname)) pywikibot.exception(e2, tb=True) e = e2 pass if e: cls._checked_hostnames[hostname] = e raise unittest.SkipTest( '%s: hostname %s failed: %s' % (cls.__name__, hostname, e)) cls._checked_hostnames[hostname] = True
def _parse_post_117(self):
    """Parse 1.17+ siteinfo data."""
    response = fetch(self.api + '?action=query&meta=siteinfo&format=json')
    info = json.loads(response.content)['query']['general']
    self.version = MediaWikiVersion.from_generator(info['generator'])
    if self.version < MediaWikiVersion('1.17'):
        return

    self.server = urljoin(self.fromurl, info['server'])
    for item in ['scriptpath', 'articlepath', 'lang']:
        setattr(self, item, info[item])
def langs(self):
    """Build interwikimap."""
    response = fetch(
        self.api
        + "?action=query&meta=siteinfo&siprop=interwikimap"
          "&sifilteriw=local&format=json")
    iw = json.loads(response.content)
    if 'error' in iw:
        raise RuntimeError('{0!s} - {1!s}'.format(iw['error']['code'],
                                                  iw['error']['info']))
    return [wiki for wiki in iw['query']['interwikimap']
            if u'language' in wiki]
def test_unencoded_params(self):
    """
    Test fetch method with unencoded parameters, which should be encoded internally.

    HTTPBin returns the args in their urldecoded form, so what we put in
    should be the same as what we get out.
    """
    r = http.fetch(uri=self.get_httpbin_url('/get'),
                   params={'fish&chips': 'delicious'})
    self.assertEqual(r.status, 200)

    content = json.loads(r.content)
    self.assertDictEqual(content['args'], {'fish&chips': 'delicious'})
def langs(self):
    """Build interwikimap."""
    response = fetch(
        self.api + '?action=query&meta=siteinfo&siprop=interwikimap'
        '&sifilteriw=local&format=json')
    iw = json.loads(response.text)
    if 'error' in iw:
        raise RuntimeError('%s - %s' % (iw['error']['code'],
                                        iw['error']['info']))
    return [wiki for wiki in iw['query']['interwikimap']
            if 'language' in wiki]
def _test_fetch_use_fake_user_agent(self):
    """Test `use_fake_user_agent` argument of http.fetch."""
    # Existing headers
    r = http.fetch(
        self.get_httpbin_url('/status/200'),
        headers={'user-agent': 'EXISTING'})
    self.assertEqual(r.headers['user-agent'], 'EXISTING')

    # Argument value changes
    r = http.fetch(self.get_httpbin_url('/status/200'),
                   use_fake_user_agent=True)
    self.assertNotEqual(r.headers['user-agent'], http.user_agent())
    r = http.fetch(self.get_httpbin_url('/status/200'),
                   use_fake_user_agent=False)
    self.assertEqual(r.headers['user-agent'], http.user_agent())
    r = http.fetch(
        self.get_httpbin_url('/status/200'), use_fake_user_agent='ARBITRARY')
    self.assertEqual(r.headers['user-agent'], 'ARBITRARY')

    # Manually overridden domains
    config.fake_user_agent_exceptions = {
        self.get_httpbin_hostname(): 'OVERRIDDEN'}
    r = http.fetch(
        self.get_httpbin_url('/status/200'), use_fake_user_agent=False)
    self.assertEqual(r.headers['user-agent'], 'OVERRIDDEN')
def __init__(self, fromurl): """ Constructor. @raises ServerError: a server error occurred while loading the site @raises Timeout: a timeout occurred while loading the site @raises RuntimeError: Version not found or version less than 1.14 """ if fromurl.endswith("$1"): fromurl = fromurl[:-2] r = fetch(fromurl) if r.status == 503: raise ServerError('Service Unavailable') if fromurl != r.data.url: pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url)) fromurl = r.data.url self.fromurl = fromurl data = r.content wp = WikiHTMLPageParser(fromurl) wp.feed(data) self.version = wp.version self.server = wp.server self.scriptpath = wp.scriptpath self.articlepath = None try: self._parse_pre_117(data) except Exception as e: pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e)) if self.api: try: self._parse_post_117() except Exception as e: pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e)) if not self.version: self._fetch_old_version() if not self.api: raise RuntimeError('Unsupported url: {0}'.format(self.fromurl)) if (not self.version or self.version < MediaWikiVersion('1.14')): raise RuntimeError('Unsupported version: {0}'.format(self.version))
def getversion_onlinerepo():
    """Retrieve current framework git hash from Gerrit."""
    from pywikibot.comms import http

    url = ('https://gerrit.wikimedia.org/r/projects/'
           'pywikibot%2Fcore/branches/master')
    # Gerrit API responses include )]}' at the beginning,
    # make sure to strip it out
    buf = http.fetch(uri=url, headers={'user-agent': '{pwb}'}).text[4:]
    try:
        hsh = json.loads(buf)['revision']
        return hsh
    except Exception as e:
        raise ParseError(repr(e) + ' while parsing ' + repr(buf))
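# Why the ``[4:]`` slice above works: Gerrit prefixes its JSON REST responses
# with the magic string )]}' to defeat cross-site script inclusion, so the
# first four characters must be dropped before parsing. A small standalone
# illustration with a made-up response body:
import json

sample = ")]}'\n{\"revision\": \"0123abcd\"}"
print(json.loads(sample[4:])['revision'])  # -> 0123abcd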
def query(self, query, headers=DEFAULT_HEADERS):
    """
    Run SPARQL query and return parsed JSON result.

    @param query: Query text
    @type query: string
    """
    url = '%s?query=%s' % (self.endpoint, quote(query))
    self.last_response = http.fetch(url, headers=headers)
    if not self.last_response.content:
        return None
    try:
        return json.loads(self.last_response.content)
    except ValueError:
        return None
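# Hedged usage sketch for the query() method above: it assumes a
# SparqlQuery-like object whose ``endpoint`` points at a SPARQL service and
# that the service returns the standard SPARQL JSON layout
# (results -> bindings). The endpoint URL and the query text are
# illustrative values, not taken from the original.
sparql = SparqlQuery(endpoint='https://query.wikidata.org/sparql')
result = sparql.query('SELECT ?item WHERE { ?item wdt:P31 wd:Q146 } LIMIT 3')
if result is not None:
    for binding in result['results']['bindings']:
        print(binding['item']['value'])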
def getversion_onlinerepo(repo=None):
    """Retrieve current framework revision number from online repository.

    @param repo: (optional) Online repository location
    @type repo: URL or string
    """
    from pywikibot.comms import http

    url = repo or 'https://git.wikimedia.org/feed/pywikibot/core'
    buf = http.fetch(url).content.splitlines()
    try:
        hsh = buf[13].split('/')[5][:-1]
        return hsh
    except Exception as e:
        raise ParseError(repr(e) + ' while parsing ' + repr(buf))
def getOpenStreetMap(latitude, longitude): """ Get the result from https://nominatim.openstreetmap.org/reverse . @rtype: list of tuples """ result = [] gotInfo = False parameters = urlencode({ 'lat': latitude, 'lon': longitude, 'accept-language': 'en' }) while not gotInfo: try: page = fetch( 'https://nominatim.openstreetmap.org/reverse?format=xml&%s' % parameters) et = xml.etree.ElementTree.fromstring(page.content) gotInfo = True except IOError: pywikibot.output(u'Got an IOError, let\'s try again') time.sleep(30) except socket.timeout: pywikibot.output(u'Got a timeout, let\'s try again') time.sleep(30) validParts = [u'hamlet', u'village', u'city', u'county', u'country'] invalidParts = [u'path', u'road', u'suburb', u'state', u'country_code'] addressparts = et.find('addressparts') for addresspart in addressparts.getchildren(): if addresspart.tag in validParts: result.append(addresspart.text) elif addresspart.tag in invalidParts: pywikibot.output(u'Dropping %s, %s' % (addresspart.tag, addresspart.text)) else: pywikibot.warning('%s, %s is not in addressparts lists' % (addresspart.tag, addresspart.text)) return result
def fetch(self, table: str, format='xml') -> bytes:  # pragma: no cover
    """
    DEPRECATED. Fetch data from WikiStats.

    @param table: table of data to fetch
    @param format: Format of data to use
    @type format: 'xml' or 'csv'.
    """
    if format == 'xml':
        path = '/{format}/{table}.{format}'
    else:
        path = '/api.php?action=dump&table={table}&format={format}'
    url = self.url + path

    if table not in self.ALL_KEYS:
        pywikibot.warning('WikiStats unknown table ' + table)

    if table in self.FAMILY_MAPPING:
        table = self.FAMILY_MAPPING[table]

    r = http.fetch(url.format(table=table, format=format))
    return r.raw
def pageTextPost(url, parameters):
    """
    Get data from commons helper page.

    @param url: This parameter is not used here; we keep it to avoid
        breaking user scripts.
    @param parameters: Data that will be submitted to CommonsHelper.
    @type parameters: dict
    @return: A CommonsHelper description message.
    @rtype: str
    """
    gotInfo = False
    while not gotInfo:
        try:
            commonsHelperPage = fetch('https://commonshelper.toolforge.org/',
                                      method='POST', data=parameters)
            data = commonsHelperPage.data.content.decode('utf-8')
            gotInfo = True
        except RequestException:
            pywikibot.output("Got a RequestException, let's try again")
    return data
def url_image(self):
    """Get the file url of the scan of ProofreadPage.

    @return: file url of the scan ProofreadPage or None.
    @rtype: str/unicode

    @raises Exception: in case of http errors
    @raise ImportError: if bs4 is not installed, _bs4_soup() will raise
    @raises ValueError: in case of no prp_page_image src found for scan
    """
    # wrong link fails with various possible Exceptions.
    if not hasattr(self, '_url_image'):
        if self.exists():
            url = self.full_url()
        else:
            path = 'w/index.php?title={0}&action=edit&redlink=1'
            url = self.site.base_url(path.format(self.title(as_url=True)))

        try:
            response = http.fetch(url, charset='utf-8')
        except Exception:
            pywikibot.error('Error fetching HTML for %s.' % self)
            raise

        soup = _bs4_soup(response.text)

        try:
            self._url_image = soup.find(class_='prp-page-image')
            # if None raises AttributeError
            self._url_image = self._url_image.find('img')
            # if None raises TypeError.
            self._url_image = self._url_image['src']
        except (TypeError, AttributeError):
            raise ValueError('No prp-page-image src found for %s.' % self)
        else:
            self._url_image = 'https:' + self._url_image

    return self._url_image
def github_svn_rev2hash(tag, rev):
    """Convert a Subversion revision to a Git hash using Github.

    @param tag: name of the Subversion repo on Github
    @param rev: Subversion revision identifier
    @return: the git hash and the commit date
    @rtype: tuple
    """
    from pywikibot.comms import http

    uri = 'https://github.com/wikimedia/%s/!svn/vcc/default' % tag
    request = http.fetch(uri=uri, method='PROPFIND',
                         body="<?xml version='1.0' encoding='utf-8'?>"
                              "<propfind xmlns=\"DAV:\"><allprop/></propfind>",
                         headers={'label': str(rev),
                                  'user-agent': 'SVN/1.7.5 {pwb}'})
    dom = xml.dom.minidom.parse(BytesIO(request.raw))
    hsh = dom.getElementsByTagName("C:git-commit")[0].firstChild.nodeValue
    date = dom.getElementsByTagName("S:date")[0].firstChild.nodeValue
    date = time.strptime(date[:19], '%Y-%m-%dT%H:%M:%S')
    return hsh, date
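# Hedged usage sketch for github_svn_rev2hash above; the repository tag and
# the Subversion revision number are illustrative values only.
import time

git_hash, commit_date = github_svn_rev2hash('pywikibot', 10000)
print(git_hash)
print(time.strftime('%Y-%m-%d %H:%M:%S', commit_date))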
def filterParents(categories):
    """Remove all parent categories from the set to prevent overcategorization."""
    result = []
    toFilter = u''
    for cat in categories:
        cat = cat.replace('_', ' ')
        toFilter = toFilter + "[[Category:" + cat + "]]\n"
    parameters = urlencode({'source': toFilter.encode('utf-8'), 'bot': '1'})
    filterCategoriesRe = re.compile(r'\[\[Category:([^\]]*)\]\]')
    try:
        filterCategoriesPage = fetch(
            "https://toolserver.org/~multichill/filtercats.php?%s"
            % parameters)
        result = filterCategoriesRe.findall(filterCategoriesPage.content)
    except IOError:
        # Something is wrong, forget about this filter, and return the input
        return categories

    if not result:
        # The result is empty; we don't want to remove all categories
        return categories

    return result
def url_image(self):
    """Get the file url of the scan of ProofreadPage.

    @return: file url of the scan ProofreadPage or None.
    @rtype: str/unicode

    @raises:
        - Exception in case of http errors.
    """
    # A wrong link fails with various possible Exceptions.
    if not hasattr(self, '_url_image'):
        if self.exists():
            url = self.full_url()
        else:
            path = 'w/index.php?title={0}&action=edit&redlink=1'
            url = self.site.base_url(path.format(self.title(asUrl=True)))

        try:
            response = http.fetch(url, charset='utf-8')
        except Exception:
            pywikibot.error('Error fetching HTML for %s.' % self)
            raise

        soup = Soup(response.content)

        try:
            # None if nothing is found by .find()
            self._url_image = soup.find(class_='prp-page-image')
            self._url_image = self._url_image.find('img')
            # if None raises TypeError.
            self._url_image = self._url_image['src']
        except TypeError:
            raise ValueError('No prp-page-image src found for %s.' % self)
        else:
            self._url_image = 'https:' + self._url_image

    return self._url_image
def getDataFromHost(self, queryStr):
    """
    Go and fetch a query from the host's API.

    @rtype: dict
    """
    url = self.getUrl(queryStr)

    try:
        resp = http.fetch(url)
    except Exception:
        pywikibot.warning(u"Failed to retrieve %s" % url)
        raise

    try:
        data = json.loads(resp.content)
    except ValueError:
        pywikibot.warning(
            u"Data received from host but no JSON could be decoded")
        raise pywikibot.ServerError(
            "Data received from host but no JSON could be decoded")
    return data
def getInternetArchiveURL(url, timestamp=None):
    """Return archived URL by Internet Archive.

    See [[:mw:Archived Pages]] and https://archive.org/help/wayback_api.php
    for more details.

    @param url: url to search an archived version for
    @param timestamp: requested archive date. The version closest to that
        moment is returned. Format: YYYYMMDDhhmmss or part thereof.
    """
    uri = u'https://archive.org/wayback/available?'

    query = {'url': url}

    if timestamp is not None:
        query['timestamp'] = timestamp

    uri = uri + urlencode(query)

    retry_count = 0
    while retry_count <= config2.max_retries:
        try:
            jsontext = http.fetch(uri).text
            break
        except RequestsConnectionError as e:
            error = e
            retry_count += 1
            sleep(config2.retry_wait)
    else:
        raise error

    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
def test_https_cert_error(self):
    """Test if http.fetch respects disable_ssl_certificate_validation."""
    self.assertRaisesRegex(
        pywikibot.FatalServerError,
        self.CERT_VERIFY_FAILED_RE,
        http.fetch,
        'https://testssl-expire-r2i2.disig.sk/index.en.html')
    http.session.close()  # clear the connection

    with warnings.catch_warnings(record=True) as warning_log:
        response = http.fetch(
            'https://testssl-expire-r2i2.disig.sk/index.en.html',
            verify=False)
    self.assertIsInstance(response.text, str)
    self.assertTrue(re.search(r'<title>.*</title>', response.text))
    http.session.close()  # clear the connection

    # Verify that it now fails again
    self.assertRaisesRegex(
        pywikibot.FatalServerError,
        self.CERT_VERIFY_FAILED_RE,
        http.fetch,
        'https://testssl-expire-r2i2.disig.sk/index.en.html')
    http.session.close()  # clear the connection

    # Verify that the warning occurred
    self.assertIn('InsecureRequestWarning',
                  [w.category.__name__ for w in warning_log])
def test_https_cert_error(self):
    """Test if http.fetch respects disable_ssl_certificate_validation."""
    self.assertRaises(pywikibot.FatalServerError,
                      http.fetch,
                      uri='https://testssl-expire.disig.sk/index.en.html')
    with warnings.catch_warnings(record=True) as warning_log:
        response = http.fetch(
            uri='https://testssl-expire.disig.sk/index.en.html',
            disable_ssl_certificate_validation=True)
    r = response.content
    self.assertIsInstance(r, unicode)
    self.assertTrue(re.search(r'<title>.*</title>', r))

    # Verify that it now fails again
    http.session.close()  # but first clear the connection
    self.assertRaises(pywikibot.FatalServerError,
                      http.fetch,
                      uri='https://testssl-expire.disig.sk/index.en.html')

    # Verify that the warning occurred
    self.assertEqual(len(warning_log), 1)
    self.assertEqual(warning_log[0].category.__name__,
                     'InsecureRequestWarning')
def fetch(self, table, format="xml"):
    """
    Fetch data from WikiStats.

    @param table: table of data to fetch
    @type table: basestring
    @param format: Format of data to use
    @type format: 'xml' or 'csv'.
    @rtype: bytes
    """
    if format == 'xml':
        path = '/{format}/{table}.{format}'
    else:
        path = '/api.php?action=dump&table={table}&format={format}'
    URL = self.url + path

    if table not in self.ALL_KEYS:
        pywikibot.warning('WikiStats unknown table %s' % table)

    if table in self.FAMILY_MAPPING:
        table = self.FAMILY_MAPPING[table]

    r = http.fetch(URL.format(table=table, format=format))
    return r.raw
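# A standalone illustration of how the two path templates used by the
# WikiStats fetch() methods above expand; the table and format values are
# examples only, not a call into the WikiStats class.
xml_path = '/{format}/{table}.{format}'.format(table='wikipedia', format='xml')
csv_path = ('/api.php?action=dump&table={table}&format={format}'
            .format(table='wikipedia', format='csv'))
print(xml_path)  # /xml/wikipedia.xml
print(csv_path)  # /api.php?action=dump&table=wikipedia&format=csv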
def test_server_not_found(self):
    """Test server not found exception."""
    with self.assertRaisesRegex(
            ConnectionError,
            'Max retries exceeded with url: /w/api.php'):
        http.fetch('http://ru-sib.wikipedia.org/w/api.php',
                   default_error_handling=True)
def test_http(self):
    """Test with http, standard http interface for pywikibot."""
    r = http.fetch(self.url)

    self.assertEqual(r.headers['content-type'], 'image/png')
    self.assertEqual(r.content, self.png)
def getCommonshelperCats(imagepage): """Get category suggestions from CommonSense. @rtype: list of unicode """ commonshelperCats = [] usage = [] galleries = [] global search_wikis global hint_wiki site = imagepage.site lang = site.code family = site.family.name if lang == u'commons' and family == u'commons': parameters = urlencode({ 'i': imagepage.title(withNamespace=False).encode('utf-8'), 'r': 'on', 'go-clean': 'Find+Categories', 'p': search_wikis, 'cl': hint_wiki }) elif family == u'wikipedia': parameters = urlencode({ 'i': imagepage.title(withNamespace=False).encode('utf-8'), 'r': 'on', 'go-move': 'Find+Categories', 'p': search_wikis, 'cl': hint_wiki, 'w': lang }) else: # Cant handle other sites atm return [], [], [] commonsenseRe = re.compile( r'^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s' r'#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)' r'#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s' r'#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL) gotInfo = False matches = None maxtries = 10 tries = 0 while not gotInfo: try: if tries < maxtries: tries += 1 commonsHelperPage = fetch( "https://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters) matches = commonsenseRe.search(commonsHelperPage.content) gotInfo = True else: break except IOError: pywikibot.output(u'Got an IOError, let\'s try again') except socket.timeout: pywikibot.output(u'Got a timeout, let\'s try again') if matches and gotInfo: if matches.group('usagenum') > 0: used = matches.group('usage').splitlines() for use in used: usage = usage + getUsage(use) if matches.group('catnum') > 0: cats = matches.group('cats').splitlines() for cat in cats: commonshelperCats.append(cat.replace('_', ' ')) pywikibot.output(u'category : ' + cat) if matches.group('galnum') > 0: gals = matches.group('gals').splitlines() for gal in gals: galleries.append(gal.replace('_', ' ')) pywikibot.output(u'gallery : ' + gal) commonshelperCats = list(set(commonshelperCats)) galleries = list(set(galleries)) for (lang, project, article) in usage: pywikibot.output(lang + project + article) return commonshelperCats, usage, galleries
def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None): """OCR callback function. @return: tuple (error, text [error description in case of error]). """ def identity(x): return x if not cmd_uri: raise ValueError('Parameter cmd_uri is mandatory.') if parser_func is None: parser_func = identity if not callable(parser_func): raise TypeError('Keyword parser_func must be callable.') if ocr_tool not in self._OCR_METHODS: raise TypeError("ocr_tool must be in {}, not '{}'.".format( self._OCR_METHODS, ocr_tool)) # wrong link fail with Exceptions for retry in range(5, 30, 5): pywikibot.debug('{}: get URI {!r}'.format(ocr_tool, cmd_uri), _logger) try: response = http.fetch(cmd_uri) except ReadTimeout as e: pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e)) except Exception as e: pywikibot.error('"{}": {}'.format(cmd_uri, e)) return True, e else: pywikibot.debug('{}: {}'.format(ocr_tool, response.text), _logger) break pywikibot.warning('retrying in {} seconds ...'.format(retry)) time.sleep(retry) else: return True, ReadTimeout if 400 <= response.status_code < 600: return True, 'Http response status {}'.format(response.status_code) data = json.loads(response.text) if ocr_tool == self._PHETOOLS: # phetools assert 'error' in data, 'Error from phetools: %s' % data assert data['error'] in [0, 1, 2, 3], \ 'Error from phetools: {}'.format(data) error, _text = bool(data['error']), data['text'] else: # googleOCR if 'error' in data: error, _text = True, data['error'] else: error, _text = False, data['text'] if error: pywikibot.error('OCR query %s: %s' % (cmd_uri, _text)) return error, _text else: return error, parser_func(_text)
def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None): """OCR callback function. @return: tuple (error, text [error description in case of error]). """ def identity(x): return x if not cmd_uri: raise ValueError('Parameter cmd_uri is mandatory.') if parser_func is None: parser_func = identity if not callable(parser_func): raise TypeError('Keyword parser_func must be callable.') if ocr_tool not in self._OCR_METHODS: raise TypeError("ocr_tool must be in %s, not '%s'." % (self._OCR_METHODS, ocr_tool)) # wrong link fail with Exceptions retry = 0 while retry < 5: pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri), _logger) try: response = http.fetch(cmd_uri) except requests.exceptions.ReadTimeout as e: retry += 1 pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e)) pywikibot.warning('retrying in %s seconds ...' % (retry * 5)) time.sleep(retry * 5) except Exception as e: pywikibot.error('"%s": %s' % (cmd_uri, e)) return (True, e) else: pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text), _logger) break if 400 <= response.status < 600: return (True, 'Http response status {0}'.format(response.status)) data = json.loads(response.text) if ocr_tool == self._PHETOOLS: # phetools assert 'error' in data, 'Error from phetools: %s' % data assert data['error'] in [0, 1, 2, 3], ('Error from phetools: %s' % data) error, _text = bool(data['error']), data['text'] else: # googleOCR if 'error' in data: error, _text = True, data['error'] else: error, _text = False, data['text'] if error: pywikibot.error('OCR query %s: %s' % (cmd_uri, _text)) return (error, _text) else: return (error, parser_func(_text))
def run(self): """Run bot.""" def convert_from_bytes(total_bytes): for unit in ['B', 'K', 'M', 'G', 'T']: if abs(total_bytes) < 1024: return str(total_bytes) + unit total_bytes = float(format(total_bytes / 1024.0, '.2f')) return str(total_bytes) + 'P' pywikibot.output('Downloading dump from ' + self.opt.wikiname) download_filename = '{wikiname}-{dumpdate}-{filename}'.format_map( self.opt) temp_filename = download_filename + '-' + \ binascii.b2a_hex(urandom(8)).decode('ascii') + '.part' file_final_storepath = os.path.join( self.opt.storepath, download_filename) file_current_storepath = os.path.join( self.opt.storepath, temp_filename) # https://wikitech.wikimedia.org/wiki/Help:Toolforge#Dumps toolforge_dump_filepath = self.get_dump_name( self.opt.wikiname, self.opt.filename, self.opt.dumpdate) # First iteration for atomic download with temporary file # Second iteration for fallback non-atomic download for non_atomic in range(2): try: if toolforge_dump_filepath: pywikibot.output('Symlinking file from ' + toolforge_dump_filepath) if non_atomic: if os.path.exists(file_final_storepath): remove(file_final_storepath) symlink(toolforge_dump_filepath, file_current_storepath) else: url = 'https://dumps.wikimedia.org/{}/{}/{}'.format( self.opt.wikiname, self.opt.dumpdate, download_filename) pywikibot.output('Downloading file from ' + url) response = fetch(url, stream=True) if response.status_code != 200: if response.status_code == 404: pywikibot.output( 'File with name {filename!r}, from dumpdate ' '{dumpdate!r}, and wiki {wikiname!r} ({url}) ' "isn't available in the Wikimedia Dumps" .format(url=url, **self.opt)) return with open(file_current_storepath, 'wb') as result_file: total = int(response.headers['content-length']) if total == -1: pywikibot.warning("'content-length' missing in " 'response headers') downloaded = 0 parts = 50 display_string = '' pywikibot.output('') for data in response.iter_content(100 * 1024): result_file.write(data) if total <= 0: continue downloaded += len(data) done = int(parts * downloaded / total) display = map(convert_from_bytes, (downloaded, total)) prior_display = display_string display_string = '\r|{}{}|{}{}/{}'.format( '=' * done, '-' * (parts - done), ' ' * 5, *display) # Add whitespace to cover up prior bar display_string += ' ' * ( len(prior_display.rstrip()) - len(display_string.rstrip())) pywikibot.output(display_string, newline=False) pywikibot.output('') # Rename the temporary file to the target file # if the download completes successfully if not non_atomic: replace(file_current_storepath, file_final_storepath) break except (OSError, IOError): pywikibot.exception() try: remove(file_current_storepath) except (OSError, IOError): pywikibot.exception() # If the atomic download fails, try without a temporary file # If the non-atomic download also fails, exit the script if non_atomic: return pywikibot.output('Cannot make temporary file, ' 'falling back to non-atomic download') file_current_storepath = file_final_storepath pywikibot.output('Done! File stored as ' + file_final_storepath)
def run(self): """Run bot.""" pywikibot.output('Downloading dump from ' + self.getOption('wikiname')) download_filename = '{wiki_name}-{revision}-{filename}'.format( wiki_name=self.getOption('wikiname'), revision=self.getOption('revision'), filename=self.getOption('filename') ) temp_filename = download_filename + '-' + \ binascii.b2a_hex(urandom(8)).decode('ascii') + '.part' file_final_storepath = os.path.join( self.getOption('storepath'), download_filename) file_current_storepath = os.path.join( self.getOption('storepath'), temp_filename) # https://wikitech.wikimedia.org/wiki/Help:Toolforge#Dumps toolforge_dump_filepath = self.get_dump_name( self.getOption('wikiname'), self.getOption('filename')) # First iteration for atomic download with temporary file # Second iteration for fallback non-atomic download for non_atomic in range(2): try: if toolforge_dump_filepath: pywikibot.output('Symlinking file from ' + toolforge_dump_filepath) if non_atomic: if os.path.exists(file_final_storepath): remove(file_final_storepath) symlink(toolforge_dump_filepath, file_current_storepath) else: url = 'https://dumps.wikimedia.org/{0}/{1}/{2}'.format( self.getOption('wikiname'), self.getOption('revision'), download_filename) pywikibot.output('Downloading file from ' + url) response = fetch(url, stream=True) if response.status == 200: with open(file_current_storepath, 'wb') as result_file: for data in response.data.iter_content(100 * 1024): result_file.write(data) elif response.status == 404: pywikibot.output( 'File with name "{filename}", ' 'from revision "{revision}", ' 'and wiki "{wikiname}" ({url}) isn\'t ' 'available in the Wikimedia Dumps'.format( filename=self.getOption('filename'), revision=self.getOption('revision'), url=url, wikiname=self.getOption('wikiname'))) return else: return # Rename the temporary file to the target file # if the download completes successfully if not non_atomic: replace(file_current_storepath, file_final_storepath) break except (OSError, IOError): pywikibot.exception() try: remove(file_current_storepath) except (OSError, IOError): pywikibot.exception() # If the atomic download fails, try without a temporary file # If the non-atomic download also fails, exit the script if not non_atomic: pywikibot.output('Cannot make temporary file, ' + 'falling back to non-atomic download') file_current_storepath = file_final_storepath else: return False pywikibot.output('Done! File stored as ' + file_final_storepath) return
def run(self): """Run bot.""" def convert_from_bytes(bytes): for unit in ['B', 'K', 'M', 'G', 'T']: if abs(bytes) < 1024: return str(bytes) + unit bytes = float(format(bytes / 1024.0, '.2f')) return str(bytes) + 'P' pywikibot.output('Downloading dump from ' + self.getOption('wikiname')) download_filename = '{wiki_name}-{dumpdate}-{filename}'.format( wiki_name=self.getOption('wikiname'), dumpdate=self.getOption('dumpdate'), filename=self.getOption('filename')) temp_filename = download_filename + '-' + \ binascii.b2a_hex(urandom(8)).decode('ascii') + '.part' file_final_storepath = os.path.join(self.getOption('storepath'), download_filename) file_current_storepath = os.path.join(self.getOption('storepath'), temp_filename) # https://wikitech.wikimedia.org/wiki/Help:Toolforge#Dumps toolforge_dump_filepath = self.get_dump_name( self.getOption('wikiname'), self.getOption('filename')) # First iteration for atomic download with temporary file # Second iteration for fallback non-atomic download for non_atomic in range(2): try: if toolforge_dump_filepath: pywikibot.output('Symlinking file from ' + toolforge_dump_filepath) if non_atomic: if os.path.exists(file_final_storepath): remove(file_final_storepath) symlink(toolforge_dump_filepath, file_current_storepath) else: url = 'https://dumps.wikimedia.org/{0}/{1}/{2}'.format( self.getOption('wikiname'), self.getOption('dumpdate'), download_filename) pywikibot.output('Downloading file from ' + url) response = fetch(url, stream=True) if response.status == 200: with open(file_current_storepath, 'wb') as result_file: try: total = int(response. response_headers['content-length']) except KeyError: pywikibot.exception() total = -1 downloaded = 0 parts = 50 display_string = '' pywikibot.output('') for data in response.data.iter_content(100 * 1024): result_file.write(data) if total > 0: downloaded += len(data) done = int(parts * downloaded / total) display = map(convert_from_bytes, (downloaded, total)) prior_display = display_string display_string = ('\r|{0}{1}|' + ' ' * 5 + '{2}/{3}').format( '=' * done, '-' * (parts - done), *display) # Add whitespace to cover up prior bar display_string += ' ' * ( len(prior_display.rstrip()) - len(display_string.rstrip())) pywikibot.output(display_string, newline=False) pywikibot.output('') elif response.status == 404: pywikibot.output( 'File with name "{filename}", ' 'from dumpdate "{dumpdate}", ' 'and wiki "{wikiname}" ({url}) isn\'t ' 'available in the Wikimedia Dumps'.format( filename=self.getOption('filename'), dumpdate=self.getOption('dumpdate'), url=url, wikiname=self.getOption('wikiname'))) return else: return # Rename the temporary file to the target file # if the download completes successfully if not non_atomic: replace(file_current_storepath, file_final_storepath) break except (OSError, IOError): pywikibot.exception() try: remove(file_current_storepath) except (OSError, IOError): pywikibot.exception() # If the atomic download fails, try without a temporary file # If the non-atomic download also fails, exit the script if not non_atomic: pywikibot.output('Cannot make temporary file, ' + 'falling back to non-atomic download') file_current_storepath = file_final_storepath else: return False pywikibot.output('Done! File stored as ' + file_final_storepath) return
def __init__(self, fromurl): """ Constructor. @raises ServerError: a server error occurred while loading the site @raises Timeout: a timeout occurred while loading the site @raises RuntimeError: Version not found or version less than 1.14 """ if fromurl.endswith("$1"): fromurl = fromurl[:-2] r = fetch(fromurl) if r.status == 503: raise ServerError('Service Unavailable') elif r.status == 500: raise ServerError('Internal Server Error') if fromurl != r.data.url: pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url)) fromurl = r.data.url self.fromurl = fromurl data = r.content wp = WikiHTMLPageParser(fromurl) wp.feed(data) self.version = wp.version self.server = wp.server self.scriptpath = wp.scriptpath self.articlepath = None try: self._parse_pre_117(data) except Exception as e: pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e)) if self.api: try: self._parse_post_117() except Exception as e: pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e)) if not self.version: self._fetch_old_version() if not self.api: raise RuntimeError('Unsupported url: {0}'.format(self.fromurl)) if not self.articlepath: if self.private_wiki: if self.api != self.fromurl and self.private_wiki: self.articlepath = self.fromurl.rsplit('/', 1)[0] + '/$1' else: raise RuntimeError( 'Unable to determine articlepath because the wiki is ' 'private. Use the Main Page URL instead of the API.') else: raise RuntimeError('Unable to determine articlepath: ' '{0}'.format(self.fromurl)) if (not self.version or self.version < MediaWikiVersion('1.14')): raise RuntimeError('Unsupported version: {0}'.format(self.version))
def read_file_content(self, file_url: str): """Return name of temp file in which remote file is saved.""" pywikibot.output('Reading file ' + file_url) handle, tempname = tempfile.mkstemp() path = Path(tempname) size = 0 dt_gen = (el for el in (15, 30, 45, 60, 120, 180, 240, 300)) while True: file_len = path.stat().st_size if file_len: pywikibot.output('Download resumed.') headers = {'Range': 'bytes={}-'.format(file_len)} else: headers = {} with open(str(path), 'ab') as fd: # T272345: Python 3.5 needs str os.lseek(handle, file_len, 0) try: response = http.fetch(file_url, stream=True, headers=headers) response.raise_for_status() # get download info, if available # Note: this is not enough to exclude pages # e.g. 'application/json' is also not a media if 'text/' in response.headers['Content-Type']: raise FatalServerError('The requested URL was not ' 'found on server.') size = max(size, int(response.headers.get('Content-Length', 0))) # stream content to temp file (in chunks of 1Mb) for chunk in response.iter_content(chunk_size=1024 * 1024): fd.write(chunk) # raised from connection lost during response.iter_content() except requests.ConnectionError: fd.flush() pywikibot.output('Connection closed at byte {}'.format( path.stat().st_size)) # raised from response.raise_for_status() except requests.HTTPError as e: # exit criteria if size is not available # error on last iteration is OK, we're requesting # {'Range': 'bytes=file_len-'} err = HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE if response.status_code == err and path.stat().st_size: break raise FatalServerError(str(e)) from e if size and size == path.stat().st_size: break try: dt = next(dt_gen) pywikibot.output('Sleeping for {} seconds ...'.format(dt)) pywikibot.sleep(dt) except StopIteration: raise FatalServerError('Download failed, too many retries!') pywikibot.output('Downloaded {} bytes'.format(path.stat().st_size)) return tempname
def test_http(self):
    """Test with http, standard http interface for pywikibot."""
    r = http.fetch(uri=self.url)

    self.assertEqual(r.raw, self.png)
def test_http_504(self):
    """Test that an HTTP 504 raises the correct exception."""
    with self.assertRaisesRegex(
            Server504Error,
            r'Server ([^\:]+|[^\:]+:[0-9]+) timed out'):
        http.fetch(self.get_httpbin_url('/status/504'))