Example #1
    def test_fetch(self):
        """Test that using the data parameter and body parameter produce same results."""
        r_data = http.fetch(uri=self.get_httpbin_url('/post'), method='POST',
                            data={'fish&chips': 'delicious'})
        r_body = http.fetch(uri=self.get_httpbin_url('/post'), method='POST',
                            body={'fish&chips': 'delicious'})

        self.assertDictEqual(json.loads(r_data.content),
                             json.loads(r_body.content))
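Note: most snippets in this listing are shown without their imports. A minimal preamble they typically assume is sketched below; the exact set is an assumption (each snippet only needs a subset), but the module paths match those used in the examples.

import json

import pywikibot
from pywikibot.comms import http
from pywikibot.comms.http import fetch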
Example #2
    def test_follow_redirects(self):
        """Test follow 301 redirects correctly."""
        # The following will redirect from ' ' -> '_', and maybe to https://
        r = http.fetch(uri='http://en.wikipedia.org/wiki/Main%20Page')
        self.assertEqual(r.status, 200)
        self.assertIsNotNone(r.data.history)
        self.assertIn('//en.wikipedia.org/wiki/Main_Page',
                      r.data.url)

        r = http.fetch(uri='http://www.gandi.eu')
        self.assertEqual(r.status, 200)
        self.assertEqual(r.data.url,
                         'http://www.gandi.net')
Example #3
    def _parse_post_117(self):
        """Parse 1.17+ siteinfo data."""
        response = fetch(self.api + '?action=query&meta=siteinfo&format=json')
        check_response(response)
        # remove leading newlines and Byte Order Mark (BOM), see T128992
        content = response.text.strip().lstrip('\uFEFF')
        info = json.loads(content)
        self.private_wiki = ('error' in info
                             and info['error']['code'] == 'readapidenied')
        if self.private_wiki:
            # user-config.py is not loaded because PYWIKIBOT_NO_USER_CONFIG
            # is set to '2' by generate_family_file.py.
            # Prepare a temporary config for login.
            username = pywikibot.input(
                'Private wiki detected. Login is required.\n'
                'Please enter your username?')
            config.usernames['temporary_family'] = {'temporary_code': username}
            # Set up a dummy family so that we can create a site object
            fam = pywikibot.Family()
            fam.name = 'temporary_family'
            fam.scriptpath = lambda code: self.api[:-8]  # without /api.php
            fam.langs = {'temporary_code': self.server}
            site = pywikibot.Site('temporary_code', fam)
            site.version = lambda: str(self.version)
            # Now the site object is able to login
            info = site.siteinfo
        else:
            info = info['query']['general']
        self.version = MediaWikiVersion.from_generator(info['generator'])
        if self.version < MediaWikiVersion('1.17'):
            return

        self.server = urljoin(self.fromurl, info['server'])
        for item in ['scriptpath', 'articlepath', 'lang']:
            setattr(self, item, info[item])
Example #4
    def getDataFromHost(self, queryStr):
        """
        Go and fetch a query from the host's API.

        @rtype: dict
        """
        url = self.getUrl(queryStr)

        try:
            resp = http.fetch(url)
        except:
            pywikibot.warning(u"Failed to retrieve %s" % url)
            raise

        data = resp.content
        if not data:
            pywikibot.warning('No data received for %s' % url)
            raise pywikibot.ServerError('No data received for %s' % url)

        try:
            data = json.loads(data)
        except ValueError:
            pywikibot.warning(
                'Data received for %s but no JSON could be decoded: %r'
                % (url, data))
            raise pywikibot.ServerError(
                'Data received for %s but no JSON could be decoded: %r'
                % (url, data))

        return data
Example #5
 def langs(self):
     """Build interwikimap."""
     response = fetch(self.api + "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")
     iw = json.loads(response.content)
     if "error" in iw:
         raise RuntimeError("%s - %s" % (iw["error"]["code"], iw["error"]["info"]))
     return [wiki for wiki in iw["query"]["interwikimap"] if "language" in wiki]
Example #6
    def _ocr_callback(self, cmd_uri, parser_func=None):
        """OCR callback function.

        @return: tuple (error, text [error description in case of error]).
        """
        def id(x):
            return x

        if not cmd_uri:
            raise ValueError('Parameter cmd_uri is mandatory.')

        if parser_func is None:
            parser_func = id

        if not callable(parser_func):
            raise TypeError('Keyword parser_func must be callable.')

        # a wrong link fails with an exception
        try:
            response = http.fetch(cmd_uri, charset='utf-8')
        except Exception as e:
            pywikibot.error('Querying %s: %s' % (cmd_uri, e))
            return (True, e)

        data = json.loads(response.content)

        assert 'error' in data, 'Error from phe-tools: %s' % data
        assert data['error'] in [0, 1], 'Error from phe-tools: %s' % data

        error = bool(data['error'])
        if error:
            pywikibot.error('Querying %s: %s' % (cmd_uri, data['text']))
            return (error, data['text'])
        else:
            return (error, parser_func(data['text']))
Example #7
def github_svn_rev2hash(tag, rev):
    """Convert a Subversion revision to a Git hash using Github.

    @param tag: name of the Subversion repo on Github
    @param rev: Subversion revision identifier
    @return: the git hash and the commit date
    @rtype: tuple
    """
    from io import StringIO
    import xml.dom.minidom
    from pywikibot.comms import http

    uri = 'https://github.com/wikimedia/%s/!svn/vcc/default' % tag
    request = http.fetch(uri=uri, method='PROPFIND',
                         body="<?xml version='1.0' encoding='utf-8'?>"
                              "<propfind xmlns=\"DAV:\"><allprop/></propfind>",
                         headers={'label': str(rev),
                                  'user-agent': 'SVN/1.7.5 {pwb}'})
    data = request.content

    dom = xml.dom.minidom.parse(StringIO(data))
    hsh = dom.getElementsByTagName("C:git-commit")[0].firstChild.nodeValue
    date = dom.getElementsByTagName("S:date")[0].firstChild.nodeValue
    date = time.strptime(date[:19], '%Y-%m-%dT%H:%M:%S')
    return hsh, date
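A minimal usage sketch for the function above; the repository tag and revision below are illustrative values, not taken from the original code.

# Illustrative call: unpack the (git hash, commit date) tuple returned above.
# Assumes `import time` at module level, as the function itself already does.
git_hash, commit_date = github_svn_rev2hash('mediawiki-svn', 12345)
print(git_hash, time.strftime('%Y-%m-%d', commit_date))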
Example #8
def getWebCitationURL(url, timestamp=None):
    """Return archived URL by Web Citation.

    See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
    for more details

    @param url: url to search an archived version for
    @param timestamp: requested archive date. The version closest to that
        moment is returned. Format: YYYYMMDDhhmmss or part thereof.

    """
    uri = u'http://www.webcitation.org/query?'

    query = {'returnxml': 'true',
             'url': url}

    if timestamp is not None:
        query['date'] = timestamp

    uri = uri + urlencode(query)
    xmltext = http.fetch(uri).content
    if "success" in xmltext:
        data = ET.fromstring(xmltext)
        return data.find('.//webcite_url').text
    else:
        return None
Example #9
    def test_no_params(self):
        """Test fetch method with no parameters."""
        r = http.fetch(uri=self.get_httpbin_url('/get'), params={})
        self.assertEqual(r.status, 200)

        content = json.loads(r.content)
        self.assertDictEqual(content['args'], {})
Example #10
    def test_https_cert_error(self):
        """Test if http.fetch respects disable_ssl_certificate_validation."""
        self.assertRaisesRegex(pywikibot.FatalServerError, self.CERT_VERIFY_FAILED_RE,
                               http.fetch,
                               uri='https://testssl-expire-r2i2.disig.sk/index.en.html')
        http.session.close()  # clear the connection

        with warnings.catch_warnings(record=True) as warning_log:
            response = http.fetch(
                uri='https://testssl-expire-r2i2.disig.sk/index.en.html',
                disable_ssl_certificate_validation=True)
        r = response.content
        self.assertIsInstance(r, unicode)
        self.assertTrue(re.search(r'<title>.*</title>', r))
        http.session.close()  # clear the connection

        # Verify that it now fails again
        self.assertRaisesRegex(pywikibot.FatalServerError, self.CERT_VERIFY_FAILED_RE,
                               http.fetch,
                               uri='https://testssl-expire-r2i2.disig.sk/index.en.html')
        http.session.close()  # clear the connection

        # Verify that the warning occurred
        self.assertIn('InsecureRequestWarning',
                      [w.category.__name__ for w in warning_log])
Example #11
def getOpenStreetMap(latitude, longitude):
    """
    Get the result from https://nominatim.openstreetmap.org/reverse .

    @rtype: list of tuples
    """
    result = []
    gotInfo = False
    parameters = urlencode({'lat': latitude, 'lon': longitude, 'accept-language': 'en'})
    while not gotInfo:
        try:
            page = fetch('https://nominatim.openstreetmap.org/reverse?format=xml&%s' % parameters)
            et = xml.etree.ElementTree.fromstring(page.content)
            gotInfo = True
        except IOError:
            pywikibot.output(u'Got an IOError, let\'s try again')
            time.sleep(30)
        except socket.timeout:
            pywikibot.output(u'Got a timeout, let\'s try again')
            time.sleep(30)
    validParts = [u'hamlet', u'village', u'city', u'county', u'country']
    invalidParts = [u'path', u'road', u'suburb', u'state', u'country_code']
    addressparts = et.find('addressparts')

    for addresspart in addressparts.getchildren():
        if addresspart.tag in validParts:
            result.append(addresspart.text)
        elif addresspart.tag in invalidParts:
            pywikibot.output(u'Dropping %s, %s' % (addresspart.tag, addresspart.text))
        else:
            pywikibot.warning('%s, %s is not in addressparts lists'
                              % (addresspart.tag, addresspart.text))
    return result
Example #12
def getInternetArchiveURL(url, timestamp=None):
    """Return archived URL by Internet Archive.

    See [[:mw:Archived Pages]] and https://archive.org/help/wayback_api.php
    for more details.

    @param url: url to search an archived version for
    @param timestamp: requested archive date. The version closest to that
        moment is returned. Format: YYYYMMDDhhmmss or part thereof.

    """
    import json
    uri = u'https://archive.org/wayback/available?'

    query = {'url': url}

    if timestamp is not None:
        query['timestamp'] = timestamp

    uri = uri + urlencode(query)
    jsontext = http.fetch(uri).content
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
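Based on the signature above, a call might look like the following sketch; the target URL and timestamp are illustrative, not taken from the original code.

# Returns the closest Wayback Machine snapshot URL, or None if nothing is archived.
snapshot = getInternetArchiveURL('https://www.mediawiki.org/', timestamp='20150101')
if snapshot:
    print('Archived copy:', snapshot)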
Example #13
    def test_follow_redirects(self):
        """Test follow 301 redirects after an exception works correctly."""
        # to be effective, this exception should be raised in httplib2
        self.assertRaises(Exception,
                          http.fetch,
                          uri='invalid://url')

        # The following will redirect from ' ' -> '_', and maybe to https://
        r = http.fetch(uri='http://en.wikipedia.org/wiki/Main%20Page')
        self.assertEqual(r.status, 200)
        self.assertIn('//en.wikipedia.org/wiki/Main_Page',
                      r.response_headers['content-location'])

        r = http.fetch(uri='http://www.gandi.eu')
        self.assertEqual(r.status, 200)
        self.assertEqual(r.response_headers['content-location'],
                         'http://www.gandi.net')
Example #14
def get_image_from_image_page(imagePage):
    """Get the image object to work based on an imagePage object."""
    imageBuffer = None
    imageURL = imagePage.fileUrl()
    imageURLopener = http.fetch(imageURL)
    imageBuffer = io.BytesIO(imageURLopener.raw[:])
    image = Image.open(imageBuffer)
    return image
Example #15
 def test_fetch(self):
     """Test http.fetch using http://www.wikipedia.org/."""
     r = http.fetch('http://www.wikipedia.org/')
     self.assertIsInstance(r, threadedhttp.HttpRequest)
     self.assertEqual(r.status, 200)
     self.assertIn('<html lang="mul"', r.content)
     self.assertIsInstance(r.content, unicode)
     self.assertIsInstance(r.raw, bytes)
Example #16
 def test_tools_path(self):
     """Test tools path."""
     if '?' in tool:
         self.skipTest('"{0}" is a regex!'.format(tool))
     path = 'http://tools.wmflabs.org/%s?user=%s' % (tool, 'xqt')
     request = fetch(path)
     self.assertIn(request.status, (200, 207),
                   'Http response status {0} for "{1}"'
                   ''.format(request.data.status_code, tool))
Example #17
    def test_follow_redirects(self):
        """Test follow 301 redirects after an exception works correctly."""
        # It doesn't matter what exception is raised here, provided it
        # occurs within the httplib2 request method.
        self.assertRaises(KeyError,
                          http.fetch,
                          uri='invalid://url')

        # The following will redirect from ' ' -> '_', and maybe to https://
        r = http.fetch(uri='http://en.wikipedia.org/wiki/Main%20Page')
        self.assertEqual(r.status, 200)
        self.assertIn('//en.wikipedia.org/wiki/Main_Page',
                      r.response_headers['content-location'])

        r = http.fetch(uri='http://www.gandi.eu')
        self.assertEqual(r.status, 200)
        self.assertEqual(r.response_headers['content-location'],
                         'http://www.gandi.net')
Example #18
def downloadPhoto(photoUrl=''):
    """
    Download the photo and store it in an io.BytesIO object.

    TODO: Add exception handling

    """
    imageFile = fetch(photoUrl).raw
    return io.BytesIO(imageFile)
Example #19
    def downloadPhoto(self):
        """
        Download the photo and store it in an io.BytesIO object.

        TODO: Add exception handling
        """
        if not self.contents:
            imageFile = fetch(self.URL).raw
            self.contents = io.BytesIO(imageFile)
        return self.contents
Example #20
def getFlinfoDescription(photo_id=0):
    """
    Get the description from http://wikipedia.ramselehof.de/flinfo.php.

    TODO: Add exception handling, try a couple of times
    """
    parameters = urlencode({'id': photo_id, 'raw': 'on'})

    return fetch(
        'http://wikipedia.ramselehof.de/flinfo.php?%s' % parameters).content
Example #21
    def setUpClass(cls):
        """
        Set up the test class.

        Prevent tests running if the host is down.
        """
        super(CheckHostnameMixin, cls).setUpClass()

        if not hasattr(cls, 'sites'):
            return

        for key, data in cls.sites.items():
            if 'hostname' not in data:
                raise Exception('%s: hostname not defined for %s'
                                % (cls.__name__, key))
            hostname = data['hostname']

            if hostname in cls._checked_hostnames:
                if isinstance(cls._checked_hostnames[hostname], Exception):
                    raise unittest.SkipTest(
                        '%s: hostname %s failed (cached): %s'
                        % (cls.__name__, hostname,
                           cls._checked_hostnames[hostname]))
                elif cls._checked_hostnames[hostname] is False:
                    raise unittest.SkipTest('%s: hostname %s failed (cached)'
                                            % (cls.__name__, hostname))
                else:
                    continue

            e = None
            try:
                if '://' not in hostname:
                    hostname = 'http://' + hostname
                r = http.fetch(uri=hostname,
                               default_error_handling=False)
                if r.exception:
                    e = r.exception
                else:
                    if r.status not in [200, 301, 302, 303, 307, 308]:
                        raise ServerError('HTTP status: %d' % r.status)
                    r.content  # default decode may raise exception
            except Exception as e2:
                pywikibot.error('%s: accessing %s caused exception:'
                                % (cls.__name__, hostname))
                pywikibot.exception(e2, tb=True)
                e = e2
                pass

            if e:
                cls._checked_hostnames[hostname] = e
                raise unittest.SkipTest(
                    '%s: hostname %s failed: %s'
                    % (cls.__name__, hostname, e))

            cls._checked_hostnames[hostname] = True
Example #22
    def _parse_post_117(self):
        """Parse 1.17+ siteinfo data."""
        response = fetch(self.api + '?action=query&meta=siteinfo&format=json')
        info = json.loads(response.content)['query']['general']
        self.version = MediaWikiVersion.from_generator(info['generator'])
        if self.version < MediaWikiVersion('1.17'):
            return

        self.server = urljoin(self.fromurl, info['server'])
        for item in ['scriptpath', 'articlepath', 'lang']:
            setattr(self, item, info[item])
Example #23
 def langs(self):
     """Build interwikimap."""
     response = fetch(
         self.api +
         "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")
     iw = json.loads(response.content)
     if 'error' in iw:
         raise RuntimeError('{0!s} - {1!s}'.format(iw['error']['code'],
                                         iw['error']['info']))
     return [wiki for wiki in iw['query']['interwikimap']
             if u'language' in wiki]
Example #24
    def test_unencoded_params(self):
        """
        Test fetch method with unencoded parameters, which should be encoded internally.

        HTTPBin returns the args in their urldecoded form, so what we put in should be
        the same as what we get out.
        """
        r = http.fetch(uri=self.get_httpbin_url('/get'), params={'fish&chips': 'delicious'})
        self.assertEqual(r.status, 200)

        content = json.loads(r.content)
        self.assertDictEqual(content['args'], {'fish&chips': 'delicious'})
Example #25
 def langs(self):
     """Build interwikimap."""
     response = fetch(
         self.api
         + '?action=query&meta=siteinfo&siprop=interwikimap'
           '&sifilteriw=local&format=json')
     iw = json.loads(response.text)
     if 'error' in iw:
         raise RuntimeError('%s - %s' % (iw['error']['code'],
                                         iw['error']['info']))
     return [wiki for wiki in iw['query']['interwikimap']
             if 'language' in wiki]
Example #26
    def _test_fetch_use_fake_user_agent(self):
        """Test `use_fake_user_agent` argument of http.fetch."""
        # Existing headers
        r = http.fetch(
            self.get_httpbin_url('/status/200'), headers={'user-agent': 'EXISTING'})
        self.assertEqual(r.headers['user-agent'], 'EXISTING')

        # Argument value changes
        r = http.fetch(self.get_httpbin_url('/status/200'), use_fake_user_agent=True)
        self.assertNotEqual(r.headers['user-agent'], http.user_agent())
        r = http.fetch(self.get_httpbin_url('/status/200'), use_fake_user_agent=False)
        self.assertEqual(r.headers['user-agent'], http.user_agent())
        r = http.fetch(
            self.get_httpbin_url('/status/200'), use_fake_user_agent='ARBITRARY')
        self.assertEqual(r.headers['user-agent'], 'ARBITRARY')

        # Manually overridden domains
        config.fake_user_agent_exceptions = {self.get_httpbin_hostname(): 'OVERRIDDEN'}
        r = http.fetch(
            self.get_httpbin_url('/status/200'), use_fake_user_agent=False)
        self.assertEqual(r.headers['user-agent'], 'OVERRIDDEN')
Example #27
    def __init__(self, fromurl):
        """
        Constructor.

        @raises ServerError: a server error occurred while loading the site
        @raises Timeout: a timeout occurred while loading the site
        @raises RuntimeError: Version not found or version less than 1.14
        """
        if fromurl.endswith("$1"):
            fromurl = fromurl[:-2]
        r = fetch(fromurl)
        if r.status == 503:
            raise ServerError('Service Unavailable')

        if fromurl != r.data.url:
            pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url))
            fromurl = r.data.url

        self.fromurl = fromurl

        data = r.content

        wp = WikiHTMLPageParser(fromurl)
        wp.feed(data)

        self.version = wp.version
        self.server = wp.server
        self.scriptpath = wp.scriptpath
        self.articlepath = None

        try:
            self._parse_pre_117(data)
        except Exception as e:
            pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e))

        if self.api:
            try:
                self._parse_post_117()
            except Exception as e:
                pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e))

            if not self.version:
                self._fetch_old_version()

        if not self.api:
            raise RuntimeError('Unsupported url: {0}'.format(self.fromurl))

        if (not self.version or
                self.version < MediaWikiVersion('1.14')):
            raise RuntimeError('Unsupported version: {0}'.format(self.version))
Example #28
def getversion_onlinerepo():
    """Retrieve current framework git hash from Gerrit."""
    from pywikibot.comms import http

    url = 'https://gerrit.wikimedia.org/r/projects/pywikibot%2Fcore/branches/master'
    # Gerrit API responses include )]}' at the beginning, make sure to strip it out
    buf = http.fetch(uri=url,
                     headers={'user-agent': '{pwb}'}).text[4:]

    try:
        hsh = json.loads(buf)['revision']
        return hsh
    except Exception as e:
        raise ParseError(repr(e) + ' while parsing ' + repr(buf))
Example #29
    def query(self, query, headers=DEFAULT_HEADERS):
        """
        Run SPARQL query and return parsed JSON result.

        @param query: Query text
        @type query: string
        """
        url = '%s?query=%s' % (self.endpoint, quote(query))
        self.last_response = http.fetch(url, headers=headers)
        if not self.last_response.content:
            return None
        try:
            return json.loads(self.last_response.content)
        except ValueError:
            return None
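A hedged usage sketch for the method above: `sparql_helper` is a hypothetical instance of whatever class defines query() (in pywikibot this helper lives in pywikibot.data.sparql), and the query text is illustrative.

# `sparql_helper` stands in for an instance of the class defining query() above.
results = sparql_helper.query('SELECT ?item WHERE { ?item wdt:P31 wd:Q5 } LIMIT 5')
if results is None:
    print('Empty response or undecodable JSON from the endpoint')
else:
    # Standard SPARQL JSON results layout: bindings live under results -> bindings.
    for binding in results['results']['bindings']:
        print(binding)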
Example #30
def getversion_onlinerepo(repo=None):
    """Retrieve current framework revision number from online repository.

    @param repo: (optional) Online repository location
    @type repo: URL or string
    """
    from pywikibot.comms import http

    url = repo or 'https://git.wikimedia.org/feed/pywikibot/core'
    buf = http.fetch(url).content.splitlines()
    try:
        hsh = buf[13].split('/')[5][:-1]
        return hsh
    except Exception as e:
        raise ParseError(repr(e) + ' while parsing ' + repr(buf))
Example #31
def getOpenStreetMap(latitude, longitude):
    """
    Get the result from https://nominatim.openstreetmap.org/reverse .

    @rtype: list of tuples
    """
    result = []
    gotInfo = False
    parameters = urlencode({
        'lat': latitude,
        'lon': longitude,
        'accept-language': 'en'
    })
    while not gotInfo:
        try:
            page = fetch(
                'https://nominatim.openstreetmap.org/reverse?format=xml&%s' %
                parameters)
            et = xml.etree.ElementTree.fromstring(page.content)
            gotInfo = True
        except IOError:
            pywikibot.output(u'Got an IOError, let\'s try again')
            time.sleep(30)
        except socket.timeout:
            pywikibot.output(u'Got a timeout, let\'s try again')
            time.sleep(30)
    validParts = [u'hamlet', u'village', u'city', u'county', u'country']
    invalidParts = [u'path', u'road', u'suburb', u'state', u'country_code']
    addressparts = et.find('addressparts')

    for addresspart in addressparts.getchildren():
        if addresspart.tag in validParts:
            result.append(addresspart.text)
        elif addresspart.tag in invalidParts:
            pywikibot.output(u'Dropping %s, %s' %
                             (addresspart.tag, addresspart.text))
        else:
            pywikibot.warning('%s, %s is not in addressparts lists' %
                              (addresspart.tag, addresspart.text))
    return result
Example #32
    def fetch(self, table: str, format='xml') -> bytes:  # pragma: no cover
        """
        DEPRECATED. Fetch data from WikiStats.

        @param table: table of data to fetch
        @param format: Format of data to use
        @type format: 'xml' or 'csv'.
        """
        if format == 'xml':
            path = '/{format}/{table}.{format}'
        else:
            path = '/api.php?action=dump&table={table}&format={format}'
        url = self.url + path

        if table not in self.ALL_KEYS:
            pywikibot.warning('WikiStats unknown table ' + table)

        if table in self.FAMILY_MAPPING:
            table = self.FAMILY_MAPPING[table]

        r = http.fetch(url.format(table=table, format=format))
        return r.raw
Example #33
def pageTextPost(url, parameters):
    """
    Get data from commons helper page.

    @param url: This parameter is not used here; it is kept to avoid breaking
                user scripts.
    @param parameters: Data that will be submitted to CommonsHelper.
    @type parameters: dict
    @return: A CommonHelper description message.
    @rtype: str
    """
    gotInfo = False
    while not gotInfo:
        try:
            commonsHelperPage = fetch('https://commonshelper.toolforge.org/',
                                      method='POST',
                                      data=parameters)
            data = commonsHelperPage.data.content.decode('utf-8')
            gotInfo = True
        except RequestException:
            pywikibot.output("Got a RequestException, let's try again")
    return data
Example #34
    def url_image(self):
        """Get the file url of the scan of ProofreadPage.

        @return: file url of the scan ProofreadPage or None.
        @rtype: str/unicode

        @raises Exception: in case of http errors
        @raise ImportError: if bs4 is not installed, _bs4_soup() will raise
        @raises ValueError: in case of no prp_page_image src found for scan
        """
        # wrong link fails with various possible Exceptions.
        if not hasattr(self, '_url_image'):

            if self.exists():
                url = self.full_url()
            else:
                path = 'w/index.php?title={0}&action=edit&redlink=1'
                url = self.site.base_url(path.format(self.title(as_url=True)))

            try:
                response = http.fetch(url, charset='utf-8')
            except Exception:
                pywikibot.error('Error fetching HTML for %s.' % self)
                raise

            soup = _bs4_soup(response.text)

            try:
                self._url_image = soup.find(class_='prp-page-image')
                # if None raises AttributeError
                self._url_image = self._url_image.find('img')
                # if None raises TypeError.
                self._url_image = self._url_image['src']
            except (TypeError, AttributeError):
                raise ValueError('No prp-page-image src found for %s.' % self)
            else:
                self._url_image = 'https:' + self._url_image

        return self._url_image
Example #35
def github_svn_rev2hash(tag, rev):
    """Convert a Subversion revision to a Git hash using Github.

    @param tag: name of the Subversion repo on Github
    @param rev: Subversion revision identifier
    @return: the git hash and the commit date
    @rtype: tuple
    """
    from pywikibot.comms import http

    uri = 'https://github.com/wikimedia/%s/!svn/vcc/default' % tag
    request = http.fetch(uri=uri, method='PROPFIND',
                         body="<?xml version='1.0' encoding='utf-8'?>"
                              "<propfind xmlns=\"DAV:\"><allprop/></propfind>",
                         headers={'label': str(rev),
                                  'user-agent': 'SVN/1.7.5 {pwb}'})

    dom = xml.dom.minidom.parse(BytesIO(request.raw))
    hsh = dom.getElementsByTagName("C:git-commit")[0].firstChild.nodeValue
    date = dom.getElementsByTagName("S:date")[0].firstChild.nodeValue
    date = time.strptime(date[:19], '%Y-%m-%dT%H:%M:%S')
    return hsh, date
Example #36
def filterParents(categories):
    """Remove all parent categories from the set to prevent overcategorization."""
    result = []
    toFilter = u''
    for cat in categories:
        cat = cat.replace('_', ' ')
        toFilter = toFilter + "[[Category:" + cat + "]]\n"
    parameters = urlencode({'source': toFilter.encode('utf-8'), 'bot': '1'})
    filterCategoriesRe = re.compile(r'\[\[Category:([^\]]*)\]\]')
    try:
        filterCategoriesPage = fetch(
            "https://toolserver.org/~multichill/filtercats.php?%s" %
            parameters)
        result = filterCategoriesRe.findall(filterCategoriesPage.content)
    except IOError:
        # Something is wrong, forget about this filter, and return the input
        return categories

    if not result:
        # Result is empty; don't remove all categories
        return categories
    return result
Example #37
    def url_image(self):
        """Get the file url of the scan of ProofreadPage.

        @return: file url of the scan ProofreadPage or None.
        @rtype: str/unicode

        @raises:
        - Exception in case of http errors.
        """
        # a wrong link fails with various possible exceptions.
        if not hasattr(self, '_url_image'):

            if self.exists():
                url = self.full_url()
            else:
                path = 'w/index.php?title={0}&action=edit&redlink=1'
                url = self.site.base_url(path.format(self.title(asUrl=True)))

            try:
                response = http.fetch(url, charset='utf-8')
            except Exception:
                pywikibot.error('Error fetching HTML for %s.' % self)
                raise

            soup = Soup(response.content)

            try:
                # None if nothing is found by .find()
                self._url_image = soup.find(class_='prp-page-image')
                self._url_image = self._url_image.find('img')
                # if None raises TypeError.
                self._url_image = self._url_image['src']
            except TypeError:
                raise ValueError('No prp-page-image src found for %s.' % self)
            else:
                self._url_image = 'https:' + self._url_image

        return self._url_image
Example #38
    def getDataFromHost(self, queryStr):
        """
        Go and fetch a query from the host's API.

        @rtype: dict
        """
        url = self.getUrl(queryStr)

        try:
            resp = http.fetch(url)
        except:
            pywikibot.warning(u"Failed to retrieve %s" % url)
            raise

        try:
            data = json.loads(resp.content)
        except ValueError:
            pywikibot.warning(
                u"Data received from host but no JSON could be decoded")
            raise pywikibot.ServerError(
                "Data received from host but no JSON could be decoded")

        return data
Example #39
def getInternetArchiveURL(url, timestamp=None):
    """Return archived URL by Internet Archive.

    See [[:mw:Archived Pages]] and https://archive.org/help/wayback_api.php
    for more details.

    @param url: url to search an archived version for
    @param timestamp: requested archive date. The version closest to that
        moment is returned. Format: YYYYMMDDhhmmss or part thereof.

    """
    uri = u'https://archive.org/wayback/available?'

    query = {'url': url}

    if timestamp is not None:
        query['timestamp'] = timestamp

    uri = uri + urlencode(query)

    retry_count = 0
    while retry_count <= config2.max_retries:
        try:
            jsontext = http.fetch(uri).text
            break
        except RequestsConnectionError as e:
            error = e
            retry_count += 1
            sleep(config2.retry_wait)
    else:
        raise error

    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
Example #40
    def test_https_cert_error(self):
        """Test if http.fetch respects disable_ssl_certificate_validation."""
        self.assertRaisesRegex(
            pywikibot.FatalServerError, self.CERT_VERIFY_FAILED_RE, http.fetch,
            'https://testssl-expire-r2i2.disig.sk/index.en.html')
        http.session.close()  # clear the connection

        with warnings.catch_warnings(record=True) as warning_log:
            response = http.fetch(
                'https://testssl-expire-r2i2.disig.sk/index.en.html',
                verify=False)
        self.assertIsInstance(response.text, str)
        self.assertTrue(re.search(r'<title>.*</title>', response.text))
        http.session.close()  # clear the connection

        # Verify that it now fails again
        self.assertRaisesRegex(
            pywikibot.FatalServerError, self.CERT_VERIFY_FAILED_RE, http.fetch,
            'https://testssl-expire-r2i2.disig.sk/index.en.html')
        http.session.close()  # clear the connection

        # Verify that the warning occurred
        self.assertIn('InsecureRequestWarning',
                      [w.category.__name__ for w in warning_log])
Example #41
    def test_https_cert_error(self):
        """Test if http.fetch respects disable_ssl_certificate_validation."""
        self.assertRaises(pywikibot.FatalServerError,
                          http.fetch,
                          uri='https://testssl-expire.disig.sk/index.en.html')

        with warnings.catch_warnings(record=True) as warning_log:
            response = http.fetch(
                uri='https://testssl-expire.disig.sk/index.en.html',
                disable_ssl_certificate_validation=True)
        r = response.content
        self.assertIsInstance(r, unicode)
        self.assertTrue(re.search(r'<title>.*</title>', r))

        # Verify that it now fails again
        http.session.close()  # but first clear the connection
        self.assertRaises(pywikibot.FatalServerError,
                          http.fetch,
                          uri='https://testssl-expire.disig.sk/index.en.html')

        # Verify that the warning occurred
        self.assertEqual(len(warning_log), 1)
        self.assertEqual(warning_log[0].category.__name__,
                         'InsecureRequestWarning')
Example #42
    def fetch(self, table, format="xml"):
        """
        Fetch data from WikiStats.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv'.
        @rtype: bytes
        """
        if format == 'xml':
            path = '/{format}/{table}.{format}'
        else:
            path = '/api.php?action=dump&table={table}&format={format}'
        URL = self.url + path

        if table not in self.ALL_KEYS:
            pywikibot.warning('WikiStats unknown table %s' % table)

        if table in self.FAMILY_MAPPING:
            table = self.FAMILY_MAPPING[table]

        r = http.fetch(URL.format(table=table, format=format))
        return r.raw
Example #43
 def test_server_not_found(self):
     """Test server not found exception."""
     with self.assertRaisesRegex(
             ConnectionError, 'Max retries exceeded with url: /w/api.php'):
         http.fetch('http://ru-sib.wikipedia.org/w/api.php',
                    default_error_handling=True)
Example #44
    def test_http(self):
        """Test with http, standard http interface for pywikibot."""
        r = http.fetch(self.url)

        self.assertEqual(r.headers['content-type'], 'image/png')
        self.assertEqual(r.content, self.png)
Example #45
def getCommonshelperCats(imagepage):
    """Get category suggestions from CommonSense.

    @rtype: list of unicode

    """
    commonshelperCats = []
    usage = []
    galleries = []

    global search_wikis
    global hint_wiki
    site = imagepage.site
    lang = site.code
    family = site.family.name
    if lang == u'commons' and family == u'commons':
        parameters = urlencode({
            'i':
            imagepage.title(withNamespace=False).encode('utf-8'),
            'r':
            'on',
            'go-clean':
            'Find+Categories',
            'p':
            search_wikis,
            'cl':
            hint_wiki
        })
    elif family == u'wikipedia':
        parameters = urlencode({
            'i':
            imagepage.title(withNamespace=False).encode('utf-8'),
            'r':
            'on',
            'go-move':
            'Find+Categories',
            'p':
            search_wikis,
            'cl':
            hint_wiki,
            'w':
            lang
        })
    else:
        # Can't handle other sites at the moment
        return [], [], []

    commonsenseRe = re.compile(
        r'^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s'
        r'#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)'
        r'#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s'
        r'#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$',
        re.MULTILINE + re.DOTALL)

    gotInfo = False
    matches = None
    maxtries = 10
    tries = 0
    while not gotInfo:
        try:
            if tries < maxtries:
                tries += 1
                commonsHelperPage = fetch(
                    "https://toolserver.org/~daniel/WikiSense/CommonSense.php?%s"
                    % parameters)
                matches = commonsenseRe.search(commonsHelperPage.content)
                gotInfo = True
            else:
                break
        except IOError:
            pywikibot.output(u'Got an IOError, let\'s try again')
        except socket.timeout:
            pywikibot.output(u'Got a timeout, let\'s try again')

    if matches and gotInfo:
        if matches.group('usagenum') > 0:
            used = matches.group('usage').splitlines()
            for use in used:
                usage = usage + getUsage(use)
        if matches.group('catnum') > 0:
            cats = matches.group('cats').splitlines()
            for cat in cats:
                commonshelperCats.append(cat.replace('_', ' '))
                pywikibot.output(u'category : ' + cat)
        if matches.group('galnum') > 0:
            gals = matches.group('gals').splitlines()
            for gal in gals:
                galleries.append(gal.replace('_', ' '))
                pywikibot.output(u'gallery : ' + gal)
    commonshelperCats = list(set(commonshelperCats))
    galleries = list(set(galleries))
    for (lang, project, article) in usage:
        pywikibot.output(lang + project + article)
    return commonshelperCats, usage, galleries
Example #46
    def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None):
        """OCR callback function.

        @return: tuple (error, text [error description in case of error]).
        """
        def identity(x):
            return x

        if not cmd_uri:
            raise ValueError('Parameter cmd_uri is mandatory.')

        if parser_func is None:
            parser_func = identity

        if not callable(parser_func):
            raise TypeError('Keyword parser_func must be callable.')

        if ocr_tool not in self._OCR_METHODS:
            raise TypeError("ocr_tool must be in {}, not '{}'.".format(
                self._OCR_METHODS, ocr_tool))

        # a wrong link fails with an exception
        for retry in range(5, 30, 5):
            pywikibot.debug('{}: get URI {!r}'.format(ocr_tool, cmd_uri),
                            _logger)
            try:
                response = http.fetch(cmd_uri)
            except ReadTimeout as e:
                pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e))
            except Exception as e:
                pywikibot.error('"{}": {}'.format(cmd_uri, e))
                return True, e
            else:
                pywikibot.debug('{}: {}'.format(ocr_tool, response.text),
                                _logger)
                break

            pywikibot.warning('retrying in {} seconds ...'.format(retry))
            time.sleep(retry)
        else:
            return True, ReadTimeout

        if 400 <= response.status_code < 600:
            return True, 'Http response status {}'.format(response.status_code)

        data = json.loads(response.text)

        if ocr_tool == self._PHETOOLS:  # phetools
            assert 'error' in data, 'Error from phetools: %s' % data
            assert data['error'] in [0, 1, 2, 3], \
                'Error from phetools: {}'.format(data)
            error, _text = bool(data['error']), data['text']
        else:  # googleOCR
            if 'error' in data:
                error, _text = True, data['error']
            else:
                error, _text = False, data['text']

        if error:
            pywikibot.error('OCR query %s: %s' % (cmd_uri, _text))
            return error, _text
        else:
            return error, parser_func(_text)
Example #47
    def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None):
        """OCR callback function.

        @return: tuple (error, text [error description in case of error]).
        """
        def identity(x):
            return x

        if not cmd_uri:
            raise ValueError('Parameter cmd_uri is mandatory.')

        if parser_func is None:
            parser_func = identity

        if not callable(parser_func):
            raise TypeError('Keyword parser_func must be callable.')

        if ocr_tool not in self._OCR_METHODS:
            raise TypeError("ocr_tool must be in %s, not '%s'." %
                            (self._OCR_METHODS, ocr_tool))

        # a wrong link fails with an exception
        retry = 0
        while retry < 5:
            pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri),
                            _logger)
            try:
                response = http.fetch(cmd_uri)
            except requests.exceptions.ReadTimeout as e:
                retry += 1
                pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e))
                pywikibot.warning('retrying in %s seconds ...' % (retry * 5))
                time.sleep(retry * 5)
            except Exception as e:
                pywikibot.error('"%s": %s' % (cmd_uri, e))
                return (True, e)
            else:
                pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text),
                                _logger)
                break

        if 400 <= response.status < 600:
            return (True, 'Http response status {0}'.format(response.status))

        data = json.loads(response.text)

        if ocr_tool == self._PHETOOLS:  # phetools
            assert 'error' in data, 'Error from phetools: %s' % data
            assert data['error'] in [0, 1, 2,
                                     3], ('Error from phetools: %s' % data)
            error, _text = bool(data['error']), data['text']
        else:  # googleOCR
            if 'error' in data:
                error, _text = True, data['error']
            else:
                error, _text = False, data['text']

        if error:
            pywikibot.error('OCR query %s: %s' % (cmd_uri, _text))
            return (error, _text)
        else:
            return (error, parser_func(_text))
Example #48
    def run(self):
        """Run bot."""
        def convert_from_bytes(total_bytes):
            for unit in ['B', 'K', 'M', 'G', 'T']:
                if abs(total_bytes) < 1024:
                    return str(total_bytes) + unit
                total_bytes = float(format(total_bytes / 1024.0, '.2f'))
            return str(total_bytes) + 'P'

        pywikibot.output('Downloading dump from ' + self.opt.wikiname)

        download_filename = '{wikiname}-{dumpdate}-{filename}'.format_map(
            self.opt)
        temp_filename = download_filename + '-' + \
            binascii.b2a_hex(urandom(8)).decode('ascii') + '.part'

        file_final_storepath = os.path.join(
            self.opt.storepath, download_filename)
        file_current_storepath = os.path.join(
            self.opt.storepath, temp_filename)

        # https://wikitech.wikimedia.org/wiki/Help:Toolforge#Dumps
        toolforge_dump_filepath = self.get_dump_name(
            self.opt.wikiname, self.opt.filename, self.opt.dumpdate)

        # First iteration for atomic download with temporary file
        # Second iteration for fallback non-atomic download
        for non_atomic in range(2):
            try:
                if toolforge_dump_filepath:
                    pywikibot.output('Symlinking file from '
                                     + toolforge_dump_filepath)
                    if non_atomic:
                        if os.path.exists(file_final_storepath):
                            remove(file_final_storepath)
                    symlink(toolforge_dump_filepath, file_current_storepath)
                else:
                    url = 'https://dumps.wikimedia.org/{}/{}/{}'.format(
                        self.opt.wikiname, self.opt.dumpdate,
                        download_filename)
                    pywikibot.output('Downloading file from ' + url)
                    response = fetch(url, stream=True)

                    if response.status_code != 200:
                        if response.status_code == 404:
                            pywikibot.output(
                                'File with name {filename!r}, from dumpdate '
                                '{dumpdate!r}, and wiki {wikiname!r} ({url}) '
                                "isn't available in the Wikimedia Dumps"
                                .format(url=url, **self.opt))
                        return

                    with open(file_current_storepath, 'wb') as result_file:
                        total = int(response.headers['content-length'])
                        if total == -1:
                            pywikibot.warning("'content-length' missing in "
                                              'response headers')
                        downloaded = 0
                        parts = 50
                        display_string = ''

                        pywikibot.output('')
                        for data in response.iter_content(100 * 1024):
                            result_file.write(data)

                            if total <= 0:
                                continue

                            downloaded += len(data)
                            done = int(parts * downloaded / total)
                            display = map(convert_from_bytes,
                                          (downloaded, total))
                            prior_display = display_string
                            display_string = '\r|{}{}|{}{}/{}'.format(
                                '=' * done,
                                '-' * (parts - done),
                                ' ' * 5,
                                *display)
                            # Add whitespace to cover up prior bar
                            display_string += ' ' * (
                                len(prior_display.rstrip())
                                - len(display_string.rstrip()))

                            pywikibot.output(display_string, newline=False)
                        pywikibot.output('')

                # Rename the temporary file to the target file
                # if the download completes successfully
                if not non_atomic:
                    replace(file_current_storepath, file_final_storepath)
                    break

            except (OSError, IOError):
                pywikibot.exception()

                try:
                    remove(file_current_storepath)
                except (OSError, IOError):
                    pywikibot.exception()

                # If the atomic download fails, try without a temporary file
                # If the non-atomic download also fails, exit the script
                if non_atomic:
                    return

                pywikibot.output('Cannot make temporary file, '
                                 'falling back to non-atomic download')
                file_current_storepath = file_final_storepath

        pywikibot.output('Done! File stored as ' + file_final_storepath)
Example #49
    def run(self):
        """Run bot."""
        pywikibot.output('Downloading dump from ' + self.getOption('wikiname'))

        download_filename = '{wiki_name}-{revision}-{filename}'.format(
            wiki_name=self.getOption('wikiname'),
            revision=self.getOption('revision'),
            filename=self.getOption('filename')
        )
        temp_filename = download_filename + '-' + \
            binascii.b2a_hex(urandom(8)).decode('ascii') + '.part'

        file_final_storepath = os.path.join(
            self.getOption('storepath'), download_filename)
        file_current_storepath = os.path.join(
            self.getOption('storepath'), temp_filename)

        # https://wikitech.wikimedia.org/wiki/Help:Toolforge#Dumps
        toolforge_dump_filepath = self.get_dump_name(
            self.getOption('wikiname'), self.getOption('filename'))

        # First iteration for atomic download with temporary file
        # Second iteration for fallback non-atomic download
        for non_atomic in range(2):
            try:
                if toolforge_dump_filepath:
                    pywikibot.output('Symlinking file from ' +
                                     toolforge_dump_filepath)
                    if non_atomic:
                        if os.path.exists(file_final_storepath):
                            remove(file_final_storepath)
                    symlink(toolforge_dump_filepath, file_current_storepath)
                else:
                    url = 'https://dumps.wikimedia.org/{0}/{1}/{2}'.format(
                        self.getOption('wikiname'),
                        self.getOption('revision'),
                        download_filename)
                    pywikibot.output('Downloading file from ' + url)
                    response = fetch(url, stream=True)
                    if response.status == 200:
                        with open(file_current_storepath, 'wb') as result_file:
                            for data in response.data.iter_content(100 * 1024):
                                result_file.write(data)
                    elif response.status == 404:
                        pywikibot.output(
                            'File with name "{filename}", '
                            'from revision "{revision}", '
                            'and wiki "{wikiname}" ({url}) isn\'t '
                            'available in the Wikimedia Dumps'.format(
                                filename=self.getOption('filename'),
                                revision=self.getOption('revision'),
                                url=url,
                                wikiname=self.getOption('wikiname')))
                        return
                    else:
                        return
                # Rename the temporary file to the target file
                # if the download completes successfully
                if not non_atomic:
                    replace(file_current_storepath, file_final_storepath)
                    break
            except (OSError, IOError):
                pywikibot.exception()

                try:
                    remove(file_current_storepath)
                except (OSError, IOError):
                    pywikibot.exception()

                # If the atomic download fails, try without a temporary file
                # If the non-atomic download also fails, exit the script
                if not non_atomic:
                    pywikibot.output('Cannot make temporary file, ' +
                                     'falling back to non-atomic download')
                    file_current_storepath = file_final_storepath
                else:
                    return False

        pywikibot.output('Done! File stored as ' + file_final_storepath)
        return
Example #50
    def run(self):
        """Run bot."""
        def convert_from_bytes(bytes):
            for unit in ['B', 'K', 'M', 'G', 'T']:
                if abs(bytes) < 1024:
                    return str(bytes) + unit
                bytes = float(format(bytes / 1024.0, '.2f'))
            return str(bytes) + 'P'

        pywikibot.output('Downloading dump from ' + self.getOption('wikiname'))

        download_filename = '{wiki_name}-{dumpdate}-{filename}'.format(
            wiki_name=self.getOption('wikiname'),
            dumpdate=self.getOption('dumpdate'),
            filename=self.getOption('filename'))
        temp_filename = download_filename + '-' + \
            binascii.b2a_hex(urandom(8)).decode('ascii') + '.part'

        file_final_storepath = os.path.join(self.getOption('storepath'),
                                            download_filename)
        file_current_storepath = os.path.join(self.getOption('storepath'),
                                              temp_filename)

        # https://wikitech.wikimedia.org/wiki/Help:Toolforge#Dumps
        toolforge_dump_filepath = self.get_dump_name(
            self.getOption('wikiname'), self.getOption('filename'))

        # First iteration for atomic download with temporary file
        # Second iteration for fallback non-atomic download
        for non_atomic in range(2):
            try:
                if toolforge_dump_filepath:
                    pywikibot.output('Symlinking file from ' +
                                     toolforge_dump_filepath)
                    if non_atomic:
                        if os.path.exists(file_final_storepath):
                            remove(file_final_storepath)
                    symlink(toolforge_dump_filepath, file_current_storepath)
                else:
                    url = 'https://dumps.wikimedia.org/{0}/{1}/{2}'.format(
                        self.getOption('wikiname'), self.getOption('dumpdate'),
                        download_filename)
                    pywikibot.output('Downloading file from ' + url)
                    response = fetch(url, stream=True)
                    if response.status == 200:
                        with open(file_current_storepath, 'wb') as result_file:
                            try:
                                total = int(response.
                                            response_headers['content-length'])
                            except KeyError:
                                pywikibot.exception()
                                total = -1
                            downloaded = 0
                            parts = 50
                            display_string = ''

                            pywikibot.output('')
                            for data in response.data.iter_content(100 * 1024):
                                result_file.write(data)

                                if total > 0:
                                    downloaded += len(data)
                                    done = int(parts * downloaded / total)
                                    display = map(convert_from_bytes,
                                                  (downloaded, total))
                                    prior_display = display_string
                                    display_string = ('\r|{0}{1}|' + ' ' * 5 +
                                                      '{2}/{3}').format(
                                                          '=' * done,
                                                          '-' * (parts - done),
                                                          *display)
                                    # Add whitespace to cover up prior bar
                                    display_string += ' ' * (
                                        len(prior_display.rstrip()) -
                                        len(display_string.rstrip()))

                                    pywikibot.output(display_string,
                                                     newline=False)
                            pywikibot.output('')
                    elif response.status == 404:
                        pywikibot.output(
                            'File with name "{filename}", '
                            'from dumpdate "{dumpdate}", '
                            'and wiki "{wikiname}" ({url}) isn\'t '
                            'available in the Wikimedia Dumps'.format(
                                filename=self.getOption('filename'),
                                dumpdate=self.getOption('dumpdate'),
                                url=url,
                                wikiname=self.getOption('wikiname')))
                        return
                    else:
                        return
                # Rename the temporary file to the target file
                # if the download completes successfully
                if not non_atomic:
                    replace(file_current_storepath, file_final_storepath)
                    break
            except (OSError, IOError):
                pywikibot.exception()

                try:
                    remove(file_current_storepath)
                except (OSError, IOError):
                    pywikibot.exception()

                # If the atomic download fails, try without a temporary file
                # If the non-atomic download also fails, exit the script
                if not non_atomic:
                    pywikibot.output('Cannot make temporary file, ' +
                                     'falling back to non-atomic download')
                    file_current_storepath = file_final_storepath
                else:
                    return False

        pywikibot.output('Done! File stored as ' + file_final_storepath)
        return
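
A side note on the pattern above: the script streams the dump into a randomly named '.part' file and only renames it over the final path once the download completes, so a crash never leaves a half-written dump under the final name. The following is a minimal sketch of that idea with plain requests, outside pywikibot; the helper name, timeout and chunk size are illustrative assumptions, not values taken from the script.

# Minimal sketch (not pywikibot code): stream a URL to a random '.part'
# file, then atomically rename it into place.
import os
from binascii import b2a_hex

import requests


def atomic_download(url, target_path, chunk_size=100 * 1024):
    """Stream url into a temporary file, then rename it over target_path."""
    temp_path = '{}-{}.part'.format(
        target_path, b2a_hex(os.urandom(8)).decode('ascii'))
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    try:
        with open(temp_path, 'wb') as temp_file:
            for chunk in response.iter_content(chunk_size):
                temp_file.write(chunk)
        # os.replace is atomic on the same filesystem, so readers never
        # see a partially written target file.
        os.replace(temp_path, target_path)
    except OSError:
        # Drop the partial file and let the caller handle the error.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
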
Example #51
    def __init__(self, fromurl):
        """
        Constructor.

        @raises ServerError: a server error occurred while loading the site
        @raises Timeout: a timeout occurred while loading the site
        @raises RuntimeError: Version not found or version less than 1.14
        """
        if fromurl.endswith("$1"):
            fromurl = fromurl[:-2]
        r = fetch(fromurl)
        if r.status == 503:
            raise ServerError('Service Unavailable')
        elif r.status == 500:
            raise ServerError('Internal Server Error')

        if fromurl != r.data.url:
            pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url))
            fromurl = r.data.url

        self.fromurl = fromurl

        data = r.content

        wp = WikiHTMLPageParser(fromurl)
        wp.feed(data)

        self.version = wp.version
        self.server = wp.server
        self.scriptpath = wp.scriptpath
        self.articlepath = None

        try:
            self._parse_pre_117(data)
        except Exception as e:
            pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e))

        if self.api:
            try:
                self._parse_post_117()
            except Exception as e:
                pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e))

            if not self.version:
                self._fetch_old_version()

        if not self.api:
            raise RuntimeError('Unsupported url: {0}'.format(self.fromurl))

        if not self.articlepath:
            if self.private_wiki:
                if self.api != self.fromurl:
                    self.articlepath = self.fromurl.rsplit('/', 1)[0] + '/$1'
                else:
                    raise RuntimeError(
                        'Unable to determine articlepath because the wiki is '
                        'private. Use the Main Page URL instead of the API.')
            else:
                raise RuntimeError('Unable to determine articlepath: '
                                   '{0}'.format(self.fromurl))

        if (not self.version or
                self.version < MediaWikiVersion('1.14')):
            raise RuntimeError('Unsupported version: {0}'.format(self.version))
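
For context, a hedged usage sketch: assuming the constructor above is pywikibot.site_detect.MWSite (the detector used by generate_family_file.py), probing a wiki from a bare URL looks roughly like this. The URL below is only an example.

from pywikibot.site_detect import MWSite

try:
    site = MWSite('https://www.mediawiki.org/wiki/$1')
except RuntimeError as error:
    # Raised for unsupported URLs or MediaWiki versions below 1.14,
    # as documented in the docstring above.
    print('Not a usable MediaWiki site:', error)
else:
    print(site.api)          # detected api.php endpoint
    print(site.version)      # detected MediaWiki version
    print(site.articlepath)  # e.g. '/wiki/$1'
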
Example #52
    def read_file_content(self, file_url: str):
        """Return name of temp file in which remote file is saved."""
        pywikibot.output('Reading file ' + file_url)

        handle, tempname = tempfile.mkstemp()
        path = Path(tempname)
        size = 0

        # increasing delays (in seconds) between resume attempts
        dt_gen = (el for el in (15, 30, 45, 60, 120, 180, 240, 300))
        while True:
            file_len = path.stat().st_size
            if file_len:
                pywikibot.output('Download resumed.')
                headers = {'Range': 'bytes={}-'.format(file_len)}
            else:
                headers = {}

            with open(str(path), 'ab') as fd:  # T272345: Python 3.5 needs str
                os.lseek(handle, file_len, 0)
                try:
                    response = http.fetch(file_url,
                                          stream=True,
                                          headers=headers)
                    response.raise_for_status()

                    # get download info, if available
                    # Note: this is not enough to exclude pages
                    #       e.g. 'application/json' is also not a media
                    if 'text/' in response.headers['Content-Type']:
                        raise FatalServerError('The requested URL was not '
                                               'found on server.')
                    size = max(size,
                               int(response.headers.get('Content-Length', 0)))

                    # stream content to temp file (in chunks of 1Mb)
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        fd.write(chunk)

                # raised from connection lost during response.iter_content()
                except requests.ConnectionError:
                    fd.flush()
                    pywikibot.output('Connection closed at byte {}'.format(
                        path.stat().st_size))
                # raised from response.raise_for_status()
                except requests.HTTPError as e:
                    # exit criteria if size is not available
                    # error on last iteration is OK, we're requesting
                    #    {'Range': 'bytes=file_len-'}
                    err = HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE
                    if response.status_code == err and path.stat().st_size:
                        break
                    raise FatalServerError(str(e)) from e

            if size and size == path.stat().st_size:
                break
            try:
                dt = next(dt_gen)
                pywikibot.output('Sleeping for {} seconds ...'.format(dt))
                pywikibot.sleep(dt)
            except StopIteration:
                raise FatalServerError('Download failed, too many retries!')

        pywikibot.output('Downloaded {} bytes'.format(path.stat().st_size))
        return tempname
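
The method above resumes an interrupted transfer by re-requesting only the missing bytes with an HTTP Range header, sleeping longer between each retry, and treating a 416 (Requested Range Not Satisfiable) answer on a non-empty file as completion. A self-contained sketch of the same idea with plain requests follows; the retry delays, timeout, chunk size and completion checks are assumptions modelled loosely on the code above, not an exact reimplementation.

# Minimal resume-with-Range sketch (not the pywikibot method itself).
import os
import tempfile
import time
from http import HTTPStatus

import requests


def download_resumable(url, delays=(0, 15, 30, 60, 120, 300)):
    """Download url to a temp file, resuming after dropped connections."""
    handle, tempname = tempfile.mkstemp()
    os.close(handle)
    expected = 0
    for delay in delays:
        time.sleep(delay)
        offset = os.path.getsize(tempname)
        # Ask only for the bytes we do not have yet.
        headers = {'Range': 'bytes={}-'.format(offset)} if offset else {}
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        if offset and (response.status_code
                       == HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE):
            break  # requested range is past the end: file already complete
        response.raise_for_status()
        # Content-Length of a ranged (206) response covers only the rest.
        expected = max(expected,
                       offset + int(response.headers.get('Content-Length', 0)))
        try:
            with open(tempname, 'ab') as fd:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    fd.write(chunk)
        except requests.ConnectionError:
            continue  # resume from the new offset on the next pass
        if not expected or os.path.getsize(tempname) >= expected:
            break
    else:
        raise OSError('download still incomplete after all retries')
    return tempname
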
Example #53
    def test_http(self):
        """Test with http, standard http interface for pywikibot."""
        r = http.fetch(uri=self.url)

        self.assertEqual(r.raw, self.png)
Example #54
    def test_http_504(self):
        """Test that an HTTP 504 raises the correct exception."""
        with self.assertRaisesRegex(
                Server504Error, r'Server ([^\:]+|[^\:]+:[0-9]+)'
                r' timed out'):
            http.fetch(self.get_httpbin_url('/status/504'))