def getPageAndHeaders(url, contextFactory=None, *args, **kwargs):
    """Fetch *url* and fire with a ``(body, headers)`` tuple.

    Thin wrapper around ``client._makeGetterFactory`` that exposes the
    factory's ``response_headers`` alongside the page body.
    """
    getter = client._makeGetterFactory(
        url, client.HTTPClientFactory,
        contextFactory=contextFactory, *args, **kwargs)

    def _with_headers(body):
        # Pair the body with the headers captured on the factory.
        return (body, getter.response_headers)

    return getter.deferred.addCallback(_with_headers)
def quiet_get_page(url, contextFactory=None, *args, **kwargs):
    """A version of getPage that uses QuietHTTPClientFactory.

    Returns the factory's deferred, which fires with the page body.
    """
    getter = _makeGetterFactory(
        url,
        QuietHTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)
    return getter.deferred
def downloadFile(url, file, statusCallback=None, bucketFilter=None,
                 contextFactory=None, *args, **kwargs):
    """Download *url* into *file* via HTTPManagedDownloader.

    statusCallback and bucketFilter are forwarded to the downloader; the
    returned deferred fires when the transfer completes.
    """
    def buildDownloader(u, *a, **kw):
        # Factory-factory: bake the target file and progress hooks in.
        return HTTPManagedDownloader(u, file,
                                     statusCallback=statusCallback,
                                     bucketFilter=bucketFilter,
                                     *a, **kw)

    getter = _makeGetterFactory(url, buildDownloader,
                                contextFactory=contextFactory,
                                *args, **kwargs)
    return getter.deferred
def getPageAndHeaders(url, contextFactory=None, *args, **kwargs):
    """Like ``client.getPage`` but also yields the response headers.

    The returned deferred fires with ``(page, response_headers)``.
    """
    f = client._makeGetterFactory(url, client.HTTPClientFactory,
                                  contextFactory=contextFactory,
                                  *args, **kwargs)
    d = f.deferred
    d.addCallback(lambda body: (body, f.response_headers))
    return d
def fetch(url, contextFactory=None, *args, **kwargs):
    """Retrieve *url*; on failure fire with a Page carrying the error text."""
    def on_failure(failure):
        # Convert the failure into an error-bearing Page instead of
        # letting it propagate.
        return Page(error=failure.getErrorMessage())

    factory = client._makeGetterFactory(url, HTTPClientFactory,
                                        contextFactory=contextFactory,
                                        *args, **kwargs)
    deferred = factory.deferred
    deferred.addErrback(on_failure)
    return deferred
def makeUpstreamGetter(upstream):
    # Build a getter for one upstream entry and tag its result.
    #
    # `upstream` is a 4-tuple: (identifier, url, args, kwargs).
    # NOTE(review): `factory` and `context_factory` are not defined in this
    # function -- presumably closed over from an enclosing scope; confirm.
    identifier, url, args, kwargs = upstream
    subfactory = client._makeGetterFactory(url, factory, context_factory,
                                           *args, **kwargs)
    # addBoth tags both success and failure results with the identifier
    # and the factory that produced them.
    subfactory.deferred.addBoth(lambda x: (x, identifier, subfactory))
    return subfactory.deferred
def _request_cookies(self, url, method="GET", data=None):
    """Issue a request against base_url + url; fire with (body, cookies).

    Errors are routed through self._log_errback before propagating.
    """
    getter = _makeGetterFactory(self.base_url + url, HTTPClientFactory,
                                method=method, postdata=data,
                                cookies=self.cookies)
    d = getter.deferred
    d.addErrback(self._log_errback)
    # Pair the body with whatever cookies the factory accumulated.
    d.addCallback(lambda body: (body, getter.cookies))
    return d
def getTwitterStream(url, username, password, contextFactory=None,
                     *args, **kwargs):
    """Open a streaming-API connection authenticated with HTTP Basic auth.

    Returns the deferred of a StreamingTwitterClientFactory getter.

    Fix: ``base64.encodestring`` is deprecated (removed in Python 3.9)
    and appends a trailing newline, which corrupts the Authorization
    header value; use ``b64encode`` instead.
    """
    credentials = "%s:%s" % (username, password)
    encoded_auth = base64.b64encode(
        credentials.encode("utf-8")).decode("ascii")
    authorization_header = "Basic %s" % encoded_auth
    kwargs.update({'headers': {'authorization': authorization_header}})
    return _makeGetterFactory(
        url, StreamingTwitterClientFactory,
        contextFactory=contextFactory, *args, **kwargs).deferred
def getPage(url, contextFactory=None, *args, **kwargs):
    """Adapted version of twisted.web.client.getPage."""
    from twisted.web.client import _makeGetterFactory

    def build_factory(*fargs, **fkwargs):
        # Pull out our private 'timeout' kwarg before constructing Request.
        timeout = fkwargs.pop('timeout', 0)
        factory = client.ScrapyHTTPClientFactory(
            Request(*fargs, **fkwargs), timeout=timeout)
        # Fire with just the response body, as getPage does.
        factory.deferred.addCallback(lambda response: response.body)
        return factory

    return _makeGetterFactory(url, build_factory,
                              contextFactory=contextFactory,
                              *args, **kwargs).deferred
def start(self):
    """Kick off the managed download; returns True once queued."""
    self.original_mimetype = self.download.mime_type
    self.download.status = Status.STARTING
    # Burst size 0; throttled by the manager's global rate filter.
    rate_filter = ThrottledBucketFilter(
        0, self.manager.get_download_rate_filter())

    def make_downloader(url, *a, **kw):
        return HTTPManagedDownloader(
            str(self.download.url),
            os.path.join(self.directory, self.download.filename),
            statusCallback=DownloadStatus(self.download),
            bucketFilter=rate_filter,
            *a, **kw)

    self.factory = _makeGetterFactory(str(self.download.url), make_downloader)
    self.factory.deferred.addCallback(self.check_mimetype)
    self.factory.deferred.addErrback(self.errback)
    return True
def get_page(self, url, contextFactory=None, *args, **kwargs):
    '''Adapted version of twisted.web.client.getPage'''
    from twisted.web.client import _makeGetterFactory

    def make_client(*cargs, **ckwargs):
        # Private kwargs are consumed here rather than passed to Request.
        timeout = ckwargs.pop('timeout', 0)
        download_size = ckwargs.pop('download_size', 0)
        f = CrawlmiHTPPClientFactory(Request(*cargs, **ckwargs),
                                     timeout=timeout,
                                     download_size=download_size)
        f.deferred.addCallback(lambda response: response.body)
        return f

    return _makeGetterFactory(url, make_client,
                              contextFactory=contextFactory,
                              *args, **kwargs).deferred
def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    from twisted.web.client import _makeGetterFactory

    def make_client(client_url, *cargs, **ckwargs):
        decoded_url = to_unicode(client_url)
        timeout = ckwargs.pop('timeout', 0)
        f = client.ScrapyHTTPClientFactory(
            Request(decoded_url, *cargs, **ckwargs), timeout=timeout)
        # Default transform extracts the response body only.
        f.deferred.addCallback(response_transform or (lambda r: r.body))
        return f

    return _makeGetterFactory(to_bytes(url), make_client,
                              contextFactory=contextFactory,
                              *args, **kwargs).deferred
def test_infiniteRedirection(self):
    """
    When more than C{redirectLimit} HTTP redirects are encountered, the
    page request fails with L{InfiniteRedirection}.
    """
    factory = client._makeGetterFactory(
        self.getURL("infiniteRedirect"),
        client.HTTPClientFactory,
        redirectLimit=13)

    def assertCounts(*ignored):
        self.assertEqual(factory._redirectCount, 13)
        self.assertEqual(self.infiniteRedirectResource.count, 13)

    deferred = self.assertFailure(factory.deferred,
                                  error.InfiniteRedirection)
    deferred.addCallback(assertCounts)
    return deferred
def getPageAndHeaders(url, contextFactory=None, *args, **kwargs):
    """Return deferred with a (body, headers) success result.

    This is a small modification to twisted.web.client.getPage that
    allows the caller access to the response headers.
    """
    getter = txwebclient._makeGetterFactory(
        url,
        txwebclient.HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)

    def attach_headers(page):
        return (page, getter.response_headers)

    return getter.deferred.addCallback(attach_headers)
def getPage(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Returns a deferred that fires with the page (as a string) or
    errbacks with a description of the error.  See HTTPClientFactory
    for the extra arguments that may be passed.
    """
    # Identify ourselves with a fixed agent string.
    kwargs['agent'] = "Coherence PageGetter"
    factory = client._makeGetterFactory(url, HeaderAwareHTTPClientFactory,
                                        contextFactory=contextFactory,
                                        *args, **kwargs)
    return factory.deferred
def subgen():
    """Try each upstream in turn; yield the first successful result.

    Yields ``(result, identifier, subfactory)`` for the first upstream
    that answers, then stops.  If every upstream fails with
    ConnectError, re-raises the last such error (or a generic
    INTERNAL_SERVER_ERROR if none was recorded).
    """
    lastError = None
    for (identifier, url, args, kwargs) in upstreams:
        subfactory = client._makeGetterFactory(url, factory,
                                               context_factory,
                                               *args, **kwargs)
        wait = defer.waitForDeferred(subfactory.deferred)
        yield wait
        try:
            yield (wait.getResult(), identifier, subfactory)
            return
        except ConnectError as exc:
            # Remember the failure and fall through to the next upstream.
            lastError = exc
    # Fix: replaced the fragile `X and X or Y` idiom (and the py2-only
    # sys.exc_info()[1] capture above) with the equivalent, clearer form.
    raise lastError or error.Error(http.INTERNAL_SERVER_ERROR)
def test_infiniteRedirection(self):
    """
    When more than C{redirectLimit} HTTP redirects are encountered, the
    page request fails with L{InfiniteRedirection}.
    """
    def checkRedirectCount(*a):
        # Fix: assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(f._redirectCount, 13)
        self.assertEqual(self.infiniteRedirectResource.count, 13)

    f = client._makeGetterFactory(self.getURL('infiniteRedirect'),
                                  client.HTTPClientFactory,
                                  redirectLimit=13)
    d = self.assertFailure(f.deferred, error.InfiniteRedirection)
    d.addCallback(checkRedirectCount)
    return d
def test_downloadHeaders(self):
    """
    After L{client.HTTPDownloader.deferred} fires, the
    L{client.HTTPDownloader} instance's C{status} and C{response_headers}
    attributes are populated with the values from the response.
    """
    downloader = client._makeGetterFactory(
        self.getURL("file"),
        client.HTTPDownloader,
        fileOrName=self.mktemp())

    def verify(ignored):
        self.assertEqual(downloader.status, b"200")
        self.assertEqual(
            downloader.response_headers[b"content-type"][0], b"text/html")
        self.assertEqual(
            downloader.response_headers[b"content-length"][0], b"10")
        # Clean up the temp file the downloader wrote.
        os.unlink(downloader.fileName)

    return downloader.deferred.addCallback(verify)
def test_downloadRedirectLimit(self):
    """
    When more than C{redirectLimit} HTTP redirects are encountered, the
    page request fails with L{InfiniteRedirection}.
    """
    def checkRedirectCount(*a):
        # Fix: assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(f._redirectCount, 7)
        self.assertEqual(self.infiniteRedirectResource.count, 7)

    f = client._makeGetterFactory(self.getURL('infiniteRedirect'),
                                  client.HTTPDownloader,
                                  fileOrName=self.mktemp(),
                                  redirectLimit=7)
    d = self.assertFailure(f.deferred, error.InfiniteRedirection)
    d.addCallback(checkRedirectCount)
    return d
def test_downloadHeaders(self):
    """
    After L{client.HTTPDownloader.deferred} fires, the
    L{client.HTTPDownloader} instance's C{status} and C{response_headers}
    attributes are populated with the values from the response.
    """
    factory = client._makeGetterFactory(
        self.getURL('file'), client.HTTPDownloader,
        fileOrName=self.mktemp())

    def checkHeaders(_):
        self.assertEqual(factory.status, b'200')
        headers = factory.response_headers
        self.assertEqual(headers[b'content-type'][0], b'text/html')
        self.assertEqual(headers[b'content-length'][0], b'10')
        # Remove the file the downloader created.
        os.unlink(factory.fileName)

    factory.deferred.addCallback(checkHeaders)
    return factory.deferred
def test_downloadCookies(self):
    """
    The C{cookies} dict passed to the L{client.HTTPDownloader}
    initializer is used to populate the I{Cookie} header included in the
    request sent to the server.
    """
    target = self.mktemp()
    factory = client._makeGetterFactory(
        self.getURL("cookiemirror"),
        client.HTTPDownloader,
        fileOrName=target,
        cookies={b"foo": b"bar"})

    def cbFinished(ignored):
        # The mirror resource echoes the parsed cookie pairs.
        self.assertEqual(FilePath(target).getContent(), "[('foo', 'bar')]")

    return factory.deferred.addCallback(cbFinished)
def test_downloadRedirectLimit(self):
    """
    When more than C{redirectLimit} HTTP redirects are encountered, the
    page request fails with L{InfiniteRedirection}.
    """
    downloader = client._makeGetterFactory(
        self.getURL("infiniteRedirect"),
        client.HTTPDownloader,
        fileOrName=self.mktemp(),
        redirectLimit=7)

    def verifyCount(*ignored):
        self.assertEqual(downloader._redirectCount, 7)
        self.assertEqual(self.infiniteRedirectResource.count, 7)

    failing = self.assertFailure(downloader.deferred,
                                 error.InfiniteRedirection)
    return failing.addCallback(verifyCount)
def getPage(url, contextFactory=None, *args, **kwargs):
    '''
    Download a web page as a string.

    Download a page. Return a deferred, which will callback with a page
    (as a string) or errback with a description of the error.

    See :obj:`twisted.web.client.HTTPClientFactory` to see what extra
    args can be passed.

    .. note:: This function is like `twisted.web.client.getPage`, except
              it uses our HeaderAwareHTTPClientFactory instead of
              HTTPClientFactory and sets the user agent.
    '''
    url_bytes = to_bytes(url)
    # Fix: removed dead `if args is None` / `if kwargs is None` guards --
    # *args/**kwargs are always a tuple/dict, never None.
    if 'headers' in kwargs and 'user-agent' in kwargs['headers']:
        kwargs['agent'] = kwargs['headers']['user-agent']
    elif 'agent' not in kwargs:
        kwargs['agent'] = 'Coherence PageGetter'
    new_kwargs = {}
    for k, v in kwargs.items():
        if k == 'headers':
            # Header names and values must be bytes for twisted.
            new_kwargs[k] = {
                to_bytes(kh): to_bytes(vh)
                for kh, vh in kwargs['headers'].items()
            }
        else:
            new_kwargs[k] = v
    logger.info(f'getPage [url]: {url} [type: {type(url)}]')
    logger.debug(f'\t->[args]: {args} [type: {type(args)}]')
    logger.debug(f'\t->[kwargs]: {kwargs}')
    # Fix: dropped the stray ']' that unbalanced this log message.
    logger.debug(f'\t->[new_kwargs]: {new_kwargs}')
    return client._makeGetterFactory(
        url_bytes,
        HeaderAwareHTTPClientFactory,
        contextFactory=contextFactory,
        *args,
        **new_kwargs,
    ).deferred
def getPageWebClient(self, url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    COPY OF twisted.web.client.getPage to store the factory.

    Returns a deferred which fires with the page (as a string) or
    errbacks with a description of the error.  See L{HTTPClientFactory}
    for the extra arguments that can be passed.
    """
    # Keep the factory around so callers can inspect it later.
    self.httpGetterFactory = _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)
    return self.httpGetterFactory.deferred
def test_downloadCookies(self):
    """
    The C{cookies} dict passed to the L{client.HTTPDownloader}
    initializer is used to populate the I{Cookie} header included in the
    request sent to the server.
    """
    resultPath = self.mktemp()
    factory = client._makeGetterFactory(
        self.getURL('cookiemirror'),
        client.HTTPDownloader,
        fileOrName=resultPath,
        cookies={'foo': 'bar'})

    def cbFinished(ignored):
        # The mirror resource writes back the cookie pairs it received.
        self.assertEqual(
            FilePath(resultPath).getContent(), "[('foo', 'bar')]")

    return factory.deferred.addCallback(cbFinished)
def getPage(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Download a page. Return a deferred, which will callback with a page
    (as a string) or errback with a description of the error.

    See HTTPClientFactory to see what extra args can be passed.

    This function is like twisted.web.client.getPage, except it uses our
    HeaderAwareHTTPClientFactory instead of HTTPClientFactory and sets
    the user agent.
    """
    # Propagate an explicit user-agent header into the factory's `agent`
    # argument; otherwise fall back to a fixed default.
    if 'headers' in kwargs and 'user-agent' in kwargs['headers']:
        kwargs['agent'] = kwargs['headers']['user-agent']
    elif 'agent' not in kwargs:  # fix: idiomatic `not in` test
        kwargs['agent'] = "Coherence PageGetter"
    return client._makeGetterFactory(url.encode('utf-8'),
                                     HeaderAwareHTTPClientFactory,
                                     contextFactory=contextFactory,
                                     *args, **kwargs).deferred
def getPage(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Download a page. Return a deferred, which will callback with a page
    (as a string) or errback with a description of the error.

    See HTTPClientFactory to see what extra args can be passed.

    This function is like twisted.web.client.getPage, except it uses our
    HeaderAwareHTTPClientFactory instead of HTTPClientFactory and sets
    the user agent.
    """
    # Propagate an explicit user-agent header into the factory's `agent`
    # argument; otherwise fall back to a fixed default.
    if 'headers' in kwargs and 'user-agent' in kwargs['headers']:
        kwargs['agent'] = kwargs['headers']['user-agent']
    elif 'agent' not in kwargs:  # fix: idiomatic `not in` test
        kwargs['agent'] = "Coherence PageGetter"
    return client._makeGetterFactory(
        url, HeaderAwareHTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs).deferred
def get_page_wDigest(uri_auth_tuples, digestMgr=None, goodSSLCertHandler=None,
                     session=None, *args, **kwargs):
    """Get the "page", performing the HTTP request, maybe with DIGEST auth.

    @param uri_auth_tuples: The iterable over the tuples
        (URI, realm, SSL context) which might be used to attempt to access
        the peer.
    @type uri_auth_tuples: deque

    @type session: IOutgoingSession, NoneType

    @precondition: consists_of(uri_auth_tuples, tuple) # uri_auth_tuples

    @returns: C{Deferred} object, whose {.callback} gets called when the body
        contents is received B{completely}.
        Its C{.errback} may be called with C{error.ConnectionRefusedError()}
        if there are no more URLs to try for this host, so the connection
        cannot be established at all.
        Also, if any other error occurs during the message processing
        (except C{error.ConnectError()}, C{error.UserError()}, and
        C{SSL.Error()} which are handled internally), it may be used for
        errback as well.
    @rtype: defer.Deferred
    """
    # NOTE(review): this asserts the *name* `in_main_thread` is truthy; if
    # it is a callable rather than a bool, the assert can never fail --
    # confirm intent.
    assert in_main_thread
    if uri_auth_tuples:
        # There are still URIs in the list, so we may proceed.
        _uri, _realm, ssl_ctx_factory = uri_auth_tuples[0]
        if session is not None:
            session.connecting(_realm)
        if __debug__:
            logger.debug('Sending message to peer %s(%s)/%s',
                         _uri, _realm, uri_auth_tuples)
        assert _uri.startswith('https://'), \
            "{!r} doesn't start from a valid protocol name!".format(_uri)
        factory = _makeGetterFactory(str(_uri),  # Explicitly convert to str
                                     HTTPClientFactory_wDigest,
                                     contextFactory=ssl_ctx_factory,
                                     factory_contextFactory=ssl_ctx_factory,
                                     digestMgr=digestMgr,
                                     goodSSLCertHandler=goodSSLCertHandler,
                                     session=session,
                                     realm=_realm,
                                     *args, **kwargs)

        @exceptions_logged(logger)
        def get_page_success_handler(body):
            """Report success to the session and pass on (body, factory).

            @note: Deferred callback, exceptions logged.
            """
            if session is not None:
                session.connection_completed()
            return (body, factory)

        @exceptions_logged(logger)
        @contract_epydoc
        def get_page_error_handler(failure):
            """Retry with the next URI when the error class allows it.

            @type failure: Failure

            @note: Deferred errback, exceptions logged.
            """
            # Checking for the most general ConnectError/UserError errors,
            # rather than for their particular kinds.
            # Also, all SSL validation errors cause to switch the URL as well.
            if failure.check(error.ConnectError, error.UserError, SSL.Error):
                # Let's try to reconnect using the remaining URIs.
                uri_auth_tuples.popleft()
                logger.debug('Connection refused (%r): %r',
                             failure, failure.getErrorMessage())
                logger.verbose('Connection error traceback: %s',
                               failure.getTraceback(detail='verbose'))
                logger.debug('Reconnecting using %r', uri_auth_tuples)
                # Recursive retry against the shrunken deque.
                return get_page_wDigest(uri_auth_tuples=uri_auth_tuples,
                                        digestMgr=digestMgr,
                                        goodSSLCertHandler=goodSSLCertHandler,
                                        session=session,
                                        *args, **kwargs)
            elif failure.check(error.ConnectionDone):
                # Even though completed more-or-less normally,
                # still return a failure, but don't retry.
                if session is not None:
                    session.connection_completed()
            elif failure.check(error.ConnectionLost):
                # And this is bad! But don't retry either.
                if session is not None:
                    session.connection_failed()
            return failure

        d = factory.deferred
        d.addCallbacks(get_page_success_handler, get_page_error_handler)
        return d
    else:
        # No more URIs to try.
        if session is not None:
            session.connection_failed()
        return defer.fail(error.ConnectionRefusedError('All URIs unavailable'))
def getPage(method, url, accessToken=None):
    """Issue a *method* request to *url*, optionally with token auth.

    Returns the factory's deferred, which fires with the response body.
    """
    headers = {}
    if accessToken is not None:
        # GitHub-style token authentication header.
        headers["Authorization"] = "token {0}".format(accessToken)
    getter = _makeGetterFactory(url, _HTTPClientFactory,
                                headers=headers, method=method)
    return getter.deferred
def myGetPage(url, contextFactory=None, *args, **kwargs):
    """Build and return the HTTP getter factory for *url*.

    Note: returns the factory itself, not its deferred.
    """
    getter = _makeGetterFactory(url, HTTPClientFactory,
                                contextFactory=contextFactory,
                                *args, **kwargs)
    return getter
def myGetPage(url, contextFactory=None, *args, **kwargs):
    """Return the configured HTTPClientFactory getter for *url*.

    Unlike twisted's getPage, the caller receives the factory rather
    than its deferred.
    """
    factory = _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)
    return factory
def downloadWithProgress2(url):
    """Start an HttpStreamClient download of *url*; return its deferred."""
    def makeClient(*ignored):
        # _makeGetterFactory passes (url, ...); we only need our own url.
        return HttpStreamClient(url)

    return client._makeGetterFactory(url, makeClient).deferred
def getPage(url, contextFactory=None, *args, **kwargs):
    """Fetch *url*, returning a deferred that fires with the page body."""
    getter = client._makeGetterFactory(url, HTTPClientFactory,
                                       contextFactory=contextFactory,
                                       *args, **kwargs)
    return getter.deferred