def _get_agent(self, request, timeout):
    """Return the Twisted agent used to download *request*.

    Honours the per-request ``bindaddress`` and ``proxy`` meta keys:
    HTTPS through a proxy uses a CONNECT tunnel, plain HTTP through a
    proxy uses a proxy agent over a TCP endpoint, and everything else
    uses the regular pooled agent.
    """
    bindaddress = request.meta.get("bindaddress") or self._bindAddress
    proxy = request.meta.get("proxy")
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        # FIX: _parse() returns bytes on Python 3, so the original
        # proxyParams.find("noconnect") raised TypeError and
        # scheme == "https" was always False (silently disabling the
        # CONNECT tunnel).  Compare against bytes; on Python 2 bytes is
        # str, so behavior there is unchanged.
        omitConnectTunnel = b"noconnect" in proxyParams
        if scheme == b"https" and not omitConnectTunnel:
            # Tunnel TLS through the proxy via an HTTP CONNECT request.
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get("Proxy-Authorization", None))
            return self._TunnelingAgent(
                reactor,
                proxyConf,
                contextFactory=self._contextFactory,
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )
        else:
            # Plain HTTP proxying: connect straight to the proxy.
            endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                                          timeout=timeout,
                                          bindAddress=bindaddress)
            return self._ProxyAgent(endpoint)
    return self._Agent(
        reactor,
        contextFactory=self._contextFactory,
        connectTimeout=timeout,
        bindAddress=bindaddress,
        pool=self._pool,
    )
def _get_agent(self, request, timeout):
    """Return a SOCKS5 agent when the request has a ``proxy`` meta key,
    otherwise the regular pooled agent.
    """
    bindAddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        # Only host/port of the proxy URL are needed for the endpoint.
        # (Removed a dead second _parse(request.url) unpack whose
        # host/port/params results were never used.)
        _, _, proxyHost, proxyPort, _ = _parse(proxy)
        proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                                           timeout=timeout,
                                           bindAddress=bindAddress)
        return SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint)
    return self._Agent(reactor, contextFactory=self._contextFactory,
                       connectTimeout=timeout, bindAddress=bindAddress,
                       pool=self._pool)
def _get_agent(self, request, timeout):
    """Return the Twisted agent for *request*.

    Proxy handling: HTTPS requests go through a CONNECT tunnel unless
    the proxy URL carries a ``noconnect`` flag (deprecated); other
    proxied requests are handed to the proxy agent as a full proxy URI.
    """
    from twisted.internet import reactor
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(
            proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        # _parse() returns bytes components, hence the b'' comparison.
        omitConnectTunnel = b'noconnect' in proxyParams
        if omitConnectTunnel:
            warnings.warn(
                "Using HTTPS proxies in the noconnect mode is deprecated. "
                "If you use Zyte Smart Proxy Manager, it doesn't require "
                "this mode anymore, so you should update scrapy-crawlera "
                "to scrapy-zyte-smartproxy and remove '?noconnect' "
                "from the Zyte Smart Proxy Manager URL.",
                ScrapyDeprecationWarning,
            )
        if scheme == b'https' and not omitConnectTunnel:
            # Tunnel TLS through the proxy with an HTTP CONNECT request.
            proxyAuth = request.headers.get(b'Proxy-Authorization', None)
            proxyConf = (proxyHost, proxyPort, proxyAuth)
            return self._TunnelingAgent(
                reactor=reactor,
                proxyConf=proxyConf,
                contextFactory=self._contextFactory,
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )
        else:
            # Rebuild a normalized proxy URI for the proxy agent.
            # NOTE: proxyParams is the path/query component of the proxy
            # URL here (fifth element of _parse), despite the name.
            proxyScheme = proxyScheme or b'http'
            # NOTE(review): proxyHost/proxyPort re-encoded below appear
            # unused afterwards (proxyURI is built from proxyNetloc) —
            # confirm before removing.
            proxyHost = to_bytes(proxyHost, encoding='ascii')
            proxyPort = to_bytes(str(proxyPort), encoding='ascii')
            proxyURI = urlunparse(
                (proxyScheme, proxyNetloc, proxyParams, '', '', ''))
            return self._ProxyAgent(
                reactor=reactor,
                proxyURI=to_bytes(proxyURI, encoding='ascii'),
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )
    # No proxy: plain pooled agent.
    return self._Agent(
        reactor=reactor,
        contextFactory=self._contextFactory,
        connectTimeout=timeout,
        bindAddress=bindaddress,
        pool=self._pool,
    )
def testParse(self):
    """Verify client._parse() splits URLs into (scheme, netloc, host, port, path).

    Covers default ports (80/443), explicit ports, fragments being
    stripped, empty paths normalised to "/", hostnames vs IP literals,
    and URLs with trailing whitespace.
    """
    lip = "127.0.0.1"
    tests = (
        ("http://127.0.0.1?c=v&c2=v2#fragment", ("http", lip, lip, 80, "/?c=v&c2=v2")),
        ("http://127.0.0.1/?c=v&c2=v2#fragment", ("http", lip, lip, 80, "/?c=v&c2=v2")),
        ("http://127.0.0.1/foo?c=v&c2=v2#frag", ("http", lip, lip, 80, "/foo?c=v&c2=v2")),
        ("http://127.0.0.1:100?c=v&c2=v2#fragment", ("http", lip + ":100", lip, 100, "/?c=v&c2=v2")),
        ("http://127.0.0.1:100/?c=v&c2=v2#frag", ("http", lip + ":100", lip, 100, "/?c=v&c2=v2")),
        ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ("http", lip + ":100", lip, 100, "/foo?c=v&c2=v2")),
        ("http://127.0.0.1", ("http", lip, lip, 80, "/")),
        ("http://127.0.0.1/", ("http", lip, lip, 80, "/")),
        ("http://127.0.0.1/foo", ("http", lip, lip, 80, "/foo")),
        ("http://127.0.0.1?param=value", ("http", lip, lip, 80, "/?param=value")),
        ("http://127.0.0.1/?param=value", ("http", lip, lip, 80, "/?param=value")),
        ("http://127.0.0.1:12345/foo", ("http", lip + ":12345", lip, 12345, "/foo")),
        ("http://spam:12345/foo", ("http", "spam:12345", "spam", 12345, "/foo")),
        ("http://spam.test.org/foo", ("http", "spam.test.org", "spam.test.org", 80, "/foo")),
        ("https://127.0.0.1/foo", ("https", lip, lip, 443, "/foo")),
        ("https://127.0.0.1/?param=value", ("https", lip, lip, 443, "/?param=value")),
        ("https://127.0.0.1:12345/", ("https", lip + ":12345", lip, 12345, "/")),
        ("http://scrapytest.org/foo ", ("http", "scrapytest.org", "scrapytest.org", 80, "/foo")),
        ("http://egg:7890 ", ("http", "egg:7890", "egg", 7890, "/")),
    )
    for url, test in tests:
        # FIX: assertEquals is a deprecated alias (removed in Python
        # 3.12); use assertEqual.
        self.assertEqual(client._parse(url), test, url)
def testParse(self):
    """client._parse() should yield (scheme, netloc, host, port, path),
    returning every non-integer component as bytes."""
    lip = '127.0.0.1'
    tests = (
        ("http://127.0.0.1?c=v&c2=v2#fragment", ('http', lip, lip, 80, '/?c=v&c2=v2')),
        ("http://127.0.0.1/?c=v&c2=v2#fragment", ('http', lip, lip, 80, '/?c=v&c2=v2')),
        ("http://127.0.0.1/foo?c=v&c2=v2#frag", ('http', lip, lip, 80, '/foo?c=v&c2=v2')),
        ("http://127.0.0.1:100?c=v&c2=v2#fragment", ('http', lip + ':100', lip, 100, '/?c=v&c2=v2')),
        ("http://127.0.0.1:100/?c=v&c2=v2#frag", ('http', lip + ':100', lip, 100, '/?c=v&c2=v2')),
        ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ('http', lip + ':100', lip, 100, '/foo?c=v&c2=v2')),
        ("http://127.0.0.1", ('http', lip, lip, 80, '/')),
        ("http://127.0.0.1/", ('http', lip, lip, 80, '/')),
        ("http://127.0.0.1/foo", ('http', lip, lip, 80, '/foo')),
        ("http://127.0.0.1?param=value", ('http', lip, lip, 80, '/?param=value')),
        ("http://127.0.0.1/?param=value", ('http', lip, lip, 80, '/?param=value')),
        ("http://127.0.0.1:12345/foo", ('http', lip + ':12345', lip, 12345, '/foo')),
        ("http://spam:12345/foo", ('http', 'spam:12345', 'spam', 12345, '/foo')),
        ("http://spam.test.org/foo", ('http', 'spam.test.org', 'spam.test.org', 80, '/foo')),
        ("https://127.0.0.1/foo", ('https', lip, lip, 443, '/foo')),
        ("https://127.0.0.1/?param=value", ('https', lip, lip, 443, '/?param=value')),
        ("https://127.0.0.1:12345/", ('https', lip + ':12345', lip, 12345, '/')),
        ("http://scrapytest.org/foo ", ('http', 'scrapytest.org', 'scrapytest.org', 80, '/foo')),
        ("http://egg:7890 ", ('http', 'egg:7890', 'egg', 7890, '/')),
    )
    for url, expected in tests:
        # Encode the expected string components to bytes, leaving the
        # integer port untouched, then compare against the parse result.
        expected = tuple(
            item if isinstance(item, int) else to_bytes(item)
            for item in expected)
        self.assertEqual(client._parse(url), expected, url)
def testParse(self):
    """Verify client._parse() splits URLs into (scheme, netloc, host, port, path).

    Covers default ports (80/443), explicit ports, fragments being
    stripped, empty paths normalised to '/', hostnames vs IP literals,
    and URLs with trailing whitespace.
    """
    lip = '127.0.0.1'
    tests = (
        ("http://127.0.0.1?c=v&c2=v2#fragment", ('http', lip, lip, 80, '/?c=v&c2=v2')),
        ("http://127.0.0.1/?c=v&c2=v2#fragment", ('http', lip, lip, 80, '/?c=v&c2=v2')),
        ("http://127.0.0.1/foo?c=v&c2=v2#frag", ('http', lip, lip, 80, '/foo?c=v&c2=v2')),
        ("http://127.0.0.1:100?c=v&c2=v2#fragment", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
        ("http://127.0.0.1:100/?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
        ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/foo?c=v&c2=v2')),
        ("http://127.0.0.1", ('http', lip, lip, 80, '/')),
        ("http://127.0.0.1/", ('http', lip, lip, 80, '/')),
        ("http://127.0.0.1/foo", ('http', lip, lip, 80, '/foo')),
        ("http://127.0.0.1?param=value", ('http', lip, lip, 80, '/?param=value')),
        ("http://127.0.0.1/?param=value", ('http', lip, lip, 80, '/?param=value')),
        ("http://127.0.0.1:12345/foo", ('http', lip+':12345', lip, 12345, '/foo')),
        ("http://spam:12345/foo", ('http', 'spam:12345', 'spam', 12345, '/foo')),
        ("http://spam.test.org/foo", ('http', 'spam.test.org', 'spam.test.org', 80, '/foo')),
        ("https://127.0.0.1/foo", ('https', lip, lip, 443, '/foo')),
        ("https://127.0.0.1/?param=value", ('https', lip, lip, 443, '/?param=value')),
        ("https://127.0.0.1:12345/", ('https', lip+':12345', lip, 12345, '/')),
        ("http://scrapytest.org/foo ", ('http', 'scrapytest.org', 'scrapytest.org', 80, '/foo')),
        ("http://egg:7890 ", ('http', 'egg:7890', 'egg', 7890, '/')),
    )
    for url, test in tests:
        # FIX: assertEquals is a deprecated alias (removed in Python
        # 3.12); use assertEqual.
        self.assertEqual(client._parse(url), test, url)
def _get_agent(self, request, timeout):
    """Return the Twisted agent for *request*.

    HTTPS through a proxy uses a CONNECT tunnel unless the proxy URL
    carries the (deprecated) ``noconnect`` flag; other proxied requests
    pass the raw proxy URI to the proxy agent; non-proxied requests use
    the regular pooled agent.
    """
    from twisted.internet import reactor
    bindaddress = request.meta.get("bindaddress") or self._bindAddress
    proxy = request.meta.get("proxy")
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        # _parse() returns bytes components, hence the b"" comparison.
        omitConnectTunnel = b"noconnect" in proxyParams
        if omitConnectTunnel:
            warnings.warn(
                "Using HTTPS proxies in the noconnect mode is deprecated. "
                "If you use Zyte Smart Proxy Manager (formerly Crawlera), "
                "it doesn't require this mode anymore, so you should "
                "update scrapy-crawlera to 1.3.0+ and remove '?noconnect' "
                "from the Zyte Smart Proxy Manager URL.",
                ScrapyDeprecationWarning,
            )
        if scheme == b"https" and not omitConnectTunnel:
            # Tunnel TLS through the proxy with an HTTP CONNECT request.
            proxyAuth = request.headers.get(b"Proxy-Authorization", None)
            proxyConf = (proxyHost, proxyPort, proxyAuth)
            return self._TunnelingAgent(
                reactor=reactor,
                proxyConf=proxyConf,
                contextFactory=self._contextFactory,
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )
        else:
            # Plain HTTP proxying: hand the whole proxy URI over.
            return self._ProxyAgent(
                reactor=reactor,
                proxyURI=to_bytes(proxy, encoding="ascii"),
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )
    # No proxy configured for this request.
    return self._Agent(
        reactor=reactor,
        contextFactory=self._contextFactory,
        connectTimeout=timeout,
        bindAddress=bindaddress,
        pool=self._pool,
    )
def _get_agent(self, request, timeout):
    """Pick the Twisted agent used to download *request*.

    A request whose meta carries a ``proxy`` key goes through either a
    CONNECT tunnel (HTTPS) or a plain HTTP proxy agent; anything else
    uses the regular pooled agent.
    """
    from twisted.internet import reactor
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    # Keyword arguments shared by every agent flavour below.
    common = dict(
        reactor=reactor,
        connectTimeout=timeout,
        bindAddress=bindaddress,
        pool=self._pool,
    )
    if not proxy:
        return self._Agent(contextFactory=self._contextFactory, **common)
    _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
    scheme = _parse(request.url)[0]
    proxyHost = to_unicode(proxyHost)
    omitConnectTunnel = b'noconnect' in proxyParams
    if omitConnectTunnel:
        warnings.warn(
            "Using HTTPS proxies in the noconnect mode is deprecated. "
            "If you use Crawlera, it doesn't require this mode anymore, "
            "so you should update scrapy-crawlera to 1.3.0+ "
            "and remove '?noconnect' from the Crawlera URL.",
            ScrapyDeprecationWarning)
    if scheme == b'https' and not omitConnectTunnel:
        # HTTPS through a proxy: open a CONNECT tunnel first.
        proxyAuth = request.headers.get(b'Proxy-Authorization', None)
        return self._TunnelingAgent(
            proxyConf=(proxyHost, proxyPort, proxyAuth),
            contextFactory=self._contextFactory,
            **common)
    # Plain HTTP proxying: hand the full proxy URI to the proxy agent.
    return self._ProxyAgent(
        proxyURI=to_bytes(proxy, encoding='ascii'),
        **common)
def _get_agent(self, request, timeout):
    """Return a proxy agent when the request carries a ``proxy`` meta
    key, otherwise the regular pooled agent."""
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if not proxy:
        # No proxy configured: use the pooled default agent.
        return self._Agent(reactor, contextFactory=self._contextFactory,
                           connectTimeout=timeout, bindAddress=bindaddress,
                           pool=self._pool)
    proxy_scheme, _, proxy_host, proxy_port, _ = _parse(proxy)
    proxy_endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port,
                                        timeout=timeout,
                                        bindAddress=bindaddress)
    return self._ProxyAgent(proxy_endpoint)
def _get_agent(self, request, timeout):
    """Return the Twisted agent for *request*.

    HTTPS through a proxy uses a CONNECT tunnel unless the proxy URL
    carries a ``noconnect`` flag; other proxied requests pass the raw
    proxy URI to the proxy agent; non-proxied requests use the regular
    pooled agent.
    """
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        # _parse() returns bytes components, hence the b'' comparison.
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            # Tunnel TLS through the proxy via an HTTP CONNECT request.
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get(b'Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)
        else:
            # Plain HTTP proxying: hand the whole proxy URI over.
            return self._ProxyAgent(reactor,
                proxyURI=to_bytes(proxy, encoding='ascii'),
                connectTimeout=timeout, bindAddress=bindaddress,
                pool=self._pool)
    # No proxy configured for this request.
    return self._Agent(reactor, contextFactory=self._contextFactory,
        connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
def _get_agent(self, request, timeout):
    """Return the Twisted agent for *request*, tunneling HTTPS proxied
    requests and routing other proxied requests through a TCP endpoint.
    """
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        # FIX: _parse() returns bytes on Python 3, so the original
        # proxyParams.find('noconnect') raised TypeError and
        # scheme == 'https' was always False (silently disabling the
        # CONNECT tunnel).  Compare against bytes; on Python 2 bytes is
        # str, so behavior there is unchanged.
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            # Tunnel TLS through the proxy via an HTTP CONNECT request.
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get('Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)
        else:
            # Plain HTTP proxying: connect straight to the proxy.
            endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                                          timeout=timeout,
                                          bindAddress=bindaddress)
            return self._ProxyAgent(endpoint)
    return self._Agent(reactor, contextFactory=self._contextFactory,
                       connectTimeout=timeout, bindAddress=bindaddress,
                       pool=self._pool)
def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
    """Return the HTTP/2 agent for *request*.

    Proxied requests are routed through the HTTP/2 proxy agent;
    tunneling (CONNECT) for HTTPS via a proxy is not implemented yet and
    raises NotImplementedError.
    """
    from twisted.internet import reactor
    bind_address = request.meta.get('bindaddress') or self._bind_address
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
        scheme = _parse(request.url)[0]
        # NOTE(review): proxy_host/proxy_port appear unused after this
        # decode (the proxy agent gets the full URI) — confirm before
        # removing.
        proxy_host = proxy_host.decode()
        # _parse() returns bytes components, hence the b'' comparison.
        omit_connect_tunnel = b'noconnect' in proxy_params
        if omit_connect_tunnel:
            warnings.warn(
                "Using HTTPS proxies in the noconnect mode is not "
                "supported by the downloader handler. If you use Zyte "
                "Smart Proxy Manager, it doesn't require this mode "
                "anymore, so you should update scrapy-crawlera to "
                "scrapy-zyte-smartproxy and remove '?noconnect' from the "
                "Zyte Smart Proxy Manager URL.")
        if scheme == b'https' and not omit_connect_tunnel:
            # ToDo
            raise NotImplementedError(
                'Tunneling via CONNECT method using HTTP/2.0 is not yet supported'
            )
        return self._ProxyAgent(
            reactor=reactor,
            context_factory=self._context_factory,
            proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
            connect_timeout=timeout,
            bind_address=bind_address,
            pool=self._pool,
        )
    # No proxy configured for this request.
    return self._Agent(
        reactor=reactor,
        context_factory=self._context_factory,
        connect_timeout=timeout,
        bind_address=bind_address,
        pool=self._pool,
    )
def _get_agent(self, request, timeout):
    """Return the Twisted agent for *request*: a CONNECT tunnel for
    HTTPS through a proxy, a SOCKS5 agent for other proxied requests,
    or the regular pooled agent when no proxy is set.
    """
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        # FIX: _parse() returns bytes on Python 3, so the original
        # proxyParams.find('noconnect') raised TypeError and
        # scheme == 'https' never matched.  Compare against bytes
        # (behavior unchanged on Python 2, where bytes is str).
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            # Tunnel TLS through the proxy via an HTTP CONNECT request.
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get('Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)
        else:
            # (Removed a dead _parse(request.url) unpack whose
            # host/port/params results were never used.)
            proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                                               timeout=timeout,
                                               bindAddress=bindaddress)
            return SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint)
    return self._Agent(reactor, contextFactory=self._contextFactory,
                       connectTimeout=timeout, bindAddress=bindaddress,
                       pool=self._pool)
def _get_agent(self, request, timeout): bindAddress = request.meta.get('bindaddress') or self._bindAddress # this needs http_proxy environment variable or proxy middleware, # otherwise it will be none # proxy = request.meta.get('proxy') proxy = settings.get('SOCKSPROXY', '127.0.0.1:9050') logger.debug('downloader agent proxy: %s' % proxy) if proxy: _, _, proxyHost, proxyPort, proxyParams = _parse(proxy) _, _, host, port, proxyParams = _parse(request.url) proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort, timeout=timeout, bindAddress=bindAddress) newcirc = settings.get('NEWCIRC', False) if newcirc: username = hex(random.randint(0, 2**32)) password = hex(random.randint(0, 2**32)) agent = SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint, endpointArgs=dict(methods=dict(login=(username,password)))) else: agent = SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint) return agent return self._Agent(reactor, contextFactory=self._contextFactory, connectTimeout=timeout, bindAddress=bindAddress, pool=self._pool)
def _get_agent(self, request, timeout):
    """Return a txtorcon agent that routes the request through a Tor
    SOCKS endpoint taken from $SOCKS_PROXY or the request's ``proxy``
    meta key.
    """
    bind_address = request.meta.get('bindaddress') or self._bindAddress
    # Environment variable wins over per-request meta.
    # NOTE(review): if neither SOCKS_PROXY nor meta['proxy'] is set,
    # proxy is None and _parse() below will fail with an unhelpful
    # error — consider validating; there is no non-proxy fallback here.
    proxy = os.environ.get("SOCKS_PROXY", request.meta.get('proxy'))
    _proxy_protocol, _proxy_hostport, proxy_host, proxy_port, _proxy_params = _parse(
        proxy)
    proxy_endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port,
                                        timeout=timeout, bindAddress=bind_address)
    # All traffic goes through Tor via this SOCKS endpoint.
    agent = txtorcon_web.tor_agent(reactor, socks_endpoint=proxy_endpoint)
    return agent
def _get_agent(self, request, timeout):
    """Return a SOCKS5-capable agent when the request's proxy uses the
    ``socks5`` scheme; otherwise defer to the parent implementation.
    """
    # FIX: use .get() — 'proxy' is only present in meta when a proxy
    # middleware (or the http_proxy environment variable) set it.
    # Indexing with ['proxy'] raised KeyError for proxy-less requests,
    # making the super() fallback below unreachable.
    proxy = request.meta.get('proxy')
    if proxy:
        proxy_scheme, _, proxy_host, proxy_port, _ = _parse(proxy)
        proxy_scheme = str(proxy_scheme, 'utf-8')
        if proxy_scheme == 'socks5':
            endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
            # Build the TLS context factory configured for this client.
            self._sslMethod = openssl_methods[DOWNLOADER_CLIENT_TLS_METHOD]
            self._contextFactoryClass = load_object(
                DOWNLOADER_CLIENTCONTEXTFACTORY)
            self._contextFactory = create_instance(
                objcls=self._contextFactoryClass,
                settings=settings,
                crawler=None,
                method=self._sslMethod,
            )
            return self._Agent(reactor, proxyEndpoint=endpoint,
                               contextFactory=self._contextFactory)
    # Non-socks5 proxies and proxy-less requests use the stock handler.
    return super(TorScrapyAgent, self)._get_agent(request, timeout)
def testFactoryInfo(self):
    """Download from the test server, then check the factory's metadata
    in the _cbFactoryInfo callback."""
    url = self.getURL('file')
    _scheme, _netloc, host, port, _path = client._parse(url)
    factory = client.ScrapyHTTPClientFactory(Request(url))
    reactor.connectTCP(host, port, factory)
    deferred = factory.deferred
    deferred.addCallback(self._cbFactoryInfo, factory)
    return deferred
def testFactoryInfo(self):
    """Download from the test server, then check the factory's metadata
    in the _cbFactoryInfo callback."""
    url = self.getURL('file')
    parsed = client._parse(url)
    host, port = parsed[2], parsed[3]
    factory = client.ScrapyHTTPClientFactory(Request(url))
    # connectTCP wants a text hostname; _parse() yields bytes.
    reactor.connectTCP(to_unicode(host), port, factory)
    d = factory.deferred
    d.addCallback(self._cbFactoryInfo, factory)
    return d
def _get_agent(self, request, timeout):
    """Return the Twisted agent for *request*, dispatching on the proxy
    URL scheme: http/https (CONNECT tunnel or proxy URI), socks4, or
    socks5 (with optional login from the Proxy-Authorization header).
    Falls back to the plain pooled agent when no proxy is set.
    """
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        proxyScheme, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        if proxyScheme.startswith(b'http'):
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if scheme == b'https' and not omitConnectTunnel:
                # Tunnel TLS through the proxy via HTTP CONNECT.
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(
                    reactor, proxyConf,
                    contextFactory=self._contextFactory,
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool)
            else:
                # Plain HTTP proxying: hand the whole proxy URI over.
                return self._ProxyAgent(reactor, proxyURI=to_bytes(
                    proxy, encoding='ascii'),
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool)
        elif proxyScheme == b'socks4':
            proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost,
                                               proxyPort,
                                               timeout=timeout,
                                               bindAddress=bindaddress)
            # FIX: the keyword is `proxyEndpoint` (as used for
            # SOCKS5Agent below); `proxyEndPoint=` raised TypeError.
            return SOCKS4Agent(reactor, proxyEndpoint=proxyEndpoint,
                               contextFactory=self._contextFactory,
                               connectTimeout=timeout,
                               bindAddress=bindaddress,
                               pool=self._pool)
        elif proxyScheme == b'socks5':
            proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost,
                                               proxyPort,
                                               timeout=timeout,
                                               bindAddress=bindaddress)
            proxyAuth = request.headers.get(b'Proxy-Authorization', None)
            if proxyAuth:
                # Basic credentials: "user:pass" base64-encoded after
                # the auth scheme token.
                # FIX: split on the first ':' only — RFC 7617 allows
                # colons in the password.
                proxyUser, proxyPassword = b64decode(
                    proxyAuth.split()[-1]).split(b':', 1)
                return SOCKS5Agent(
                    reactor,
                    proxyEndpoint=proxyEndpoint,
                    endpointArgs=dict(
                        methods={'login': (proxyUser, proxyPassword)}),
                    contextFactory=self._contextFactory,
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool)
            return SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint,
                               contextFactory=self._contextFactory,
                               connectTimeout=timeout,
                               bindAddress=bindaddress,
                               pool=self._pool)
    # No proxy configured for this request.
    return self._Agent(reactor, contextFactory=self._contextFactory,
                       connectTimeout=timeout, bindAddress=bindaddress,
                       pool=self._pool)