def __init__(self, document_body=None, transport=None, **kwargs):
    """
    Build a new Grab object.

    :param document_body: when it is not None, it is handed to
        ``setup_document`` as the very last initialisation step.
    :param transport: remembered unchanged in ``transport_param``;
        the ``transport`` attribute itself starts out as None.
    :param kwargs: when non-empty, forwarded to ``setup``.
    """
    self.meta = {}
    self._doc = None

    self.config = default_config()
    headers = self.common_headers()
    self.config['common_headers'] = headers

    self.cookies = CookieManager()
    self.proxylist = ProxyList()

    # Declared here explicitly so that pylint sees every attribute
    # being defined inside __init__.
    self.request_counter = self.request_head = None
    self.request_body = self.request_method = None

    self.transport_param = transport
    self.transport = None

    self.reset()

    if kwargs:
        self.setup(**kwargs)
    if document_body is not None:
        self.setup_document(document_body)
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True, **kwargs):
    """
    Load a proxy list and switch proxy list usage on.

    :param source: either a ``BaseProxySource`` instance or a string
        whose meaning is selected by ``source_type``.
    :param source_type: 'text_file' or 'url'; only consulted when
        ``source`` is a string.
    :param proxy_type: forwarded to ``ProxyList.load_file`` /
        ``ProxyList.load_url``.
    :param auto_init: when true and ``auto_change`` is false, a random
        proxy is picked from the list right away.
    :param auto_change: stored in ``proxy_auto_change``.
    :param kwargs: accepted for backward compatibility; not used here.
    :raises SpiderMisuseError: if ``source`` is neither a
        ``BaseProxySource`` nor a string, or if ``source_type`` is not
        one of the supported values for a string source.
    """
    # Build the list in a local first: if validation or loading fails,
    # the previously configured ``self.proxylist`` stays untouched.
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # Wrap the value in a one-element tuple so that a tuple
            # argument is formatted instead of breaking "%".
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def test_get_next_proxy(self):
    # Expect proxies in file order, wrapping around after the last
    # one, and starting from the first one again after a reload.
    proxies = ProxyList()
    plist_path = self.generate_plist_file('foo:1\nbar:1')
    proxies.load_file(plist_path)
    for expected_host in ('foo', 'bar', 'foo'):
        self.assertEqual(proxies.get_next_proxy().host, expected_host)

    proxies.load_file(plist_path)
    self.assertEqual(proxies.get_next_proxy().host, 'foo')
def load_proxylist(self, source, source_type, proxy_type='http',
                   auto_init=True, auto_change=True, **kwargs):
    """
    Create a ``ProxyList`` from the given source and enable it.

    ``source``, ``source_type``, ``proxy_type`` and any extra keyword
    arguments are passed straight to the ``ProxyList`` constructor.
    A random proxy is selected immediately only when ``auto_init`` is
    true and ``auto_change`` is false; ``auto_change`` itself is stored
    in ``proxy_auto_change``.
    """
    self.proxylist = ProxyList(
        source,
        source_type,
        proxy_type=proxy_type,
        **kwargs
    )
    self.proxylist_enabled = True
    self.proxy = None

    pick_initial_proxy = auto_init and not auto_change
    if pick_initial_proxy:
        self.proxy = self.proxylist.get_random()

    self.proxy_auto_change = auto_change
def test_get_next_proxy(self):
    # Expect proxies in file order, wrapping around after the last
    # one, and starting from the first one again after a reload.
    with temp_file() as path:
        proxies = ProxyList()
        self.generate_plist_file(path, 'foo:1\nbar:1')
        proxies.load_file(path)
        for expected_host in ('foo', 'bar', 'foo'):
            self.assertEqual(proxies.get_next_proxy().host,
                             expected_host)

        proxies.load_file(path)
        self.assertEqual(proxies.get_next_proxy().host, 'foo')
def __init__(self, document_body=None, transport='pycurl', **kwargs):
    """
    Build a new Grab object.

    :param document_body: when it is not None, it is handed to
        ``setup_document`` as the very last initialisation step.
    :param transport: passed to ``setup_transport``.
    :param kwargs: when non-empty, forwarded to ``setup``.
    """
    self.meta = {}
    self._doc = None

    self.config = default_config()
    headers = self.common_headers()
    self.config['common_headers'] = headers

    self.cookies = CookieManager()
    self.proxylist = ProxyList()

    self.setup_transport(transport)
    self.reset()

    if kwargs:
        self.setup(**kwargs)
    if document_body is not None:
        self.setup_document(document_body)
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True):
    """
    Load proxy list.

    :param source: Proxy source.
        Accepts string (file path, url) or ``BaseProxySource`` instance.
    :param source_type: The type of the specified source.
        Should be one of the following: 'text_file' or 'url'.
        Only consulted when ``source`` is a string.
    :param proxy_type:
        Should be one of the following: 'socks4', 'socks5' or 'http'.
    :param auto_init:
        If set to `True` while ``auto_change`` is `False`, a random
        proxy is picked from the list right away.
    :param auto_change:
        If set to `True` then automatic random proxy rotation
        will be used.
    :raises SpiderMisuseError: if ``source`` is neither a
        ``BaseProxySource`` nor a string, or if ``source_type`` is not
        one of the supported values for a string source.

    Proxy source format should be one of the following (for each line):
        - ip:port
        - ip:port:login:password
    """
    # Build the list in a local first: if validation or loading fails,
    # the previously configured ``self.proxylist`` stays untouched.
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # Wrap the value in a one-element tuple so that a tuple
            # argument is formatted instead of breaking "%".
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def test_web_proxy_source(self):
    # Serve the default proxy list data from the test server and
    # expect two entries after loading it by URL.
    proxies = ProxyList()
    self.server.response['data'] = DEFAULT_PLIST_DATA
    plist_url = self.server.get_url()
    proxies.load_url(plist_url)
    self.assertEqual(2, proxies.size())
def test_file_proxy_source(self):
    # Load the generated proxy list file and expect two entries.
    proxies = ProxyList()
    plist_path = self.generate_plist_file()
    proxies.load_file(plist_path)
    loaded_count = proxies.size()
    self.assertEqual(2, loaded_count)
def test_basic(self):
    # A freshly created ProxyList is expected to report size zero.
    self.assertEqual(0, ProxyList().size())
def test_file_proxy_source(self):
    # Write the default proxy list into a temporary file, load it and
    # expect two entries.
    with temp_file() as path:
        proxies = ProxyList()
        self.generate_plist_file(path)
        proxies.load_file(path)
        loaded_count = proxies.size()
        self.assertEqual(2, loaded_count)