def startRequest(self, request, url, feed_config = None, selector_defer=None, sanitize=False):
    """Serve `url`: reply from the local snapshot cache when available,
    otherwise start an asynchronous HTTP GET via a Twisted agent.

    Parameters:
        request        -- the incoming twisted.web request being answered
        url            -- target URL to fetch
        feed_config    -- per-feed configuration passed through to the downloader
        selector_defer -- optional Deferred; when set, the cached/downloaded
                          response is delivered via its callback instead of
                          being written to `request`
        sanitize       -- forwarded to the downloader (sanitization flag)
    """
    # Downloader instance owns all response handling (write-out, errors).
    downloader = self.downloadercls(self.feed, self.debug, self.snapshot_dir,
                                    self.stat_tool, self.memon,
                                    request=request, url=url, feed_config=feed_config,
                                    selector_defer=selector_defer, sanitize=sanitize,
                                    max_size=self.max_size)

    # Cache short-circuit: tryLocalPage returns a stored response or a falsy value.
    sresponse = self.tryLocalPage(url)
    if sresponse:
        if selector_defer:
            # Fire the caller's Deferred on the next reactor tick so the
            # callback never runs synchronously inside this call.
            reactor.callLater(0, selector_defer.callback, sresponse)
        else:
            downloader.writeResponse(request, sresponse, feed_config)
    else:
        # No cached copy: fetch over HTTP, following up to 5 redirects.
        agent = BrowserLikeRedirectAgent(
            Agent(reactor,
                  contextFactory=ScrapyClientContextFactory(),  # skip certificate verification
                  connectTimeout=10),
                  #pool=pool),
            redirectLimit=5
        )
        # Browser-like headers; NOTE(review): Twisted's Agent.request expects
        # bytes for method/URI on Python 3 — str here suggests Python 2 — confirm.
        d = agent.request(
            'GET',
            url,
            twisted_headers({
                'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'],
                'Accept-Encoding': ['gzip, deflate, sdch'],
                'User-Agent': [self.user_agent]
            }),
            None
        )
        print('Request <GET %s> started' % (url,))
        d.addCallback(downloader.downloadStarted)
        d.addErrback(downloader.downloadError)
def getContext(self, hostname=None, port=None):
    """Build an SSL context negotiating any protocol (SSLv23 method) with
    all OpenSSL bug workarounds enabled; wires up TLS client options for
    `hostname` when one is given."""
    # The method must be assigned before delegating, since the parent
    # getContext builds the context from self.method.
    self.method = SSL.SSLv23_METHOD
    context = ScrapyClientContextFactory.getContext(self)
    context.set_options(SSL.OP_ALL)
    if hostname:
        ClientTLSOptions(hostname, context)
    return context
def testPayloadDefaultCiphers(self):
    """Posting the payload with a default-cipher context factory must
    result in an OpenSSL SSL.Error."""
    payload = "0123456789" * 10
    deferred = getPage(
        self.getURL("payload"),
        body=payload,
        contextFactory=ScrapyClientContextFactory(),
    )
    return self.assertFailure(deferred, OpenSSL.SSL.Error)
def getContext(self, hostname=None, port=None):
    """Return the parent-built SSL context hardened with every documented
    OpenSSL bug workaround; attaches client TLS options when a hostname
    is supplied."""
    context = ScrapyClientContextFactory.getContext(self)
    # Enable all workarounds to SSL bugs as documented by
    # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
    context.set_options(SSL.OP_ALL)
    if hostname:
        ClientTLSOptions(hostname, context)
    return context
    # Tail of a function whose `def` lies before this chunk:
    # NOTE(review): `r` looks like a key/value client (redis?) storing the
    # last-touch timestamp for `url` — confirm against the full file.
    r.set(url, int(time.time()))
    return 0

GC_PERIOD_SECONDS = 3 * 60 * 60  # 3 hours

def periodical_garbage_collect():
    """Run gc.collect() at most once per GC_PERIOD_SECONDS, logging the
    number of unreachable objects found. The last-run time is kept as a
    function attribute (periodical_garbage_collect.time)."""
    tm = int(time.time())
    if tm - periodical_garbage_collect.time >= GC_PERIOD_SECONDS:
        print('GC: the number of unreachable objects: %s' % gc.collect())
        periodical_garbage_collect.time = tm

# Initialize the last-run timestamp at import time so the first real
# collection happens only after a full period has elapsed.
periodical_garbage_collect.time = int(time.time())

# Module-level HTTP agent: follows up to 5 redirects; certificate
# verification is skipped by the Scrapy context factory.
agent = BrowserLikeRedirectAgent(
    Agent(reactor,
          contextFactory=ScrapyClientContextFactory(),  # skip certificate verification
          connectTimeout=10),
    redirectLimit=5
)

def html2json(el):
    """Recursively convert an lxml element into a JSON-able triple:
    [tag, {"tag-id": ...}, [child triples...]].

    Only element children (etree.ElementBase) are recursed into; comments
    and processing instructions are dropped.
    """
    return [
        el.tag,
        {"tag-id": el.attrib["tag-id"]},
        [html2json(e) for e in el.getchildren() if isinstance(e, etree.ElementBase)]
    ]

def setBaseAndRemoveScriptsAndMore(response, url):
    # Definition continues past this chunk — only the first statements are visible.
    response.selector.remove_namespaces()
    tree = response.selector.root.getroottree()
def __init__(self):
    """Context factory pinned to the TLS 1.2 method."""
    ScrapyClientContextFactory.__init__(self)
    # Assigned AFTER the parent __init__ so it overrides whatever method
    # the parent sets — presumably its SSLv23 default; TODO confirm.
    self.method = SSL.TLSv1_2_METHOD