def fetchFreeProxy(num=500):
    """Fetch up to *num* free proxies from 89ip.cn, merge in previously
    available proxies, and validate each one asynchronously with Twisted.

    Side effects: resets the module-global ``availables`` list and runs the
    Twisted reactor until every proxy check completes.

    :param num: maximum number of fresh proxies to request from the API.
    """
    global availables
    # Twisted allows only one reactor per process; drop the cached reactor
    # module so a fresh default reactor can be installed for this run.
    import sys
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import default
    default.install()
    from twisted.internet import reactor

    res = requests.get(
        "http://www.89ip.cn/tqdl.html?api=1&num=%d&port=&address=&isp=" % num)

    # BUG FIX: res.content is bytes in Python 3, and re.findall with a str
    # pattern requires str input — use res.text instead of res.content.
    proxies = ['http://' + proxy
               for proxy in re.findall(r"([0-9\.:]{10,})", res.text)]

    # Merge previously known-good proxies into the candidate pool, then
    # clear the global list before re-validation.
    proxies.extend(item['proxy'] for item in availables)
    availables = []
    proxies = list(set(proxies))  # de-duplicate candidates

    url = "https://www.baidu.com/img/bd_logo1.png?where=super"
    deferred_list = []
    # (Removed dead code from the original: `if True:` guard, a one-pass
    # `for i in range(0, 1)` inner loop, and an unused requests.session().)
    for proxy in proxies:
        deferred = checkStatus(url=url, proxy=proxy, timeout=10)
        deferred.addCallback(callback)  # invoked when the check succeeds
        deferred.addErrback(errback)    # invoked when the check fails
        deferred_list.append(deferred)

    # Stop the reactor once every check has fired (success or failure).
    dlist = defer.DeferredList(deferred_list)
    dlist.addBoth(lambda _: reactor.stop())
    reactor.run()
from twisted.web.client import getPage, defer
from twisted.internet import reactor


def all_done(arg):
    """Stop the reactor once every request in the batch has finished."""
    reactor.stop()


def callback(contents):
    """Print the decoded body of one completed request."""
    print(contents.decode('utf-8'))


url_list = [
    'http://www.bing.com',
    'http://www.baidu.com',
]

deferred_list = []
for url in url_list:
    d = getPage(bytes(url, encoding='utf8'))  # fire the async request
    d.addCallback(callback)                   # handle the response body
    deferred_list.append(d)                   # collect for the barrier below

# Wait for the whole batch, then shut the event loop down.
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)
reactor.run()  # blocks until all_done stops it
def start(self):
    """Run the event loop until every active crawl has terminated."""
    barrier = defer.DeferredList(self._active)
    # Fires (success or failure) once all crawlers are done, then stops
    # the reactor.
    barrier.addBoth(self._stop_reactor)
    reactor.run()
def stop(self):
    """Ask every crawler to stop.

    Returns a DeferredList that fires once all crawlers have stopped.
    """
    pending = [crawler.stop() for crawler in list(self.crawlers)]
    return defer.DeferredList(pending)
def start(self):
    """Start the reactor; it stops itself once all active crawls finish."""
    done = defer.DeferredList(self._active)
    done.addBoth(self._stop_reactor)  # barrier over every active deferred
    reactor.run()
def _stop_reactor(_=None):
    """Shut down the event loop; ignores the DeferredList result passed in."""
    reactor.stop()


count = 1  # global response counter shared with callback


def callback(response):
    """Generator callback: print the response size and, while fewer than
    three responses have arrived, yield ten follow-up Requests for the
    recent-news pages.

    NOTE: this contains ``yield``, so calling it returns a generator — the
    body only executes when the caller iterates that generator.
    """
    global count
    count += 1
    print(len(response))
    if count > 3:
        return None  # stop scheduling follow-ups after the third response
    for page in range(10):
        yield Request("http://dig.chouti.com/all/hot/recent/%s" % page, callback)


if __name__ == '__main__':
    spider_list = [
        [Request("http://www.baidu.com", callback), ]
    ]
    _active = set()
    for spider in spider_list:
        _active.add(crawl(spider))
    # Stop the reactor once every crawl deferred has fired.
    dl = defer.DeferredList(_active)
    dl.addBoth(_stop_reactor)
    reactor.run()
# twisted is a networking framework; one of its features is sending async
# requests, detecting I/O readiness and switching between them automatically.
# Basic twisted usage.
from twisted.web.client import getPage, defer
# BUG FIX: the original imported a nonexistent name
# (`from twisted.internet.reactor import sto`) while the code below uses
# `reactor.stop()` / `reactor.run()`; import the reactor module itself.
from twisted.internet import reactor


def all_done(arg):
    """Stop the event loop once the whole batch has completed."""
    reactor.stop()


def callback(res):
    """Print one response body."""
    print(res)
    return 1


defer_list = []
urls = [
    'https://www.baidu.com',
    'https://www.sina.com.cn',
    'https://www.python.org',
]
for url in urls:
    # BUG FIX: 'utf=-8' is not a valid codec name and raises LookupError at
    # runtime; encode with 'utf-8'.
    obj = getPage(url.encode('utf-8'))
    obj.addCallback(callback)
    defer_list.append(obj)

defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()
from twisted.web.client import getPage, defer
from twisted.internet import reactor


def stop_loop(arg):
    """Terminate the reactor loop once the whole batch is done."""
    reactor.stop()


def get_response(contents):
    """Print the raw response body."""
    print(contents)


url_list = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://www.cn.bing.com/',
    'https://stackoverflow.com/',
]

deferred_list = []
for url in url_list:
    d = getPage(bytes(url, encoding='utf8'))  # fire async GET
    d.addCallback(get_response)               # print body on completion
    deferred_list.append(d)

# Fires after every request resolves, then stops the event loop.
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(stop_loop)
reactor.run()