import re
from urllib.parse import urljoin

import redis
from mitmproxy import ctx, options, proxy
from mitmproxy.tools.dump import DumpMaster

# Assumed setup: redis_db and the key names are defined elsewhere in the
# original project; the concrete values below are placeholders so the
# snippet is self-contained.
redis_db = redis.Redis(decode_responses=True)
content_key = 'gov:content'
catlog_key = 'gov:catlog'
detail_key = 'gov:detail'


def deal_data(url, text, selector):
    '''
    Handle the response body.
    :return:
    '''
    # Cache article pages only, skipping one known-bad article URL.
    if 'ArticleTitle' in text and 't20131029_1012002' not in url:
        redis_db.sadd(content_key, text)
        ctx.log('Detail content captured, cached to redis!')
def cach_task(url, text, selector):
    '''
    Cache crawl tasks.
    :return:
    '''
    # Cache pagination tasks: createPageHTML(<total>, ...) carries the total
    # page count, and pages 1..total-1 live at index_1.shtml, index_2.shtml, ...
    page_count = re.findall(r"createPageHTML\((\d+)", text, re.S)
    if page_count and 'index_' not in url:
        url_list = []
        for page in range(1, int(page_count[0])):
            url_list.append(urljoin(url, 'index_{}.shtml'.format(page)))
        if url_list:
            # sadd expects individual members, so unpack the list.
            redis_db.sadd(catlog_key, *url_list)
            ctx.log('Cached catlog_page_list!')
    # Cache detail-page tasks from the list links.
    detail_url_list = selector.css('.lsj-list li a::attr(href)').extract()
    if detail_url_list:
        redis_db.sadd(detail_key, *detail_url_list)
        ctx.log('Cached detail_url_list!')
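# Hypothetical wiring (assumption, not shown in the original): start() below
# instantiates a RuiShuCapture addon that is not defined in this snippet.
# This is a minimal sketch of how such an addon could feed responses into
# deal_data() and cach_task(); the domain filter and the use of parsel's
# Selector are assumptions, not the author's confirmed implementation.
from mitmproxy import http
from parsel import Selector


class RuiShuCapture:
    def response(self, flow: http.HTTPFlow):
        url = flow.request.url
        # Only handle pages from the target site (assumed filter).
        if 'fgw.hubei.gov.cn' not in url:
            return
        text = flow.response.get_text() or ''
        selector = Selector(text=text)
        deal_data(url, text, selector)
        cach_task(url, text, selector)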
def start():
    ip = '127.0.0.1'
    port = 8080
    print('Note: proxy listening on {}:{}; make sure your client proxy is configured'.format(ip, port))
    myaddon = RuiShuCapture()
    # mitmproxy 4.x-style startup: options -> proxy config -> DumpMaster.
    opts = options.Options(listen_host=ip, listen_port=port)
    pconf = proxy.config.ProxyConfig(opts)
    m = DumpMaster(opts)
    m.server = proxy.server.ProxyServer(pconf)
    m.addons.add(myaddon)
    try:
        m.run()
    except KeyboardInterrupt:
        m.shutdown()


if __name__ == '__main__':
    start_url = [
        'http://fgw.hubei.gov.cn/gzjj/tzgg/tz/',
        'http://fgw.hubei.gov.cn/gzjj/dtyw/fgyw/',
    ]
    # Seed the task queue once: push the start URLs only if the set is empty.
    # (redis-py has no sget; exists() is the check intended here.)
    if not redis_db.exists('gov:start_task'):
        redis_db.sadd('gov:start_task', *start_url)
        print('pushed start_tasks !')
    start()
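# Usage sketch (assumption): start() blocks while the proxy runs, so a
# separate process has to replay the cached tasks through 127.0.0.1:8080
# for the addon to see the responses. This worker would live in its own
# script; requests, the PROXIES mapping and run_worker() are hypothetical
# names, not part of the original project.
import requests

PROXIES = {'http': 'http://127.0.0.1:8080',
           'https': 'http://127.0.0.1:8080'}


def run_worker():
    # Pop queued URLs and fetch them through the local mitmproxy so the
    # addon can cache content and enqueue new tasks. Note: detail hrefs
    # can be relative and would need urljoin against their catalog page.
    for key in ('gov:start_task', catlog_key, detail_key):
        while True:
            task_url = redis_db.spop(key)
            if task_url is None:
                break
            requests.get(task_url, proxies=PROXIES, timeout=10)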