def list_parse(self, response):
    rows = BeautifulSoup(response.text, 'lxml').find_all('div', class_="fangyuan_list-con")
    reqs = []
    for row in rows:
        detail_url = row.find('a').get('href')
        list_item = zhifang_list_Item()
        # save values for your item here, e.g.:
        # list_item['title'] = row.find('a').text
        list_item['tit'] = row.find('p', class_="tit").text
        list_item['txt'] = row.find('p', class_="txt").text
        list_item['tit2'] = row.find_all('p', class_="tit")[-1].text
        list_item['price'] = row.find('p', class_="price").text
        list_item['agent'] = row.find('p', class_="name").text
        # default columns
        list_item['detail_full_url'] = response.urljoin(detail_url)
        list_item['pkey'] = md5(list_item['detail_full_url'])
        list_item['pagenum'] = response.meta.get('pagenum')
        yield list_item
        req = ScheduledRequest(
            url=list_item['detail_full_url'],
            method='GET',
            callback='detail',
            body={},  # for POST requests, put the form dict here
            meta={
                'fkey': list_item['pkey'],
                'pagenum': list_item['pagenum'],
                # anti-crawling meta also goes here; enable the matching middleware in the spider
                # 'cookies': {},         # common anti-crawling
                # 'splash': {'wait': 2}  # JS-loaded / async-rendered pages
            })
        reqs.append(req)
    # push the detail links to redis as new tasks
    RedisCtrl().reqs_push(self.redis_key, reqs)
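The 'detail' callback identifier above is routed to a matching parse method on the spider. A minimal sketch of what that method could look like; the method name, zhifang_detail_Item, and the selector below are illustrative assumptions, and only fkey and pagenum come from the meta set in list_parse:

def detail_parse(self, response):
    # hypothetical detail callback: the name, zhifang_detail_Item and the
    # CSS selector are assumptions, not part of the framework itself
    soup = BeautifulSoup(response.text, 'lxml')
    detail_item = zhifang_detail_Item()
    detail_item['desc'] = soup.find('div', class_="desc").text
    # carry over the keys set in list_parse so the detail row can be
    # joined back to its list row and page number
    detail_item['fkey'] = response.meta.get('fkey')
    detail_item['pagenum'] = response.meta.get('pagenum')
    yield detail_item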
class SPJob:

    def __init__(self, spider_name):
        self.spider_name = spider_name
        self.redis_key = f'{spider_name}:start_urls'
        self.redis_dupefilter = f'{spider_name}:dupefilter'
        self.redis_requests = f'{spider_name}:requests'
        self.reqs = []
        self.redisctrl = RedisCtrl()

    # generate the tasks; override this method
    def make_job(self, pages):
        for pagenum in range(1, pages + 1):
            req = ScheduledRequest(
                url='',       # request url
                method='',    # request method, GET/POST
                callback='',  # callback identifier
                body={},      # POST form data
                meta={}       # metadata and anti-crawling config
            )
            self.reqs.append(req)
        self.push()

    # push the tasks to redis
    def push(self):
        self.redisctrl.reqs_push(self.redis_key, self.reqs)
        self.reqs.clear()

    # delete tasks left over in redis from the previous run
    def delete(self):
        self.redisctrl.keys_del(
            [self.redis_key, self.redis_dupefilter, self.redis_requests])

    # cluster mode: start a slave spider over ssh
    def ssh_run(self, *args):
        slave = random.sample(SLAVES, 1)[0] if not SLAVES_BALANCE else SLAVES_BALANCE
        if SLAVES_ENV:
            cmd = f'source {SLAVES_ENV}/bin/activate; cd {SLAVES_WORKSPACE}; scrapy crawl {self.spider_name};'
        else:
            cmd = f"cd {SLAVES_WORKSPACE}; scrapy crawl {self.spider_name};"
        ssh = SSH(slave)
        hostname = ssh.hostname
        logger.info(f"slave:{hostname} spider is crawling...")
        status, msg_out, msg_error = ssh.execute(cmd)
        if status != 0:
            logger.error(f"slave:{hostname} spider failed: {msg_out + msg_error}")
        else:
            logger.info(f"slave:{hostname} spider finished successfully")

    # standalone mode: start the spider locally
    def run(self, *args):
        cmd = f"scrapy crawl {self.spider_name}"
        logger.info("spider is crawling...")
        p = subprocess.Popen(cmd, shell=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # communicate() may only be called once; capture output via pipes
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            logger.error(
                f"spider failed: {stdout.decode('utf-8')} {stderr.decode('utf-8')}")
        else:
            logger.info("spider finished successfully")

    # run the spider(s)
    def crawl(self, num=1):
        size = self.redisctrl.key_len(self.redis_key)
        if not size:
            logger.info(f"{self.redis_key} has no pending tasks, please check!")
            return
        if CRAWL_MODEL.lower() == 'cluster':
            logger.name = "spiderman.model.cluster"
            if not (SLAVES or SLAVES_BALANCE):
                logger.error("please configure the SLAVES machines!")
                return
            logger.info(f"initial tasks: {size}  spiders to start: {num}")
            pool = ThreadPoolExecutor(max_workers=num)
            for i in pool.map(self.ssh_run, [i for i in range(num)]):
                ...  # wait for all threads to finish
        else:
            logger.name = "spiderman.model.standalone"
            logger.info(f"initial tasks: {size}  spiders to start: {num}")
            pool = ThreadPoolExecutor(max_workers=num)
            for i in pool.map(self.run, [i for i in range(num)]):
                ...  # wait for all threads to finish
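A job script typically subclasses SPJob and overrides only make_job. A minimal usage sketch, assuming a hypothetical ZhifangJob with a made-up listing URL; the spider name, URL pattern, callback identifier and page count are illustrative assumptions:

class ZhifangJob(SPJob):

    def __init__(self):
        # keys become zhifang:start_urls, zhifang:dupefilter, zhifang:requests
        super().__init__(spider_name='zhifang')

    def make_job(self, pages):
        for pagenum in range(1, pages + 1):
            req = ScheduledRequest(
                url=f'https://example.com/fangyuan/pg{pagenum}/',  # hypothetical listing url
                method='GET',
                callback='list',  # routed to the spider's list parse callback
                body={},
                meta={'pagenum': pagenum})  # read back in list_parse via response.meta
            self.reqs.append(req)
        self.push()


if __name__ == '__main__':
    job = ZhifangJob()
    job.delete()      # clear tasks left over from the previous run
    job.make_job(10)  # enqueue 10 listing pages
    job.crawl(num=2)  # start 2 spiders, standalone or cluster per CRAWL_MODEL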