Example #1
    def __init__(self, spider_name):
        self.spider_name = spider_name
        self.redis_key = f'{spider_name}:start_urls'
        self.redis_dupefilter = f'{spider_name}:dupefilter'
        self.redis_requests = f'{spider_name}:requests'
        self.reqs = []
        self.redisctrl = RedisCtrl()
Example #2
    def list_parse(self, response):
        rows = BeautifulSoup(response.text,
                             'lxml').find_all('div',
                                              class_="fangyuan_list-con")
        reqs = []
        for row in rows:
            detail_url = row.find('a').get('href')
            list_item = zhifang_list_Item()
            # populate the item fields here, e.g.:
            # list_item['title'] = row.find('a').text
            list_item['tit'] = row.find('p', class_="tit").text
            list_item['txt'] = row.find('p', class_="txt").text
            list_item['tit2'] = row.find_all('p', class_="tit")[-1].text
            list_item['price'] = row.find('p', class_="price").text
            list_item['agent'] = row.find('p', class_="name").text
            # default column
            list_item['detail_full_url'] = response.urljoin(detail_url)
            list_item['pkey'] = md5(list_item['detail_full_url'])
            list_item['pagenum'] = response.meta.get('pagenum')
            yield list_item

            req = ScheduledRequest(
                url=list_item['detail_full_url'],
                method='GET',
                callback='detail',
                body={},  # for a POST request, put the form dict here
                meta={
                    'fkey': list_item['pkey'],
                    'pagenum': list_item['pagenum'],
                    # anti-scraping settings also go in this meta dict; enable the matching middleware in the spider
                    # 'cookies': {},      # basic anti-scraping
                    # 'splash': {'wait': 2}  # render JS / async-loaded content
                })
            reqs.append(req)

        # push the detail-page links to redis as new tasks
        RedisCtrl().reqs_push(self.redis_key, reqs)
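
For reference, list_parse above writes a fixed set of fields into zhifang_list_Item. A minimal sketch of such an item as a plain scrapy.Item is shown below; this is an assumption for illustration only, and the actual zhifang_list_Item in the project may be declared differently or carry extra framework-specific metadata.

import scrapy


class zhifang_list_Item(scrapy.Item):
    # fields populated by list_parse above
    tit = scrapy.Field()
    txt = scrapy.Field()
    tit2 = scrapy.Field()
    price = scrapy.Field()
    agent = scrapy.Field()
    detail_full_url = scrapy.Field()
    pkey = scrapy.Field()
    pagenum = scrapy.Field()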
Example #3
    def delete(self):
        redisctrl = RedisCtrl()
        redisctrl.keys_del(
            [self.redis_key, self.redis_dupefilter, self.redis_requests])
Example #4
class SPJob:
    def __init__(self, spider_name):
        self.spider_name = spider_name
        self.redis_key = f'{spider_name}:start_urls'
        self.redis_dupefilter = f'{spider_name}:dupefilter'
        self.redis_requests = f'{spider_name}:requests'
        self.reqs = []
        self.redisctrl = RedisCtrl()

    # generate the tasks; override this method in your job subclass
    def make_job(self, pages):
        for pagenum in range(1, pages + 1):
            req = ScheduledRequest(
                url='',  # request URL
                method='',  # request method: GET / POST
                callback='',  # callback identifier
                body={},  # POST form data
                meta={}  # metadata and anti-scraping settings
            )
            self.reqs.append(req)
        self.push()

    # push the tasks to redis
    def push(self):
        self.redisctrl.reqs_push(self.redis_key, self.reqs)
        self.reqs.clear()

    # delete tasks left over in redis from the previous run
    def delete(self):
        self.redisctrl.keys_del(
            [self.redis_key, self.redis_dupefilter, self.redis_requests])

    # cluster mode: start a slave spider over SSH
    def ssh_run(self, *args):
        slave = random.choice(SLAVES) if not SLAVES_BALANCE else SLAVES_BALANCE
        if SLAVES_ENV:
            cmd = f'source {SLAVES_ENV}/bin/activate; cd {SLAVES_WORKSPACE}; scrapy crawl {self.spider_name};'
        else:
            cmd = f"cd {SLAVES_WORKSPACE}; scrapy crawl {self.spider_name};"
        ssh = SSH(slave)
        hostname = ssh.hostname
        logger.info(f"slave:{hostname} spider is crawling...")
        status, msg_out, msg_error = ssh.execute(cmd)
        if status != 0:
            logger.error(f"slave:{hostname} spider failed: {msg_out + msg_error}")
        else:
            logger.info(f"slave:{hostname} spider finished successfully")

    # standalone mode: start the spider locally
    def run(self, *args):
        cmd = f"scrapy crawl {self.spider_name}"
        logger.info("spider is crawling...")
        # capture output so it can be reported on failure
        p = subprocess.Popen(cmd, shell=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            logger.error(
                f"spider failed: {stdout.decode('utf-8')} {stderr.decode('utf-8')}")
        else:
            logger.info("spider finished successfully")

    # launch the crawl
    def crawl(self, num=1):
        size = self.redisctrl.key_len(self.redis_key)

        if not size:
            logger.info(f"no pending tasks in {self.redis_key}, please check!")
            return

        if CRAWL_MODEL.lower() == 'cluster':
            logger.name = "spiderman.model.cluster"
            if not (SLAVES or SLAVES_BALANCE):
                logger.error("please configure the SLAVES machines!")
                return
            logger.info(f"initial tasks: {size}, spiders to start: {num}")
            pool = ThreadPoolExecutor(max_workers=num)
            for _ in pool.map(self.ssh_run, range(num)):
                ...  # wait for all threads to finish
        else:
            logger.name = "spiderman.model.standalone"
            logger.info(f"initial tasks: {size}, spiders to start: {num}")
            pool = ThreadPoolExecutor(max_workers=num)
            for _ in pool.map(self.run, range(num)):
                ...  # wait for all threads to finish
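
A usage sketch of the SPJob pattern above, assuming a hypothetical job subclass: the URL, callback identifier, and page count below are placeholders, not values from the source.

class zhifang_job(SPJob):
    def __init__(self):
        super().__init__(spider_name='zhifang')

    # override make_job with concrete seed requests
    def make_job(self, pages):
        for pagenum in range(1, pages + 1):
            req = ScheduledRequest(
                url=f'https://example.com/list?page={pagenum}',  # hypothetical list URL
                method='GET',
                callback='list',  # hypothetical callback identifier
                body={},
                meta={'pagenum': pagenum}
            )
            self.reqs.append(req)
        self.push()


if __name__ == '__main__':
    job = zhifang_job()
    job.delete()      # clear tasks left over from the previous run
    job.make_job(10)  # seed the first 10 list pages
    job.crawl(num=2)  # start 2 spider processes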
Example #5
    def push(self):
        redisctrl = RedisCtrl()
        redisctrl.reqs_push(self.redis_key, self.reqs)
        self.reqs.clear()