def start_requests(self):
    if not self.max_page:
        self.max_page = 1
    page = 0
    while page < self.max_page:
        page += 1
        for i in range(len(self.start_urls)):
            url = self.start_urls[i]
            url = url.format(page=page)
            log.info(f'Crawling URL {i + 1}/{len(self.start_urls)}, page {page}/{self.max_page}, {url}')
            yield scrapy.Request(url=url)
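# A minimal sketch of a spider the start_requests above would work with: every
# start_urls entry must contain a "{page}" placeholder filled in via str.format.
# The URL and page count below are hypothetical, not the project's actual values:
import scrapy

class ExamplePageSpider(scrapy.Spider):
    name = 'example_page'
    start_urls = ['https://example.com/list?page={page}']  # hypothetical template
    max_page = 3  # crawls pages 1..3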
def process_request(self, request, spider):
    """Attach a proxy to the outgoing request."""
    if self.use_unique_proxy:
        if not self.unique_proxy:
            self.unique_proxy = proxy_manager.get()
            log.info(f'use unique proxy {self.unique_proxy}')
        proxy = self.unique_proxy
    else:
        proxy = proxy_manager.get()
        log.info(f'use random proxy {proxy}')
    request.meta["proxy"] = str(proxy)
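# For either process_request variant to run, the middleware must be enabled in the
# Scrapy project settings. A minimal sketch; the module path and priority are
# assumptions, not the project's actual values:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,  # hypothetical path
}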
def fail(self, proxy: ProxyItem):
    """Called when a request through this proxy fails."""
    # Increment the failure count
    self._execute(self._sql_add_times, proxy, key='fail_times')
    # Fetch the current failure count
    sql = self._sql_get_fail_times_by_ip.format(**proxy)
    record = asyncio.get_event_loop().run_until_complete(self.manager.conn.fetchrow(sql))
    if record:
        fail_times = record['fail_times']
        if fail_times and fail_times >= 5:
            # 5 failures or more: mark the proxy as unavailable
            log.info(f'{proxy} fail set available=0')
            self._execute(self._sql_update_fail, proxy)
        else:
            log.info(f'{proxy} fail set fail_times={fail_times}')
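# fail() mixes the synchronous _execute helper with a direct asyncpg-style fetchrow
# call. The helper itself is not shown in the source, so this is only a guess at its
# shape under that assumption:
def _execute(self, sql_template: str, proxy: ProxyItem, **extra):
    """Format the SQL template with the proxy's fields and run it to completion."""
    sql = sql_template.format(**proxy, **extra)
    return asyncio.get_event_loop().run_until_complete(self.manager.conn.execute(sql))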
def filter_proxy_list(self, proxy_list):
    """Validate and save the scraped proxies."""
    log.info(f'Scraped {len(proxy_list)} proxies from {self.name}, checking validity')
    # Tag each proxy with its source
    for proxy in proxy_list:
        proxy['source_domain'] = self.name
    # Create a fresh filter each time; each spider filters and saves its own proxies
    proxy_filter = self.get_proxy_filter(proxy_list)
    if proxy_filter is not None:
        available_proxy_list = proxy_filter.filter()
    else:
        available_proxy_list = proxy_list
    log.info(f'{self.name}: {len(available_proxy_list)}/{len(proxy_list)} proxies are valid')
    return available_proxy_list
def validate_response(self, proxy, result) -> bool:
    """Validate a proxy by scraping Douyin directly, so also parse the returned data."""
    status_code = result['status_code']
    if status_code == 0:
        aweme_list = result['aweme_list']
        proxy['available'] = 1
        log.info(f'{proxy} - proxy valid, scraped {len(aweme_list)} items')
        for aweme in aweme_list:
            item = DouyinItem(aweme)
            self.items.append(item)
        return True
    elif status_code == 2154:
        proxy['available'] = 2
        proxy['banned_time'] = time.time()
        log.info(f'{proxy} - proxy valid but banned, status code {status_code}')
        return True
    else:
        log.info(f'{proxy} - proxy invalid, status code {status_code}')
        return False
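# validate_response only reads two fields of the decoded Douyin response. The minimal
# shapes it handles, reconstructed from the branches above:
result_ok = {'status_code': 0, 'aweme_list': []}  # proxy valid; items get parsed
result_banned = {'status_code': 2154}             # proxy valid but the device is banned
result_bad = {'status_code': -1}                  # anything else: proxy treated as invalid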
def start_requests(self):
    if not self.max_page:
        self.max_page = 1
    keyword_length = len(self.keyword_list)
    for i in range(keyword_length):
        self.keyword = self.keyword_list[i]
        page = 0
        while page < self.max_page:
            page += 1
            url_length = len(self.start_urls)
            for j in range(url_length):
                url = self.start_urls[j]
                url = url.format(keyword=urllib.parse.quote(self.keyword), page=page)
                log.info(
                    f'Crawling keyword {i + 1}/{keyword_length}, URL {j + 1}/{url_length}, '
                    f'page {page}/{self.max_page}, {url}')
                yield scrapy.Request(url=url)
def start_requests(self):
    self.statistics.start_time = time.time()
    i = 0
    self.statistics.start_craw_count = self.manager.count()
    log.info(f'Item count before crawling: {self.statistics.start_craw_count}')
    # i is never incremented, so this loops forever; the break below is the only exit
    while i < 1:
        # i += 1
        if not ANONYMOUS:
            log.info(f'sleep {self.sleep_time}')
            time.sleep(self.sleep_time)
            self.sleep_time = 1
        # Under concurrency the timestamps were identical, so Scrapy treated the URLs
        # as duplicates and dropped them; the fix turned out to be dont_filter=True
        anonymous = ANONYMOUS
        url = douyin.generate_feed_url('http', anonymous)
        headers = douyin.generate_headers(anonymous)
        cookies = douyin.generate_cookies(anonymous)
        self.statistics.crawled_pages += 1
        log.info(f'crawl {self.statistics.crawled_pages} page:' + url)
        yield scrapy.Request(url=url, headers=headers, cookies=cookies, dont_filter=True)
        if self.has_more == 0 or self.exit_code == 0:
            break
def banned(self, proxy: ProxyItem):
    """Called when the proxy gets banned."""
    log.info(f'{proxy} banned')
    self._execute(self._sql_update_banned, proxy)
def success(self, proxy: ProxyItem):
    """Called when a request through this proxy succeeds."""
    log.info(f'{proxy} success')
    self._execute(self._sql_update_success, proxy)
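# A sketch of how the success/fail/banned hooks above might be driven from a Scrapy
# downloader middleware. Everything here is an assumption: the meta key, the ban
# status codes, and that proxy_manager exposes these hooks alongside get():
class ProxyFeedbackMiddleware:
    BAN_STATUS_CODES = {403, 429}  # hypothetical

    def process_response(self, request, response, spider):
        proxy = request.meta.get('proxy_item')  # hypothetical meta key
        if proxy:
            if response.status in self.BAN_STATUS_CODES:
                proxy_manager.banned(proxy)
            else:
                proxy_manager.success(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get('proxy_item')
        if proxy:
            proxy_manager.fail(proxy)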
def save_items(self):
    log.info(f'Scraped {len(self.items)} items in total, saving')
    for item in self.items:
        self.pipeline.process_item(item, None)
    self.items.clear()
    self.pipeline.close_spider(None)
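# save_items() drives the item pipeline by hand and passes None as the spider. A
# minimal sketch of a pipeline that tolerates that calling convention; the storage
# step is left abstract because the real backend is not shown in the source:
class DouyinPipeline:
    def process_item(self, item, spider):
        # spider may be None when called outside a real crawl (see save_items above)
        ...  # persist the item here
        return item

    def close_spider(self, spider):
        # flush buffers / close connections here
        pass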
def parse(self, response):
    try:
        body = response.body.decode()
        if body == 'error':
            log.warning('Body is "error"; the exception was already intercepted')
            return
        result = json.loads(body)
        status_code = result['status_code']
        if status_code == 0:
            self.has_more = result['has_more']
            aweme_list = result['aweme_list']
            # Item count before saving
            before_item_count = self.manager.count()
            for aweme in aweme_list:
                item = DouyinItem(aweme)
                yield item
            # Item count after saving, to tell how many items were actually new
            current_item_count = self.manager.count()
            self.statistics.crawled_success__pages += 1
            self.statistics.crawled_items += len(aweme_list)
            minute = (time.time() - self.statistics.start_time) / 60
            available_count = current_item_count - before_item_count
            if available_count <= 2:
                # Few new items; back off before the next request
                if self.statistics.few_available_result_times >= 5:
                    self.statistics.few_available_result_times = 0
                    log.info('Five crawls in a row returned few valid results, waiting 600 s')
                    self.sleep_time = 600
                else:
                    self.statistics.few_available_result_times += 1
                    log.info('Few valid results in this crawl, waiting 60 s')
                    self.sleep_time = 60
            else:
                self.statistics.few_available_result_times = 0
            log.info(f'scraped {len(aweme_list)} items, available {available_count} items.')
            speed = self.statistics.crawled_items / minute
            log.info(
                f'scraped {self.statistics.crawled_success__pages}/{self.statistics.crawled_pages} pages, '
                f'{current_item_count - self.statistics.start_craw_count}/{self.statistics.crawled_items} items, '
                f'spend {self.parse_time(minute)}, speed {speed:#.2f} items/min.')
        elif status_code == 2145:
            log.warning('Request expired')
            self.exit_code = 0
        elif status_code == 2151:
            log.warning('Signature error')
            self.exit_code = 0
        elif status_code == 2154:
            # The ban usually lasts about an hour
            log.warning('Too many requests, device banned')
            if ANONYMOUS:
                # Already intercepted in the downloader middleware; should not reach here
                pass
            else:
                # When not anonymous we have to back off
                log.warning('Resting for 10 minutes')
                self.sleep_time = 10 * 60
                # Only sleep, do not exit
                # self.exit_code = 0
        else:
            log.warning(f'Error code {status_code}')
            log.warning(response.body.decode())
            self.exit_code = 0
    except Exception as e:
        # TODO handle proxy parsing errors here, or deal with them in the middleware
        log.error('Parsing failed')
        log.error(repr(e))
def crawl_in_loop(self, runner):
    """Crawl all proxy spiders in an endless loop."""
    # Collect the spider classes
    spider_list = []
    for spider_class in iter_spider_classes(regex_proxy_spider):
        ip_count = getattr(spider_class, 'ip_count', 0)
        if ip_count > 0:
            spider_list.append(spider_class)
    all_loop = ProxyCounter()
    single_loop = ProxyCounter()
    # Start counting now; tally after each round finishes
    all_loop.start()
    # Endless loop
    loop_times = 0
    while loop_times >= 0:
        loop_times += 1
        # Restart at the beginning of each round; tally after each spider finishes
        single_loop.start()
        while single_loop.available.start_num > 100:
            log.info(f'{single_loop.available.start_num} valid IPs available, resting 10 minutes')
            time.sleep(60 * 10)
            single_loop.start()
        # Count at the start of the round
        log.info(f'Round {loop_times} started')
        # Crawl
        for i in range(len(spider_list)):
            spider = spider_list[i]
            log.info(
                f'Round {loop_times}, spider {i + 1}/{len(spider_list)} {spider.name} started, '
                f'{single_loop.print_count()}')
            try:
                yield runner.crawl(spider)
            except SystemExit:
                pass
            sleep_time = 10
            divider = '-' * 10
            single_loop.count()
            log.info(
                f'{divider}Round {loop_times}, spider {i + 1}/{len(spider_list)} {spider.name} finished, '
                f'{single_loop.print_count()}{divider}')
            log.info(f'Waiting before the next spider, sleep {sleep_time}')
            time.sleep(sleep_time)
        # Delay before the next round
        sleep_time = 60
        log.info(f'Round finished, waiting for the next one, sleep {sleep_time}')
        all_loop.count()
        log.info(all_loop.print_count())
        time.sleep(sleep_time)
    # noinspection PyUnresolvedReferences
    reactor.stop()
def process_request(self, request, spider):
    proxy = proxy_manager.get()
    log.info(f'use random proxy {proxy}')
    request.meta["proxy"] = str(proxy)
def crawl_in_loop(self, runner):
    """Crawl all proxy spiders in an endless loop."""
    # Collect the spider classes
    spider_list = []
    for spider_class in iter_spider_classes(regex_proxy_spider):
        ip_count = getattr(spider_class, 'ip_count', 0)
        if ip_count > 0:
            spider_list.append(spider_class)
    loop_times = 0
    loop_end_count = 0
    all_loop_proxy_count = 0
    """Total number of proxies scraped across all rounds"""
    # Endless loop
    while loop_times >= 0:
        loop_times += 1
        # Count at the start of the round
        if loop_end_count == 0:
            # First round: query the store
            loop_start_count = proxy_manager.count()
        else:
            # Later rounds: reuse the count taken at the end of the previous round
            loop_start_count = loop_end_count
        log.info(f'Round {loop_times} started, {loop_start_count} IPs in total')
        # Crawl
        spider_end_count = 0
        for i in range(len(spider_list)):
            spider = spider_list[i]
            if spider_end_count == 0:
                spider_start_count = loop_start_count
            else:
                spider_start_count = spider_end_count
            log.info(
                f'Round {loop_times}, spider {i + 1}/{len(spider_list)} {spider.name} started, '
                f'{spider_start_count} IPs in total')
            try:
                yield runner.crawl(spider)
            except SystemExit:
                pass
            sleep_time = 10
            spider_end_count = proxy_manager.count()
            spider_crawled_count = spider_end_count - spider_start_count
            # Number scraped so far in this round
            loop_crawled_count = spider_end_count - loop_start_count
            all_loop_proxy_count += loop_crawled_count
            divider = '-' * 10
            log.info(
                f'{divider}Round {loop_times}, spider {i + 1}/{len(spider_list)} {spider.name} finished, '
                f'scraped {spider_crawled_count}/{loop_crawled_count}/{all_loop_proxy_count} proxies{divider}')
            log.info(f'Waiting before the next spider, sleep {sleep_time}')
            log.info(f'{proxy_manager.available_count()} valid proxies at the moment')
            time.sleep(sleep_time)
        # Count at the end of the round
        loop_end_count = proxy_manager.count()
        # Delay before the next round
        sleep_time = 60
        log.info(
            f'Scraped {loop_end_count - loop_start_count}/{loop_end_count} proxies this round, '
            f'waiting for the next one, sleep {sleep_time}')
        log.info(f'{proxy_manager.available_count()} valid proxies at the moment')
        time.sleep(sleep_time)
    reactor.stop()
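# Both crawl_in_loop variants yield Deferreds from runner.crawl, so they are meant to
# be decorated with @defer.inlineCallbacks and driven by the Twisted reactor. A
# minimal bootstrap sketch, assuming that decorator and a hypothetical `scheduler`
# object that owns the method:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
scheduler.crawl_in_loop(runner)  # schedules the endless loop on the reactor
reactor.run()                    # blocks until the loop calls reactor.stop()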