def process_response(self, request, response, spider):
    """Handle the downloader response for a proxied request.

    Two failure modes are considered: the proxy being banned by the
    target site, and the proxy having simply stopped working.

    :param request: the request that produced this response
    :param response: the downloaded response (may be an ErrorResponse)
    :param spider: the running spider
    :return: a Response to pass on, or whatever on_request_error yields
    """
    # This middleware is a single shared instance, so under concurrency
    # self-held proxy state would be wrong — read it from the request.
    current_proxy = ProxyItem.parse(request.meta['proxy'])
    if isinstance(response, ErrorResponse):
        # Transport-level failure: count it against the proxy and retry.
        proxy_manager.fail(current_proxy)
        return self.on_request_error(request, response, spider)
    status, _ = douyin.parse_result(response.body.decode())
    if status == 1:
        # Parsed successfully — record the win and pass the response on.
        proxy_manager.success(current_proxy)
        return response
    if status == 2:
        # The proxy got banned by the target site.
        proxy_manager.banned(current_proxy)
        if douyin_spider.ANONYMOUS:
            # Anonymous mode: drop this proxy and retry elsewhere.
            return self.on_request_error(request, response, spider)
        return response
    # Any other status counts as a plain failure.
    proxy_manager.fail(current_proxy)
    return self.on_request_error(request, response, spider)
async def fetch(self, sql):
    """Run *sql* and enqueue every returned record as a ProxyItem.

    :param sql: the SELECT statement to execute
    """
    rows = await self.manager.fetch(sql)
    if not rows:
        return
    print(f'读取到 ip {len(rows)} 个')
    for row in rows:
        self._proxy_queue.put(ProxyItem(**row))
class TestProxyManager(TestCase):
    """Exercise ProxyManager's public operations against the live store."""

    # Shared fixture proxy used by the mutation tests.
    item = ProxyItem(ip='123')

    def test_count(self):
        print(ProxyManager().count())

    def test_available_count(self):
        print(ProxyManager().available_count())

    def test_get(self):
        manager = ProxyManager()
        print(manager.get())

    def test_remove(self):
        ProxyManager().remove(self.item)

    def test_success(self):
        manager = ProxyManager()
        manager.success(ProxyItem(self.item))

    def test_fail(self):
        manager = ProxyManager()
        manager.fail(ProxyItem(self.item))

    def test_banned(self):
        manager = ProxyManager()
        manager.banned(ProxyItem(self.item))

    def test_sql(self):
        # Dump every prepared SQL template held by the shared manager.
        pm = proxy_manager
        for attr_name, attr_value in pm.__dict__.items():
            if attr_name.startswith('_sql'):
                print(f'{attr_name} = {attr_value}')
def process_exception(self, request, exception, spider):
    """Handle a download exception.

    Known timeouts/connection failures are converted into an
    ErrorResponse and handed to process_response.

    Per Scrapy's middleware contract:
    - returning None lets the remaining middlewares' process_exception
      run, falling back to the default exception handling;
    - returning a Response sends it through the process_response chain
      and stops further process_exception calls;
    - returning a Request re-schedules the download and likewise stops
      the process_exception chain.
    """
    # Known transport failures; fail() is deliberately NOT called here —
    # the returned ErrorResponse is accounted for in process_response.
    known_failures = (ConnectError, TunnelError, ResponseFailed)
    if isinstance(exception, known_failures):
        return self.get_error_response(exception)
    # Unknown exception: log it (with the proxy, when one was attached)
    # so it can be investigated, then still hand back an error response.
    proxy = ''
    if 'proxy' in request.meta:
        proxy = ProxyItem.parse(request.meta['proxy'])
    log.error(f"process_exception 还有未处理的异常"
              f"\nproxy is {proxy}"
              f"\ntype is {type(exception)}"
              f"\nexception is {exception}")
    return self.get_error_response(exception)
def process_exception(self, request, exception, spider):
    """Handle a download exception for a proxied request.

    Known network failures are converted into an ErrorResponse so that
    process_response can do the proxy bookkeeping; unknown exceptions
    are logged and — by implicitly returning None — left to Scrapy's
    default exception handling.
    """
    # Fix: the request may not carry a proxy (same guard as the sibling
    # middleware) — a plain request.meta['proxy'] would raise KeyError.
    proxy = ''
    if 'proxy' in request.meta:
        proxy = ProxyItem.parse(request.meta['proxy'])
    if isinstance(exception, IgnoreRequest):
        # Already ignored upstream.
        return self.get_error_response(exception)
    # Known connection-level failures; fail() is deliberately not called
    # here — process_response records the failure when it sees the
    # returned ErrorResponse.
    fail_exceptions = (
        error.ConnectError,
        error.ConnectionRefusedError,
        error.TCPTimedOutError,
        error.TimeoutError,  # exceeded the configured timeout
        ResponseNeverReceived,
    )
    if isinstance(exception, fail_exceptions):
        return self.get_error_response(exception)
    # Unknown exception: log everything useful and fall through (None).
    log.error("process_exception")
    log.error(f'proxy is {proxy}')
    log.error(type(exception))
    log.error(exception)
def _execute(self, sql, proxy=None, key=None):
    """Render *sql* with the proxy's fields (when given) and run it.

    :param sql: SQL template, formatted with now/key/proxy fields
    :param proxy: a proxy string or ProxyItem supplying the fields
    :param key: name substituted into templates that use {key}
    """
    if proxy:
        fields = None
        if isinstance(proxy, str):
            fields = ProxyItem.parse(proxy)
        elif isinstance(proxy, ProxyItem):
            fields = proxy
        if fields is not None:
            sql = sql.format(now=int(time.time()), key=key, **fields)
    # Drive the async manager synchronously.
    asyncio.get_event_loop().run_until_complete(self.manager.execute(sql))
def parse_proxy_from_element(element, element_xpath_formatter=None, element_xpath_dict=None):
    """Build a ProxyItem from a page element.

    :param element: the element to run xpath queries against
    :param element_xpath_formatter: default ``%``-style xpath template
    :param element_xpath_dict: maps ProxyItem field names to either a
        value substituted into the formatter, or a ``'value-template'``
        string that supplies its own xpath template
    :return: a ProxyItem with every field the element provided
    """
    proxy_item = ProxyItem()
    if element_xpath_formatter and element_xpath_dict:
        for field, spec in element_xpath_dict.items():
            if '-' in spec:
                # Fix: split only on the first dash so templates that
                # themselves contain '-' don't break the unpacking.
                value, template = spec.split('-', 1)
            else:
                value, template = spec, element_xpath_formatter
            matches = element.xpath(template % value)
            # Fix: guard the lookup — a non-matching xpath used to raise
            # IndexError; skip the missing field instead.
            if matches:
                proxy_item[field] = matches[0]
    return proxy_item
def parse_proxy_list(self, text):
    """Parse proxies out of *text*, de-duplicating by ip.

    :param text: raw page text containing proxy entries
    :return: list of ProxyItem, first occurrence of each ip kept
    """
    # Drop newlines so the regex can match across line breaks.
    text = text.replace('\n', '')
    proxy_list = []
    # Fix: track seen ips in a set — the original re-scanned the whole
    # result list per match (accidental O(n^2)). Output is unchanged.
    seen_ips = set()
    for match in re.findall(self.pattern, text):
        ip, port, http_type = self.parse_proxy_from_match(match)
        if ip in seen_ips:
            continue
        seen_ips.add(ip)
        proxy_item = ProxyItem()
        proxy_item['ip'] = ip
        proxy_item['port'] = port
        proxy_item['http_type'] = http_type
        proxy_list.append(proxy_item)
    return proxy_list
def process_response(self, request, response, spider):
    """Handle the downloader response.

    An ErrorResponse is recorded as a proxy failure; otherwise the body
    is inspected and the proxy is marked successful or banned.

    Per Scrapy's middleware contract: returning a Response passes it on
    to the remaining middlewares' process_response; returning a Request
    stops the chain and re-schedules the download (as if returned from
    process_request); raising IgnoreRequest invokes the request's
    errback, and is silently dropped if nothing handles it.
    """
    if 'proxy' not in request.meta:
        # No proxy attached — nothing to record for the pool.
        if isinstance(response, ErrorResponse):
            print(f'忽略')
            raise IgnoreRequest()
        return response
    proxy = ProxyItem.parse(request.meta['proxy'])
    if isinstance(response, ErrorResponse):
        # The request itself failed: log a failure and retry.
        self.unique_proxy = ''
        print('请求失败,记录失败,重新请求')
        proxy_manager.fail(proxy)
        return self.on_request_error(request, response, spider)
    if not self.is_success_response(response):
        # Body did not parse as a success: banned or plain failure.
        self.unique_proxy = ''
        print('回复解析失败,记录失败,重新请求')
        if self.is_banned_response(response):
            proxy_manager.banned(proxy)
        else:
            proxy_manager.fail(proxy)
        return self.on_request_error(request, response, spider)
    # Success path; a dedicated (unique) proxy needs no bookkeeping.
    if not self.use_unique_proxy:
        print('回复解析成功,记录成功')
        proxy_manager.success(proxy)
    return response
def test_banned(self):
    """Mark the fixture proxy as banned."""
    manager = ProxyManager()
    manager.banned(ProxyItem(self.item))
def test_fail(self):
    """Mark the fixture proxy as failed."""
    manager = ProxyManager()
    manager.fail(ProxyItem(self.item))
def test_success(self):
    """Mark the fixture proxy as successful."""
    manager = ProxyManager()
    manager.success(ProxyItem(self.item))
def __init__(self):
    # Prepare the SQL templates and open the database connection.
    item = ProxyItem()
    self.manager = PostgreSQLManager(item)
    self._proxy_queue = queue.Queue()
    # --- prepared SQL templates ---
    table_name = item.get_table_name()
    # A proxy counts as usable when available=1, or when it was banned
    # (available=2) more than one hour ago.
    available_condition = """
    WHERE available=1 OR (available=2 AND EXTRACT(EPOCH from NOW()- INTERVAL'1 HOUR') > banned_time)
    """
    # Order by usage count so every stored proxy eventually gets used.
    # Under concurrency an ip could be handed out again after roughly
    # one request's duration (~1s); multiplying the concurrency by 10
    # delays a repeat until about 10 rounds of requests have passed.
    limit = douyin_spider.CONCURRENT_REQUESTS * 10
    if limit > 200:
        limit = 200
    self._sql_fetch_available = item.generate_get_sql() + available_condition + f"""
    ORDER BY used_times LIMIT {limit}
    """
    """获取可用的"""
    self._sql_count = f"""
    SELECT COUNT(*) FROM {table_name}
    """
    """统计数量"""
    self._sql_available_count = f"""
    SELECT COUNT(*) FROM {table_name} {available_condition}
    """
    """统计有效数量"""
    primary_key = 'ip'
    # Two-stage template: '%s' is filled with a SET clause first; the
    # triple braces collapse to one literal {ip} placeholder that is
    # resolved later by str.format in _execute.
    self._sql_update = f"""
    UPDATE {table_name} %s WHERE {primary_key} = '{{{primary_key}}}'
    """
    """后面的 3 个 {} 先求中间值,外面 2 个{{}}表示 1 个{} 用于格式化"""
    self._sql_get_fail_times_by_ip = f"""
    SELECT fail_times FROM {table_name} WHERE {primary_key} = '{{{primary_key}}}'
    """
    """获取失败次数"""
    self._sql_add_times = self._sql_update % 'SET {key}={key}+1 ,update_time={now}'
    """添加次数"""
    self._sql_update_success = self._sql_update % 'SET success_times=success_times+1 ,' \
                                                  ' update_time={now} , available=1'
    """更新为成功"""
    self._sql_update_fail = self._sql_update % 'SET update_time={now} , available=0'
    """更新为失败"""
    self._sql_update_banned = self._sql_update % 'SET fail_times=fail_times+1 ,' \
                                                 ' banned_time={now} , update_time={now} , available=2'
    """更新为被禁"""
    # Initialisation: connect and make sure the backing table exists.
    asyncio.get_event_loop().run_until_complete(self.manager.connect_database())
    asyncio.get_event_loop().run_until_complete(self.manager.create_table())
class ProxyPostgreSQLPipeline(BasePostgreSQLPipeline):
    """Persist proxies to PostgreSQL (保存代理)."""
    # Template item: tells the base pipeline which table/fields to use.
    item = ProxyItem()
def remove(self, proxy: ProxyItem):
    """Delete *proxy* from the database."""
    delete_sql = proxy.generate_delete_sql()
    self._execute(delete_sql)
class ProxyPostgreSQLPipelineTest(BasePostgreSQLPipelineTest):
    # Exercises ProxyPostgreSQLPipeline through the shared base test.
    pipeline = ProxyPostgreSQLPipeline()
    # Fixture row the base test inserts.
    insert_item = ProxyItem(ip=1)
def test_insert(self):
    """Open the pipeline and push a single item through it."""
    target = ProxyPostgreSQLPipeline()
    target.open_spider(None)
    target.process_item(ProxyItem(ip=1), None)