from tasks import login if __name__ == '__main__': # 由于celery的定时器有延迟,所以第一次需要手动 login.excute_login_task()
def get_page(url, user_verify=True, need_login=True): """ :param url: 待出现 :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接 :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 latest_name_cookies = None while count < max_retries: if need_login: # 每次重试的时候都换cookies,并且和上次不同 name_cookies = Cookies.fetch_cookies() if name_cookies is None: crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号') rs = get_login_info() if len(rs) == 0: crawler.error('账号均不可用,请检查账号健康状况') # 杀死所有关于celery的进程 if 'win32' in sys.platform: os.popen('taskkill /F /IM "celery*"') else: os.popen('pkill -f "celery"') else: # 如果有可用账号,那么就拿来登录,这里用了本地调用,好像不是很合理,因为如果login queue # 不会在本机上,那么该调用就会无效但是用网络调用,如何保证不会出现在某些不常用登录地的节点 # 上还有待考量,亦或者找到一个更适合的方法可以突破异地登录的限制 # TODO 衡量是用网络调用还是直接调用 login.get_session()方法,这里应该不是很合理 # 目前暂时不考虑节点登录出现验证码的问题, 考虑到大规模账号登录的话,需要把login_queue的节点放在账号常用地 crawler.info('重新获取cookie中...') login.excute_login_task() time.sleep(10) if name_cookies == latest_name_cookies: continue latest_name_cookies = name_cookies try: if need_login: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) if "$CONFIG['islogin'] = '******'" in resp.text: crawler.warning('账号{}出现异常'.format(name_cookies[0])) freeze_account(name_cookies[0]) Cookies.delete_cookies(name_cookies[0]) continue else: resp = requests.get(url, headers=headers, timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403( page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0]) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''
# coding:utf-8 from tasks import login if __name__ == '__main__': # you should execute this file, because celery timer will execute login delayed login.excute_login_task()
def get_page(url, user_verify=True, need_login=True): """ :param url: 待抓取url :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接 :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 while count < max_retries: if need_login: # 每次重试的时候都换cookies,并且和上次不同,如果只有一个账号,那么就允许相同 name_cookies = Cookies.fetch_cookies() if name_cookies is None: crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号') rs = get_login_info() # 选择状态正常的账号进行登录,账号都不可用就停掉celery worker if len(rs) == 0: crawler.error('账号均不可用,请检查账号健康状况') # 杀死所有关于celery的进程 if 'win32' in sys.platform: os.popen('taskkill /F /IM "celery*"') else: os.popen('pkill -f "celery"') else: crawler.info('重新获取cookie中...') login.excute_login_task() time.sleep(10) try: if need_login: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) if "$CONFIG['islogin'] = '******'" in resp.text: crawler.warning('账号{}出现异常'.format(name_cookies[0])) freeze_account(name_cookies[0], 0) Cookies.delete_cookies(name_cookies[0]) continue else: resp = requests.get(url, headers=headers, timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403( page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0], 0) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if 'verifybmobile' in resp.url: crawler.warning('账号{}功能被锁定,需要手机解锁'.format(name_cookies[0])) freeze_account(name_cookies[0], -1) Cookies.delete_cookies(name_cookies[0]) continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''
def get_page(url, user_verify=True): """ :param url: 待出现 :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 latest_name_cookies = None while count < max_retries: # 每次重试的时候都换cookies,并且和上次不同 name_cookies = Cookies.fetch_cookies() if name_cookies is None: crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号') rs = get_login_info() if len(rs) == 0: crawler.error('账号均不可用,请检查账号健康状况') sys.exit(-1) else: # 如果有可用账号,那么就拿来登录 crawler.info('重新获取cookie中...') login.excute_login_task() time.sleep(10) if name_cookies == latest_name_cookies: continue latest_name_cookies = name_cookies try: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or is_403(page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0]) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''