def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo status
    :param wb_mid: mid of the current weibo status
    :return: repost count, user id and user name of the status
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts = parse_status.get_repostcounts(html)
    comments = parse_status.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid, reposts=reposts, comments=comments)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    comments_count = parse_status.get_commentcounts(html)
    reposts_count = parse_status.get_repostcounts(html)
    root_user = user.get_profile(user_id)

    # store the information of the original (root) weibo status
    spread_original_dao.save(root_user, wb_mid, post_time, device, reposts_count, comments_count, root_url)
    crawler.info('this weibo has been reposted {counts} times'.format(counts=reposts_count))

    return reposts_count, user_id, user_name
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of weibo's backend limit, we can only crawl 5 pages of fans and followers.
    We also have no permission to view the fans and followers of enterprise users.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)
    user, is_crawled = user_get.get_profile(uid, domain)

    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid, domain), queue='fans_followers',
                      routing_key='for_fans_followers')
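# Hedged usage sketch: crawl_person_infos is presumably dispatched as a Celery task itself, in the
# same way it dispatches crawl_follower_fans above. The task name 'tasks.user.crawl_person_infos',
# the 'user_crawler' queue/routing key and the get_seed_ids() helper are illustrative assumptions,
# not confirmed by this code.
def excute_user_task():
    seeds = get_seed_ids()  # hypothetical helper returning not-yet-crawled seed users
    for seed in seeds:
        # fan out one crawl_person_infos task per seed user id
        app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,), queue='user_crawler',
                      routing_key='for_user_info')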
def get_status_info(url, user_id, name, mid=''):
    soc = SpreadOtherCache()
    print('current repost url: ' + url)
    repost_cont = get_page(url)

    if not is_404(repost_cont):
        repost_user_id = parse_status.get_userid(repost_cont)
        if repost_user_id == '':
            return None

        repost_user_name = parse_status.get_username(repost_cont)
        soc.set_id(repost_user_id)
        soc.set_name(repost_user_name)

        so = SpreadOther()
        so.id = repost_user_id
        so.screen_name = repost_user_name
        so.upper_user_name = parse_status.get_upperusername(repost_cont, name)
        cur_user = user.get_profile(repost_user_id)
        try:
            so.province = cur_user.province
            so.city = cur_user.city
            so.location = cur_user.location
            so.description = cur_user.description
            so.domain_name = cur_user.domain_name
            so.blog_url = cur_user.blog_url
            so.gender = cur_user.gender
            so.headimg_url = cur_user.headimg_url
            so.followers_count = cur_user.followers_count
            so.friends_count = cur_user.friends_count
            so.status_count = cur_user.status_count
            so.verify_type = cur_user.verify_type
            so.verify_info = cur_user.verify_info
            so.register_time = cur_user.register_time

            if so.screen_name == name:
                so.id = user_id

            so.mid = parse_status.get_mid(repost_cont)
            so.status_post_time = parse_status.get_statustime(repost_cont)
            so.device = parse_status.get_statussource(repost_cont)
            if mid:
                so.original_status_id = mid
            else:
                so.original_status_id = parse_status.get_orignalmid(repost_cont)
            so.comments_count = parse_status.get_commentcounts(repost_cont)
            so.reposts_count = parse_status.get_repostcounts(repost_cont)
            so.like_count = parse_status.get_likecounts(repost_cont)
            so.status_url = url
        except AttributeError as e:
            # TODO: find out what causes this
            logging.info('failed to parse {user_id}, traceback: {e}'.format(user_id=user_id, e=e))
            logging.info(r'source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
            return None
        else:
            return SpreadOtherAndCache(so, soc)
    else:
        return None
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If user domain is 100505, the url is just 100505 + userid;
    if user domain is 103505 or 100306, we need to request once more to get the detail info;
    if user type is enterprise or service, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None
def get_url_from_web(user_id, domain):
    """
    Get user info according to user id.
    If user domain is 100505, the url is just 100505 + userid;
    if user domain is 103505 or 100306, we need to request once more to get the detail info;
    if user type is enterprise or service, we just crawl their home page info.
    :param user_id: user id
    :param domain: user domain
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format(domain, user_id)
    html = get_page(url)

    if not is_404(html):
        # writers (special users)
        if domain == '103505' or domain == '100306':
            # url = base_url.format(domain, user_id)
            # html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None
def get_url_from_web(user_id):
    """
    Get user info by user id. If the user's domain is 100505, the detail page is returned directly;
    if it is 103505 or 100306, one more request is needed, because the base_url form only reaches the
    user's home page instead of the detail page; enterprise and service accounts are likewise redirected
    to their home page when visited via base_url, and since their detail pages are of little value
    we do not request them.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise accounts by default
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        # save user info to the database
        save_user(user)
        storage.info('successfully saved info of user {id}'.format(id=user_id))

        return user
    else:
        return None
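# Note (assumption): base_url and home_url are not defined in this snippet. Judging from the literal
# url used in get_profile further below ('http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more')
# and from home_url.format(uid) in crawl_person_infos, the two templates presumably look roughly like:
# base_url = 'http://weibo.com/p/{}{}/info?mod=pedit_more'   # domain + user id -> detail page
# home_url = 'http://weibo.com/u/{}'                         # user id -> home page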
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: whether the page may show a verification code (403 on search pages is not
                        handled yet); False means the url is an ajax link for reposts
    """
    crawler.info('the url to crawl is {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out,
                           verify=False).text.encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('crawling stopped at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)

            if is_404(page):
                crawler.warning('url {url} does not exist'.format(url=url))
                return ''

            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out,
                                       verify=False).text.encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the server timed out while crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the server refused the connection; sleeping for one minute, details: {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
def get_page(url, user_verify=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax urls never do; requests
                        for weibo statuses or user info may); False means the url is an ajax link for reposts
    :return: the response text; an empty string is returned on 404, 403 or any other exception
    """
    crawler.info('the url to crawl is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # switch cookies on every retry, and make them different from the last ones
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookies in the cookie pool, please check the accounts and the login task; '
                          'the crawler is exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to lower the risk of getting banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failure queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: if the url needs login, the value is True, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, you should use your phone to unlock it'.
                                    format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific infos are {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def _get_current_reposts(url, session, weibo_mid):
    """
    Modified main crawling routine. Because weibo's rate limiting is strict, only the current weibo
    and its child reposts are crawled for now; the original (source) weibo is not.
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid, reposts=reposts, comments=comments)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device, reposts_count, comments_count, root_url)
        crawler.info('this weibo has been reposted {counts} times'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)

            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)
            crawler.info('the repost info url is: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('failed to parse the repost info of {url} as json, details: {why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)

                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error('failed to parse the repost info of {url} as json, details: {why}'.format(
                            url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # collect the repost nodes of this page
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(repost_url, session, user_id, user_name,
                                                                 headers, mid)
                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('now at page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_others = list(set(spread_others))
                spread_other_dao.save(spread_others)
                crawler.info('got {num} repost records in total; repost crawling for this weibo is done'.format(
                    num=len(spread_others)))
    else:
        crawler.info('{url} is a 404 page'.format(url=url))
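# Hedged sketch of the repost ajax payload that _get_current_reposts relies on. Only the fields the
# code actually reads ('data' -> 'page' -> 'totalpage' and 'data' -> 'html') are certain; the values
# below are illustrative.
# {
#     "data": {
#         "page": {"totalpage": 42},          # number of repost pages, walked backwards with page_max as a cap
#         "html": "<div action-type=...>"     # html fragment parsed by status_parse.get_reposturls()
#     }
# }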
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax urls never do; requests
                        for weibo statuses or user info may); False means the url is an ajax link for reposts
    :param need_login: whether the page needs login; skipping login reduces pressure on the accounts
    :return: the response text; an empty string is returned on 404, 403 or any other exception
    """
    crawler.info('the url to crawl is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # switch cookies on every retry, and make them different from the last ones
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, checking whether any account is usable')
                rs = get_login_info()
                if len(rs) == 0:
                    crawler.error('no account is usable, please check account health')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If there are usable accounts, log them in. A local call is used here, which may not
                    # be reasonable: if the login queue is not on this machine, the call has no effect; with
                    # a network call, however, it is still unclear how to make sure the task does not land
                    # on a node in an unusual login location, or whether there is a better way around the
                    # remote-login restriction.
                    # TODO: weigh a network call against calling login.get_session() directly; the current
                    # approach is probably not ideal.
                    # Verification codes during node login are not handled for now; for large-scale account
                    # login, the login_queue nodes should be placed where the accounts usually log in.
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to lower the risk of getting banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failure queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax urls never do; requests
                        for weibo statuses or user info may); False means the url is an ajax link for reposts
    :param need_login: whether the page needs login; skipping login reduces pressure on the accounts
    :return: the response text; an empty string is returned on 404, 403 or any other exception
    """
    crawler.info('the url to crawl is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, please check the accounts')
                other.warning('shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to lower the risk of getting banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or \
                        is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked and needs to be unlocked by phone'.format(
                        name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failure queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_profile(user_id, session, headers):
    """
    Assume a normal (personal) user by default. Writers need one more request; enterprise users are
    redirected to their home page, which can be parsed directly.
    After logging in, http://weibo.com/u/userId seems to locate a user's home page, but its stability
    is unknown. TODO: test this path.
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' seems to handle most cases, except
    that non-normal users are redirected to their home page and some (e.g. domain=100106) do not work.
    """
    if user_id == '':
        return User()

    user = User()
    info = get_user(user_id)

    if info:
        user.id = user_id
        user.screen_name = info.get('name')
        user.province = info.get('province')
        user.city = info.get('city')
        user.location = info.get('location')
        user.description = info.get('description')
        user.headimg_url = info.get('headimg_url')
        user.blog_url = info.get('blog_url')
        user.domain_name = info.get('domain_name')
        user.gender = info.get('gender')
        user.followers_count = info.get('followers_count')
        user.friends_count = info.get('friends_count')
        user.status_count = info.get('status_count')
        user.birthday = info.get('birthday')
        user.verify_type = info.get('verify_type')
        user.verify_info = info.get('verify_info')
        user.register_time = info.get('register_time')

        # avoid encode() problems when inserting into the database
        for key in user.__dict__:
            if user.__dict__[key] is None:
                setattr(user, key, '')

        storage.info('info of user {id} already exists in the database'.format(id=user_id))
    else:
        url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
        html = get_page(url, session, headers)

        if not is_404(html):
            domain = get_publicinfo.get_userdomain(html)

            if domain == '100505' or domain == '103505' or domain == '100306':
                user = get_personalinfo.get_detail(html)
                if user is not None:
                    user.followers_count = get_personalinfo.get_fans(html)
                    user.friends_count = get_personalinfo.get_friends(html)
                    user.status_count = get_personalinfo.get_status(html)
                else:
                    user = User()
            else:
                # to crawl as few urls as possible, not every kind of service account is handled here
                if domain == '100106':
                    url = 'http://weibo.com/p/' + domain + user_id + '/home'
                    html = get_page(url, session, headers)
                    if html == '':
                        return user

                user.followers_count = get_enterpriseinfo.get_fans(html)
                user.friends_count = get_enterpriseinfo.get_friends(html)
                user.status_count = get_enterpriseinfo.get_status(html)
                user.description = get_enterpriseinfo.get_description(html).encode('gbk', 'ignore').decode('gbk')

            user.id = user_id
            user.screen_name = get_publicinfo.get_username(html)
            user.headimg_url = get_publicinfo.get_headimg(html)
            user.verify_type = get_publicinfo.get_verifytype(html)
            user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)

            save_user(user)
            storage.info('successfully saved info of user {id}'.format(id=user_id))

    return user
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax urls never do; requests
                        for weibo statuses or user info may); False means the url is an ajax link for reposts
    :param need_login: whether the page needs login; skipping login reduces pressure on the accounts
    :return: the response text; an empty string is returned on 404, 403 or any other exception
    """
    crawler.info('the url to crawl is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # switch cookies on every retry and make them different from last time;
            # if there is only one account, reusing the same cookies is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, checking whether any account is usable')
                rs = get_login_info()

                # pick healthy accounts to log in with; if none is usable, stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is usable, please check account health')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            # only switch to different cookies on retries when more than one cookie is available
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to lower the risk of getting banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failure queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
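# Hedged usage sketch for the get_page variants above: callers treat an empty string as "give up on
# this url" (404/403, a frozen account, or retries exhausted). The info page url below mirrors the
# pattern used elsewhere in this code but is still an assumption, and crawl_user_detail itself is
# only illustrative.
def crawl_user_detail(uid):
    info_url = 'http://weibo.com/p/100505{}/info?mod=pedit_more'.format(uid)
    html = get_page(info_url, user_verify=True, need_login=True)
    if not html:
        # empty string: 404/403, frozen account, or max_retries exceeded
        return None
    return get_user_detail(uid, html)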