def get_redirect(name, data, post_url, session):
    """Post the login form and extract the next url to request.

    :param name: login user name (used for logging and account freezing)
    :param data: form data to submit; the invariant fields were captured
        from packet sniffing
    :param post_url: the sso login post address
    :param session: requests session carrying the login state
    :return: the url the server redirects to next; '' on failure, and the
        sentinel 'pinerror' when the captcha was answered incorrectly so
        callers can handle that case specially
    """
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # retcode=101 means a wrong account/password: freeze with flag 2
    if 'retcode=101' in login_loop:
        crawler.error('账号{}的密码不正确'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('输入的验证码不正确')
        return 'pinerror'

    # bugfix: the original condition was `'正在登录' or 'Signing in' in login_loop`,
    # which is always truthy because a non-empty string literal is truthy;
    # each substring must be tested against the page explicitly
    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
def get_session(name, password):
    """Log *name* in and return an authenticated session, or None on failure.

    After login the account is probed against a public profile page to make
    sure it has not been forbidden, then a test direct message is posted
    through the webim API.
    """
    login_url, yundama_obj, cid, session = do_login(name, password)
    if login_url != '':
        page_text = session.get(login_url, headers=headers).text
        matched = re.search(r'"uniqueid":"(.*)",', page_text)
        if matched and matched.group(1):
            # probe a public account page to verify the login really works
            probe = session.get('http://weibo.com/2671109275/about', headers=headers)
            if is_403(probe.text):
                other.error('account {} has been forbidden'.format(name))
                freeze_account(name, 0)
                return None
            other.info('The login account is {}'.format(name))
            cookies = session.cookies
            reply = requests.post(
                'http://api.weibo.com/webim/2/direct_messages/new.json?source=209678993',
                data={'text': 'what are u doing', 'uid': 1794652091},
                cookies=cookies, headers=personal_message_headers)
            print(reply.text)
            # Cookies.store_cookies(name, session.cookies.get_dict())
            return session
    other.error('login failed for {}'.format(name))
    return None
def get_session(name, password):
    """Log in with *name*/*password*; persist cookies and return the session.

    Returns None when the login fails or the account turns out to be frozen.
    """
    url, yundama_obj, cid, session = do_login(name, password)

    uid_match = None
    if url != '':
        login_page = session.get(url, headers=headers).text
        uid_match = re.search(r'"uniqueid":"(.*)",', login_page)

    if not (uid_match and uid_match.group(1)):
        other.error('login failed for {}'.format(name))
        return None

    # check if account is valid by opening a public profile page
    probe = session.get('http://weibo.com/2671109275/about', headers=headers)
    if is_403(probe.text):
        other.error('account {} has been forbidden'.format(name))
        freeze_account(name, 0)
        return None

    other.info('Login successful! The login account is {}'.format(name))
    Cookies.store_cookies(name, session.cookies.get_dict())
    return session
def get_redirect(name, data, post_url, session):
    """Post login data and return the redirect url the server replies with.

    :param name: login account, used for logging and freezing
    :param data: form data to post (mostly invariant fields)
    :param post_url: sso login endpoint
    :param session: requests session used for the login flow
    :return: redirect url on success; '' on failure; 'pinerror' when the
        captcha was wrong; 'login_need_pincode' when a captcha is required
    """
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in login_loop:
        crawler.error(
            'invalid password for {}, please ensure your account and password'.
            format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} need verification for login'.format(name))
        return 'login_need_pincode'

    # bugfix: `'正在登录' or 'Signing in' in login_loop` was always truthy
    # (non-empty literal); test both substrings against the response
    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
def get_session(name, password, need_verify):
    """Obtain an authenticated session for *name*, retrying on captcha errors.

    Re-runs do_login while the captcha service keeps answering incorrectly
    (reporting each failure back), then validates the resulting session by
    opening an unrelated profile page. Returns the session on success, None
    otherwise.
    """
    url, yundama_obj, cid, session = do_login(name, password, need_verify)

    # report mis-recognized captchas back to yundama and try again
    while url == 'pinerror' and yundama_obj is not None:
        yundama_obj.report_error(cid)
        url, yundama_obj, cid, session = do_login(name, password, need_verify)

    uid_match = None
    if url != '':
        login_page = session.get(url, headers=headers).text
        uid_match = re.search(r'"uniqueid":"(.*)",', login_page)

    if not (uid_match and uid_match.group(1)):
        other.error('本次账号{}登陆失败'.format(name))
        return None

    # Probe an arbitrary page to see whether the account can really browse.
    # This depends on external conditions, but no better way was found: a
    # broken account may still open its own home page, so checking the home
    # page would not be reliable.
    check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
    probe = session.get(check_url, headers=headers)
    # empirically, accounts that never passed phone verification cannot be
    # rescued once they hit a 403
    if is_403(probe.text):
        other.error('账号{}已被冻结'.format(name))
        crawler.warning('账号{}已经被冻结'.format(name))
        freeze_account(name, 0)
        return None

    other.info('本次登陆账号为:{}'.format(name))
    Cookies.store_cookies(name, session.cookies.get_dict())
    return session
def test_freeze_account(self):
    """Freezing an account should flip its enable flag to 0 in the db."""
    from db import login_info
    login_info.freeze_account('18708103033')
    records = login_info.get_login_info()
    for record in records:
        if record[0] == '18708103033':
            self.assertEqual(record.enable, 0)
def test_freeze_account(self):
    """
    Verify that after an account is banned the pool is still searched for
    usable accounts. Replace the phone number below with one from your own
    database before running this test.
    """
    from db import login_info
    login_info.freeze_account('18708103033')
    for entry in login_info.get_login_info():
        if entry[0] == '18708103033':
            self.assertEqual(entry.enable, 0)
def get_redirect(name, data, post_url, session):
    """Submit the login form and decide what the caller should do next.

    Returns the redirect url on success, 'pinerror' for a wrong captcha,
    'login_need_pincode' when the server demands a captcha, '' otherwise.
    """
    response = session.post(post_url, data=data, headers=headers)
    body = response.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in body:
        crawler.error('invalid password for {}, please ensure your account and password'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in body:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in body:
        crawler.warning('account {} need verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in body or 'Signing in' in body:
        return re.findall(r'location\.replace\([\'"](.*?)[\'"]\)', body)[0]
    return ''
def get_session(name, password):
    """Log the account in and return a validated session, or None."""
    url, yundama_obj, cid, session = do_login(name, password)

    uid_match = None
    if url != '':
        page_text = session.get(url, headers=headers).text
        uid_match = re.search(r'"uniqueid":"(.*)",', page_text)

    if not (uid_match and uid_match.group(1)):
        other.error('本次账号{}登陆失败'.format(name))
        return None

    # visit weibo's official account page to verify the login is usable
    probe = session.get('http://weibo.com/2671109275/about', headers=headers)
    # empirically, accounts without phone verification cannot be rescued
    if is_403(probe.text):
        other.error('账号{}已被冻结'.format(name))
        freeze_account(name, 0)
        return None

    other.info('本次登陆账号为:{}'.format(name))
    Cookies.store_cookies(name, session.cookies.get_dict())
    return session
def get_session(name, password):
    """Return a logged-in, cookie-persisted session for the account, or None."""
    url, yundama_obj, cid, session = do_login(name, password)

    logged_in = False
    if url != '':
        body = session.get(url, headers=headers).text
        found = re.search(r'"uniqueid":"(.*)",', body)
        logged_in = bool(found and found.group(1))

    if logged_in:
        # check if account is valid
        resp = session.get('http://weibo.com/2671109275/about', headers=headers)
        if is_403(resp.text):
            other.error('account {} has been forbidden'.format(name))
            freeze_account(name, 0)
            return None
        other.info('Login successful! The login account is {}'.format(name))
        Cookies.store_cookies(name, session.cookies.get_dict())
        return session

    other.error('login failed for {}'.format(name))
    return None
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's ajax url, the value is False, else True
    :param need_login: if the url is need to login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        # bugfix: keep name_cookies defined even when need_login is False so
        # the user_verify branch below cannot raise UnboundLocalError
        name_cookies = None
        if need_login:
            name_cookies = Cookies.fetch_cookies()
            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1],
                                    timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    # bugfix: count the retry; otherwise a pool of banned
                    # accounts spins this loop forever
                    count += 1
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                # bugfix: an empty body must also consume a retry
                count += 1
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    count += 1
                    # bugfix: only touch account state when we actually
                    # logged in; name_cookies is None otherwise
                    if need_login:
                        crawler.warning('account {} has been banned'.format(name_cookies[0]))
                        freeze_account(name_cookies[0], 0)
                        Cookies.delete_cookies(name_cookies[0])
                    continue

                if 'verifybmobile' in resp.url:
                    # bugfix: count the retry here as well to guarantee
                    # loop termination
                    count += 1
                    if need_login:
                        crawler.warning('account {} has been locked,you should use your phone to unlock it'.
                                        format(name_cookies[0]))
                        freeze_account(name_cookies[0], -1)
                        Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('excepitons happens when crawling {},specific infos are {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # success path: record the url as crawled and return the body
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {},check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """Send a direct message to *target_uid* through the weibo webim API.

    :param target_uid: uid of the user who should receive the message
    :param adver_message: text of the message to send
    :param user_verify: kept for interface compatibility; unused here
    :param need_login: whether the request must be made with login cookies
    :return: None when the message was sent (or nothing needed doing),
        '' after max_retries failed attempts
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        name_cookies = None
        if need_login:
            name_cookies = Cookies.fetch_cookies()
            print(name_cookies)

            # bugfix: the None check must run BEFORE name_cookies is indexed,
            # otherwise an empty pool raises TypeError instead of alerting
            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # skip accounts that already hit their advert quota; count the
            # attempt so the loop is guaranteed to terminate (bugfix: the
            # original `continue` without incrementing count could spin forever)
            if int(name_cookies[3]) >= int(adver_timers):
                count += 1
                continue

        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source=' + str(name_cookies[2]),
                                     data={'text': adver_message, 'uid': str(target_uid)},
                                     cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    # bugfix: count the retry to avoid an infinite loop
                    count += 1
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('excepitons happens when send_personal_message {},specific infos are {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)
        else:
            return None

    crawler.warning('max tries for {},check the target_uid in redis db2'.format(target_uid))
    return ''
def get_session(name, password):
    """Perform the full weibo.com sso login and return a usable session.

    Builds the encrypted form data from the pre-login server handshake,
    posts it, validates the account by opening an unrelated profile page,
    and stores the session cookies on success. Returns None on any failure.
    """
    session = requests.Session()
    su = get_encodename(name)
    sever_data = get_server_data(su, session)
    servertime = sever_data["servertime"]
    nonce = sever_data['nonce']
    rsakv = sever_data["rsakv"]
    pubkey = sever_data["pubkey"]
    sp = get_password(password, servertime, nonce, pubkey)

    # the posted fields below were captured from packet sniffing
    data = {
        'encoding': 'UTF-8',
        'entry': 'weibo',
        'from': '',
        'gateway': '1',
        'nonce': nonce,
        'pagerefer': "",
        'prelt': 67,
        'pwencode': 'rsa2',
        "returntype": "META",
        'rsakv': rsakv,
        'savestate': '7',
        'servertime': servertime,
        'service': 'miniblog',
        'sp': sp,
        'sr': '1920*1080',
        'su': su,
        'useticket': '1',
        'vsnf': '1',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }
    post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    url = get_redirect(data, post_url, session)

    uid_match = None
    if url != '':
        login_page = session.get(url, headers=headers).text
        uid_match = re.search(r'"uniqueid":"(.*)",', login_page)

    if not (uid_match and uid_match.group(1)):
        other.error('本次账号{}登陆失败'.format(name))
        return None

    # Probe an arbitrary profile page: an account may be broken yet still
    # able to open its own home page, so the home page is not a reliable
    # health check even though this approach depends on external conditions.
    check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
    resp = session.get(check_url, headers=headers)
    if is_403(resp.text):
        other.error('账号{}已被冻结'.format(name))
        crawler.warning('账号{}已经被冻结'.format(name))
        freeze_account(name)
        return None

    other.info('本次登陆账号为:{}'.format(name))
    Cookies.store_cookies(name, session.cookies.get_dict())
    return session
def get_page(url, user_verify=True, need_login=True):
    """Fetch *url* with retries, rotating the login cookies between attempts.

    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (weibo or
        user-info pages); False for ajax urls such as repost requests
    :param need_login: whether fetching requires login cookies; turning it
        off relieves pressure on the accounts
    :return: the response body, or '' on 404/403 or any other failure
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # rotate cookies on every retry, different from the last attempt
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号')
                rs = get_login_info()
                if len(rs) == 0:
                    crawler.error('账号均不可用,请检查账号健康状况')
                    # kill every celery-related process to stop the crawler
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If usable accounts remain, log in again. This is a local
                    # call, which is debatable: if the login queue lives on a
                    # different node the call is ineffective, while a network
                    # call raises the question of logins from unusual
                    # locations. Captchas during node login are not handled
                    # yet; with many accounts, keep the login_queue node in
                    # the accounts' usual login region.
                    # TODO decide between a network call and calling
                    # login.get_session() directly; this is probably not ideal
                    crawler.info('重新获取cookie中...')
                    login.excute_login_task()
                    time.sleep(10)

            # NOTE(review): if the pool stays empty, name_cookies and
            # latest_name_cookies are both None and this `continue` never
            # increments count — potential busy loop; confirm intended
            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                # this marker in the page means the session is not logged in
                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('账号{}出现异常'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                # frozen-account indicators: unfreeze/accessdeny redirects or 403 body
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(
                        page):
                    crawler.warning('账号{}已经被冻结'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url为{url}的连接不存在'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # success: mark the url as crawled and return the body
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """Fetch *url* with retries; rotate cookies only when several are pooled.

    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (weibo or
        user-info pages); False for ajax urls such as repost requests
    :param need_login: whether fetching requires login cookies; turning it
        off relieves pressure on the accounts
    :return: the response body, or '' on 404/403 or any other failure
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # rotate cookies on every retry; when only one account exists,
            # reusing the same cookie is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号')
                rs = get_login_info()
                # pick a healthy account to log in with; if none are usable,
                # stop the celery workers entirely
                if len(rs) == 0:
                    crawler.error('账号均不可用,请检查账号健康状况')
                    # kill every celery-related process
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('重新获取cookie中...')
                    login.excute_login_task()
                    time.sleep(10)

            # only force a different cookie per retry when more than one
            # cookie exists in the pool
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                # this marker in the page means the session is not logged in
                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('账号{}出现异常'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                # frozen-account indicators: unfreeze/accessdeny redirects or 403 body
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('账号{}已经被冻结'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url为{url}的连接不存在'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # success: mark the url as crawled and return the body
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True):
    """Fetch *url* with retries, always using login cookies from the pool.

    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (weibo or
        user-info pages); False for ajax urls such as repost requests
    :return: the response body, or '' on 404/403 or any other failure
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # rotate cookies on every retry, different from the last attempt
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('cookie池中不存在cookie,请检查账号和登录任务是否正常。采集程序退出。')
            # hard exit: no accounts means the crawler cannot proceed at all
            os._exit(0)

        # NOTE(review): with a single account in the pool this `continue`
        # never increments count — potential busy loop; confirm intended
        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                # frozen-account indicators: unfreeze redirect or 403 body
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('账号{}已经被冻结'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url为{url}的连接不存在'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # success: mark the url as crawled and return the body
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """Fetch *url* with retries, handling banned, frozen and locked accounts.

    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (weibo or
        user-info pages); False for ajax urls such as repost requests
    :param need_login: whether fetching requires login cookies; turning it
        off relieves pressure on the accounts
    :return: the response body, or '' on 404/403 or any other failure
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('cookie池中不存在cookie,请检查账号是否正常')
                other.warning('正在关闭爬虫程序...')
                # stop all celery workers: the crawler cannot run without cookies
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out,
                                    verify=False)

                # this marker in the page means the session is not logged in
                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('账号{}出现异常'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    # NOTE(review): no count increment here — this retry is
                    # "free"; confirm the loop cannot spin on banned accounts
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                # frozen-account indicators: redirects or a 403 page body
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(
                        page):
                    crawler.warning('账号{}已经被冻结'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                # locked account: needs phone verification to unlock (-1 flag)
                if 'verifybmobile' in resp.url:
                    crawler.warning('账号{}功能被锁定,需要手机解锁'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('url为{url}的连接不存在'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # success: mark the url as crawled and return the body
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''