def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    # crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        # crawler.info(search_page)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                # continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please check your account and password'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs a verification code for login'.format(name))
        return 'login_need_pincode'

    # check both the Chinese and English "signing in" markers
    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    set_seed_home_crawled(uid)
def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time() * 1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)

        comment_datas = comment.get_comment_list(html, mid)
        if not comment_datas and cur_page == 1:
            crawler.warning('failed to crawl comments of weibo {}, please check the reason'.format(mid))
            return

        save_comments(comment_datas)
        # each request here depends on the result of the previous one, so a network call (celery task)
        # is not suitable; the next url is extracted from the current page instead
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('comments of weibo {} have been crawled completely'.format(mid))
            return
        cur_page += 1
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo found for keyword {}, the page source is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # check whether the weibo already exists in the database; since results are sorted by time,
        # an existing record means the rest of this keyword has already been crawled, so stop here
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('new weibo for keyword {} have all been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # a network call is used here instead of a local call; the trade-off between the two is still open
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} is finished'.format(keyword))
            return
def get_session(name, password, need_verify):
    url, yundama_obj, cid, session = do_login(name, password, need_verify)

    # handle captcha-recognition failures: report the error and retry the login
    while url == 'pinerror' and yundama_obj is not None:
        yundama_obj.report_error(cid)
        url, yundama_obj, cid, session = do_login(name, password, need_verify)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # verify the login by requesting an arbitrary page; this depends on external conditions,
            # but no better way has been found (a problematic account can still open its own homepage,
            # so checking the account's own homepage is not a reliable test)
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)

            # experience shows that accounts which never passed phone verification cannot be recovered
            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                crawler.warning('account {} has been frozen'.format(name))
                freeze_account(name, 0)
                return None
            other.info('logged in with account {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('login failed for account {}'.format(name))
    return None
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('failed to crawl home weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
def get_redirect(name, data, post_url, session): """ :param name: 登录用户名 :param data: 需要提交的数据,可以通过抓包来确定部分不变的 :param post_url: post地址 :param session: :return: 服务器返回的下一次需要请求的url,如果打码出错,返回特定字符串好做特殊处理 """ logining_page = session.post(post_url, data=data, headers=headers) login_loop = logining_page.content.decode("GBK") # 如果是账号密码不正确,那么就将该字段置为2 if 'retcode=101' in login_loop: crawler.error('账号{}的密码不正确'.format(name)) freeze_account(name, 2) return '' if 'retcode=2070' in login_loop: crawler.error('输入的验证码不正确') return 'pinerror' if 'retcode=4049' in login_loop: crawler.warning('账号{}登录需要验证码'.format(name)) return 'login_need_pincode' if '正在登录' or 'Signing in' in login_loop: pa = r'location\.replace\([\'"](.*?)[\'"]\)' return re.findall(pa, login_loop)[0] else: return ''
def check_cookies_timeout(cls, cookies):
    if cookies is None:
        return True
    if isinstance(cookies, bytes):
        cookies = cookies.decode('utf-8')
    cookies = json.loads(cookies)
    login_time = datetime.datetime.fromtimestamp(cookies['loginTime'])
    if datetime.datetime.now() - login_time > datetime.timedelta(hours=cookie_expire_time):
        crawler.warning('the account cookies have expired')
        return True
    return False
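A minimal sketch of the cookie record shape that check_cookies_timeout above assumes: a JSON string carrying a loginTime field as a Unix timestamp in seconds. The helper names and the 24-hour expiry value here are illustrative assumptions, not taken from the project configuration.

import datetime
import json
import time

cookie_expire_time = 24  # assumed expiry window in hours; the real value comes from the project config


def build_cookie_record(session_cookies):
    # store the login moment next to the cookie dict so the expiry check above can work later
    record = dict(session_cookies)
    record['loginTime'] = int(time.time())
    return json.dumps(record)


def is_expired(raw_record):
    record = json.loads(raw_record)
    login_time = datetime.datetime.fromtimestamp(record['loginTime'])
    return datetime.datetime.now() - login_time > datetime.timedelta(hours=cookie_expire_time)

# usage: is_expired(build_cookie_record({'SUB': 'xxx'})) evaluates to False right after login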
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = home_url.format(uid, cur_page) html = get_page(url) weibo_datas = get_wbdata_fromweb(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return insert_weibo_datas(weibo_datas) domain = public.get_userdomain(html) page_id = public.get_pageid(html) cur_time = int(time.time() * 1000) ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time) ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1)) if total_page < limit: limit = total_page cur_page += 1 app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler', routing_key='ajax_home_info') set_seed_home_crawled(uid)
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo found for keyword {}, the page source is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # check whether the weibo already exists in the database; since results are sorted by time,
        # an existing record means the rest of this keyword has already been crawled, so stop here
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('new weibo for keyword {} have all been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # a network call is used here instead of a local call; the trade-off between the two is still open
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} is finished'.format(keyword))
            return
def get_redirect(name, data, post_url, session): logining_page = session.post(post_url, data=data, headers=headers) login_loop = logining_page.content.decode("GBK") # if name or password is wrong, set the value to 2 if 'retcode=101' in login_loop: crawler.error('invalid password for {}, please ensure your account and password'.format(name)) freeze_account(name, 2) return '' if 'retcode=2070' in login_loop: crawler.error('invalid verification code') return 'pinerror' if 'retcode=4049' in login_loop: crawler.warning('account {} need verification for login'.format(name)) return 'login_need_pincode' if '正在登录' in login_loop or 'Signing in' in login_loop: pa = r'location\.replace\([\'"](.*?)[\'"]\)' return re.findall(pa, login_loop)[0] else: return ''
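For reference, a hedged sketch of how a caller might branch on the sentinel values returned by get_redirect above ('' for a wrong password, 'pinerror' for a failed captcha, 'login_need_pincode' when a captcha is required, otherwise the redirect url). The wrapper name and handle_captcha are hypothetical placeholders, not functions from this project; get_redirect and headers are the objects defined elsewhere in this file.

def follow_login_redirect(name, data, post_url, session):
    # hypothetical wrapper around get_redirect, shown only to illustrate its return protocol
    result = get_redirect(name, data, post_url, session)
    if result == '':
        return None  # wrong account or password; the account has already been frozen
    if result in ('pinerror', 'login_need_pincode'):
        return handle_captcha(name)  # assumed captcha flow, e.g. via a coding service
    return session.get(result, headers=headers)  # follow the redirect url returned by the server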
def get_page(url, need_login=True):
    """
    :param url: url to be crawled
    :param need_login: True if the url needs login, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happens when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: True if the page may show a verification code (the 403 on search pages is not parsed yet);
                        False when fetching the ajax links of reposts
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('crawling stopped at {curtime}'.format(
                    curtime=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)

            if is_404(page):
                crawler.warning('url {url} does not exist'.format(url=url))
                return ''

            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out while crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection, sleeping for one minute; the exception is {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: False if it's an ajax url, else True
    :param need_login: True if the url needs login, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, you should use your phone to unlock it'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happens when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    :param target_uid: uid of the user who will receive the direct message
    :param adver_message: text of the message to send
    :param user_verify: False if it's an ajax url, else True
    :param need_login: True if the request needs login, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()
            print(name_cookies)

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # check adver_timers: skip cookies that have already sent enough messages
            if int(name_cookies[3]) >= int(adver_timers):
                continue

        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source=' + str(name_cookies[2]),
                                     data={'text': adver_message, 'uid': str(target_uid)},
                                     cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None

            # if "$CONFIG['islogin'] = '******'" in resp.text:
            #     crawler.warning('account {} has been banned'.format(name_cookies[0]))
            #     freeze_account(name_cookies[0], 0)
            #     Cookies.delete_cookies(name_cookies[0])
            #     continue
            # else:
            #     resp = requests.get(url, headers=headers, timeout=time_out, verify=False)
            #
            # page = resp.text
            # if page:
            #     page = page.encode('utf-8', 'ignore').decode('utf-8')
            # else:
            #     continue
            #
            # # slow down to avoid being banned
            # time.sleep(interal)
            #
            # if user_verify:
            #     if is_banned(resp.url) or is_403(page):
            #         crawler.warning('account {} has been banned'.format(name_cookies[0]))
            #         freeze_account(name_cookies[0], 0)
            #         Cookies.delete_cookies(name_cookies[0])
            #         count += 1
            #         continue
            #
            #     if 'verifybmobile' in resp.url:
            #         crawler.warning('account {} has been locked, you should use your phone to unlock it'.
            #                         format(name_cookies[0]))
            #         freeze_account(name_cookies[0], -1)
            #         Cookies.delete_cookies(name_cookies[0])
            #         continue
            #
            #     if not is_complete(page):
            #         count += 1
            #         continue
            #
            #     if is_404(page):
            #         crawler.warning('send_personal_message{uid} seems to be 404'.format(uid=str(target_uid)))
            #         return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception happens when send_personal_message {}, specific info: {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # Urls.store_crawl_url(url, 1)
            # return page
            return None

    crawler.warning('max tries for {}, check the target_uid in redis db2'.format(target_uid))
    # Urls.store_crawl_url(url, 0)
    return ''
def send_jd_seckill_task(jd_user_string, address_string, task_id, skuId, netproxy):
    """
    Submit a JD flash-sale (seckill) order for the given user, address and sku.
    """
    s = requests.session()
    s.timeout = session_timeout
    s.proxies = netproxy

    jd_user_json = json.loads(jd_user_string)
    address_json = json.loads(address_string)

    cookies_encode = jd_user_json['cookies'].encode()
    cookies_decode = base64.b64decode(cookies_encode).decode()
    # cookies_dict = json.loads(cookies_decode)
    rawdata = '__jdv=122270672|direct|-|none|-|1504798597931; o2-webp=true; TrackID=1d8yuf-8hCib8xjpwDjMwOLGCD0gmGtLEjJFNZQwBIvwskJdwUNnq1kiTmBcsfXw2nATZkxctFmE3r1fN0yVk9egAz0M5KDHytNxuRLuHtOk; pinId=7iwdYGSz99W1ffsfn98I-w; pin=xtuyaowu; thor=C3888A1807C299F45E21294E559BB739649F3F90C26DB309D58688491645C60E7745B49FBD8CD722E210B31A2EE861DAF9C0782F8A06AAF23606C377C1953E40B92BA29EED15FF5F57F2A0165047E0C44F71D5CA5FF000281EC43042F0403E24E8A7B703856EC818D09300F82CB14986EF55754C61CA47D6A3F1A6ADE7E1FE0B99D7576D0BD2721B0E8F279EE5980A2B; _tp=gs6zPQLXL133eDDGdm%2Bv%2Fg%3D%3D; _pst=xtuyaowu; ceshi3.com=000; __jda=122270672.15047985979311779686273.1504798598.1504798598.1504798598.1; __jdb=122270672.3.15047985979311779686273|1.1504798598; __jdc=122270672; __jdu=15047985979311779686273'

    cookie = SimpleCookie()
    cookie.load(cookies_decode)
    # Even though SimpleCookie is dictionary-like, it internally uses a Morsel object
    # which is incompatible with requests. Manually construct a dictionary instead.
    cookies = {}
    for key, morsel in cookie.items():
        cookies[key] = morsel.value

    crawler.info('the send_jd_seckill_task jd_user is {uid}'.format(uid=str(jd_user_string)))
    celery_stask_status = 7
    try:
        # first request: fetch the usual address list for this sku
        resp = s.get('https://marathon.jd.com/async/getUsualAddressList.action?skuId=' + str(skuId),
                     headers=headers, cookies=cookies, timeout=time_out, verify=False)
        # the response is a json list of addresses, e.g.
        # [{
        #     "name": "冷月",
        #     "id": 138356479,
        #     "addressDetail": "广州外国语学校-凤凰大道 丰巢快递柜",
        #     "provinceId": 19,
        #     "cityId": 1601,
        #     "countyId": 50259,
        #     "townId": 51886,
        #     "mobile": "",
        #     "provinceName": "广东",
        #     "cityName": "广州市",
        #     "countyName": "南沙区",
        #     "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3",
        #     "email": "",
        #     "townName": "城区",
        #     "mobileWithXing": "131****5409"
        # }, {
        #     "name": "冷月",
        #     "id": 138359040,
        #     "addressDetail": "中信香樟墅1街12号",
        #     "provinceId": 19,
        #     "cityId": 1601,
        #     "countyId": 50284,
        #     "townId": 50451,
        #     "mobile": "",
        #     "provinceName": "广东",
        #     "cityName": "广州市",
        #     "countyName": "增城区",
        #     "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3",
        #     "email": "",
        #     "townName": "中新镇",
        #     "mobileWithXing": "131****5409"
        # }]

        # todo validate the response of the first request
        if not resp.text:
            save_task_monitor(task_id, celery_stask_status, "do not contain address")
            return None
        if '登录' in resp.text:
            save_task_monitor(task_id, celery_stask_status, "invalid cookies")
            return None

        address_list = json.loads(resp.text)
        if len(address_list) > 0:
            address_dict = address_list[0]

        if 'addressDetail' not in address_dict:
            crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
            save_task_monitor(task_id, celery_stask_status, resp.text)
            return None

        # todo seckill submission; the parameters still need to be confirmed
        resp = s.post('https://marathon.jd.com/seckill/submitOrder.action?skuId=' + str(skuId) + '&vid= HTTP/1.1',
                      data={'orderParam.name': address_dict['name'],
                            'orderParam.addressDetail': address_dict['addressDetail'],
                            'orderParam.mobile': address_dict['mobileWithXing'],
                            'orderParam.email': address_dict['email'],
                            'orderParam.provinceId': address_dict['provinceId'],
                            'orderParam.cityId': address_dict['cityId'],
                            'orderParam.countyId': address_dict['countyId'],
                            'orderParam.townId': address_dict['townId'],
                            'orderParam.paymentType': 4,
                            'orderParam.password': '',
                            'orderParam.invoiceTitle': 4,
                            'orderParam.invoiceContent': 1,
                            'orderParam.invoiceCompanyName': '',
                            'orderParam.invoiceTaxpayerNO': '',
                            'orderParam.usualAddressId': address_dict['id'],
                            'skuId': skuId,
                            'num': 1,
                            'orderParam.provinceName': address_dict['provinceName'],
                            'orderParam.cityName': address_dict['cityName'],
                            'orderParam.countyName': address_dict['countyName'],
                            'orderParam.townName': address_dict['townName'],
                            'orderParam.codTimeType': 3,
                            'orderParam.mobileKey': address_dict['mobileKey'],
                            'eid': jd_user_json['eid'],
                            'fp': jd_user_json['fp']},
                      cookies=cookies, headers=personal_message_headers)

        # validate the seckill response
        if "//marathon.jd.com/koFail.html?reason=" in resp.text:
            crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
        else:
            celery_stask_status = 8
    except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
        print(e)
        crawler.warning('exception happens when task_id {}, specific info: {}'.format(task_id, e))
        time.sleep(excp_interal)
        dbc = class_MongoDB.MongoClient(uri, class_logger.getLogger('MongoDB_Users'), 'JD')
        dbc.setUnique('Users', 'username')
        dbc.update('Users', {'username': jd_user_json['username']}, {'status': 2})

    save_task_monitor(task_id, celery_stask_status, resp.text)
    return ''
def crawl_weibo_pics(uid):
    limit = get_max_home_page()
    cur_page = 1
    # user-defined maximum number of pages to crawl
    max_page = 20
    # end

    url = home_url.format(uid, cur_page)
    html = get_page(url)
    domain = public.get_userdomain(html)

    # only crawl photos of personal weibo users; skip non-personal accounts (government, organizations, etc.)
    if domain not in ['103505', '100306', '100505', '']:
        set_seed_home_crawled(uid, 2)
        return
    # end

    domain_uid = domain + uid
    page_domain = 'page_' + domain
    url = pic_url.format(domain_uid, page_domain)
    html = get_page(url)
    weibo_pics, next_ajax_url = get_wbdata_fromweb(html)

    if weibo_pics is None or next_ajax_url is None:
        crawler.warning('failed to crawl the photo album of user {}, please check the reason'.format(uid))
        set_seed_home_crawled(uid, 3)
        return

    if not weibo_pics:
        crawler.warning('no photos crawled for user {}, maybe this user never posted weibo with pictures'.format(uid))
        set_seed_home_crawled(uid, 5)
        return

    insert_weibo_pics(weibo_pics)

    if not next_ajax_url:
        crawler.warning('the photo album of user {} has been crawled completely'.format(uid))
        set_seed_home_crawled(uid, 4)
        return

    cur_page += 1

    while cur_page <= limit:
        # some accounts have two or three thousand photos; crawling all of them wastes time, so crawling is simply
        # limited by the current page count. About 10 pages turned out to be a reasonable number.
        if cur_page > max_page:
            break
        # end

        cur_time = int(time.time() * 1000)
        ajax_call = 1
        page_id = domain_uid
        url = ajax_url.format(page_id, cur_page, ajax_call, cur_time) + '&' + next_ajax_url
        html = get_page(url, user_verify=False)
        weibo_pics, next_ajax_url = get_pic_data_byajax(html)

        if weibo_pics is None or next_ajax_url is None:
            crawler.warning('failed to crawl the photo album of user {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        if not weibo_pics:
            crawler.warning('failed to crawl the photo album of user {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        insert_weibo_pics(weibo_pics)

        if not next_ajax_url:
            crawler.warning('the photo album of user {} has been crawled completely'.format(uid))
            set_seed_home_crawled(uid, 4)
            return

        cur_page += 1

    # after finishing the configured maximum number of pages, exit actively and set the flag to 4
    set_seed_home_crawled(uid, 4)
    return
def exception_uid_handler(uid, err_code, proxy={}, html=''):
    crawler.warning('error while crawling the photo album of user {}, the received content is {}, status code {}'.format(uid, html, err_code))
    # set_seed_home_crawled(uid, 3)
    if proxy:
        proxy_handler(proxy, -1)
def finish_uid_handler(uid, proxy):
    crawler.warning('the photo album of user {} has been crawled completely'.format(uid))
    set_seed_home_crawled(uid, 4)
    proxy_handler(proxy, 1)
def get_session(name, password):
    session = requests.Session()
    su = get_encodename(name)

    sever_data = get_server_data(su, session)
    servertime = sever_data["servertime"]
    nonce = sever_data['nonce']
    rsakv = sever_data["rsakv"]
    pubkey = sever_data["pubkey"]

    sp = get_password(password, servertime, nonce, pubkey)

    # the submitted data can be determined by capturing the login request
    data = {
        'encoding': 'UTF-8',
        'entry': 'weibo',
        'from': '',
        'gateway': '1',
        'nonce': nonce,
        'pagerefer': "",
        'prelt': 67,
        'pwencode': 'rsa2',
        "returntype": "META",
        'rsakv': rsakv,
        'savestate': '7',
        'servertime': servertime,
        'service': 'miniblog',
        'sp': sp,
        'sr': '1920*1080',
        'su': su,
        'useticket': '1',
        'vsnf': '1',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }
    post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    url = get_redirect(data, post_url, session)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # verify the login by requesting an arbitrary page; this depends on external conditions,
            # but no better way has been found (a problematic account can still open its own homepage,
            # so checking the account's own homepage is not a reliable test)
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)
            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                crawler.warning('account {} has been frozen'.format(name))
                freeze_account(name)
                return None
            other.info('logged in with account {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('login failed for account {}'.format(name))
    return None
def crawl_weibo(uid):
    debug_mode = 1
    limit = get_max_home_page()
    cur_page = 1
    pic_count = 0
    max_pic_count = 150
    max_retry_cnt = 2
    cur_retry_cnt = 0
    direct_get_sleep_time = 30
    containerid = '230413' + uid
    luicode = '10000011'
    lfid = '230283' + uid
    featurecode = '20000180'
    value = uid
    page_type = '03'
    page = cur_page

    # if there is no proxy in the db, assume we are temporarily without proxies and must connect directly,
    # so the sleep time should be stretched accordingly
    proxy = get_a_random_proxy()
    if proxy == {}:
        direct_get_sleep_time = 60
    elif random_event_occur():
        proxy = {}
    print(proxy)
    # end

    if debug_mode == 1:
        direct_get_sleep_time = 1

    # test for getting an empty proxy
    if proxy == {}:
        # crawler.warning('empty proxy!')
        # time.sleep(3)
        # proxy = get_a_random_proxy()
        # proxy_cnt = count_proxy()
        # crawler.warning('new proxy:{}, proxy count:{}'.format(proxy, proxy_cnt))
        # return
        time.sleep(randint(0, direct_get_sleep_time))
    # end

    url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
    html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

    # an empty html may have other causes, but a proxy problem is the most likely one, so the proxy loses points.
    # if the retry still returns an empty html, record the uid as abnormal and return directly;
    # if a non-empty but invalid html is returned, points are deducted later in the flow
    if html == '':
        if cur_retry_cnt < max_retry_cnt:
            cur_retry_cnt = cur_retry_cnt + 1
            proxy_handler(proxy, -1)
            proxy = get_a_random_proxy()
            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))
            html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
            if html == '':
                proxy_handler(proxy, -1)
                return
        else:
            proxy_handler(proxy, -1)
            return
    # end

    weibo_pics = get_weibo_list(html)
    if weibo_pics == '':
        crawler.warning('requests are too frequent')
        if proxy == {}:
            time.sleep(randint(0, direct_get_sleep_time))
        proxy_handler(proxy, -1)
        return
    if weibo_pics is None:
        proxy_handler(proxy, -1)
        return
    elif weibo_pics == False:
        finish_uid_handler(uid, proxy)
        return
    elif weibo_pics:
        insert_weibo_pics(weibo_pics)
        pic_count = pic_count + len(weibo_pics)

    cur_page += 1

    while cur_page <= limit and pic_count < max_pic_count:
        page = cur_page
        url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
        html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

        # an empty html may have other causes, but a proxy problem is the most likely one, so the proxy loses points
        if html == '':
            if cur_retry_cnt < max_retry_cnt:
                cur_retry_cnt = cur_retry_cnt + 1
                proxy_handler(proxy, -1)
                proxy = get_a_random_proxy()
                if proxy == {}:
                    time.sleep(randint(0, direct_get_sleep_time))
                html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
                if html == '':
                    exception_uid_handler(uid, 6, proxy)
                    return
            else:
                exception_uid_handler(uid, 3, proxy)
                return
        # end

        weibo_pics = get_weibo_list(html)
        # if the page fetched through the current proxy is a blocked page, deduct the proxy's score and return
        if weibo_pics == '':
            crawler.warning('requests are too frequent')
            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))
            proxy_handler(proxy, -1)
            return
        if weibo_pics is None:
            exception_uid_handler(uid, 4, proxy, html)
            return
        elif weibo_pics == False:
            finish_uid_handler(uid, proxy)
            return
        elif weibo_pics:
            insert_weibo_pics(weibo_pics)
            pic_count = pic_count + len(weibo_pics)

        cur_page += 1

    finish_uid_handler(uid, proxy)
    return
def get_page(url, user_verify=True, need_login=True): """ :param url: 待出现 :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接 :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 latest_name_cookies = None while count < max_retries: if need_login: # 每次重试的时候都换cookies,并且和上次不同 name_cookies = Cookies.fetch_cookies() if name_cookies is None: crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号') rs = get_login_info() if len(rs) == 0: crawler.error('账号均不可用,请检查账号健康状况') # 杀死所有关于celery的进程 if 'win32' in sys.platform: os.popen('taskkill /F /IM "celery*"') else: os.popen('pkill -f "celery"') else: # 如果有可用账号,那么就拿来登录,这里用了本地调用,好像不是很合理,因为如果login queue # 不会在本机上,那么该调用就会无效但是用网络调用,如何保证不会出现在某些不常用登录地的节点 # 上还有待考量,亦或者找到一个更适合的方法可以突破异地登录的限制 # TODO 衡量是用网络调用还是直接调用 login.get_session()方法,这里应该不是很合理 # 目前暂时不考虑节点登录出现验证码的问题, 考虑到大规模账号登录的话,需要把login_queue的节点放在账号常用地 crawler.info('重新获取cookie中...') login.excute_login_task() time.sleep(10) if name_cookies == latest_name_cookies: continue latest_name_cookies = name_cookies try: if need_login: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) if "$CONFIG['islogin'] = '******'" in resp.text: crawler.warning('账号{}出现异常'.format(name_cookies[0])) freeze_account(name_cookies[0]) Cookies.delete_cookies(name_cookies[0]) continue else: resp = requests.get(url, headers=headers, timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403( page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0]) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''
def get_page(url, user_verify=True, need_login=True): """ :param url: 待抓取url :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码), 否为抓取转发的ajax连接 :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 while count < max_retries: if need_login: name_cookies = Cookies.fetch_cookies() if name_cookies is None: crawler.warning('cookie池中不存在cookie,请检查账号是否正常') other.warning('正在关闭爬虫程序...') if 'win32' in sys.platform: os.popen('taskkill /F /IM "celery*"') else: os.popen('pkill -f "celery"') try: if need_login: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) if "$CONFIG['islogin'] = '******'" in resp.text: crawler.warning('账号{}出现异常'.format(name_cookies[0])) freeze_account(name_cookies[0], 0) Cookies.delete_cookies(name_cookies[0]) continue else: resp = requests.get(url, headers=headers, timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403( page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0], 0) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if 'verifybmobile' in resp.url: crawler.warning('账号{}功能被锁定,需要手机解锁'.format(name_cookies[0])) freeze_account(name_cookies[0], -1) Cookies.delete_cookies(name_cookies[0]) continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''
def get_page(url, user_verify=True, need_login=True): """ :param url: 待抓取url :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接 :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 latest_name_cookies = None while count < max_retries: if need_login: # 每次重试的时候都换cookies,并且和上次不同,如果只有一个账号,那么就允许相同 name_cookies, cookies_count = Cookies.fetch_cookies() if name_cookies is None: crawler.warning('cookie池中不存在cookie,正在检查是否有可用账号') rs = get_login_info() # 选择状态正常的账号进行登录,账号都不可用就停掉celery worker if len(rs) == 0: crawler.error('账号均不可用,请检查账号健康状况') # 杀死所有关于celery的进程 if 'win32' in sys.platform: os.popen('taskkill /F /IM "celery*"') else: os.popen('pkill -f "celery"') else: crawler.info('重新获取cookie中...') login.excute_login_task() time.sleep(10) # 只有cookies总数大于1的时候才会在每次重试的时候切换不同cookie if cookies_count > 1 and name_cookies == latest_name_cookies: continue latest_name_cookies = name_cookies try: if need_login: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) if "$CONFIG['islogin'] = '******'" in resp.text: crawler.warning('账号{}出现异常'.format(name_cookies[0])) freeze_account(name_cookies[0], 0) Cookies.delete_cookies(name_cookies[0]) continue else: resp = requests.get(url, headers=headers, timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0], 0) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''
def get_page(url, user_verify=True): """ :param url: 待出现 :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接 :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串 """ crawler.info('本次抓取的url为{url}'.format(url=url)) count = 0 latest_name_cookies = None while count < max_retries: # 每次重试的时候都换cookies,并且和上次不同 name_cookies = Cookies.fetch_cookies() if name_cookies is None: crawler.error('cookie池中不存在cookie,请检查账号和登录任务是否正常。采集程序退出。') os._exit(0) if name_cookies == latest_name_cookies: continue latest_name_cookies = name_cookies try: resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False) page = resp.text if page: page = page.encode('utf-8', 'ignore').decode('utf-8') else: continue # 每次抓取过后程序sleep的时间,降低封号危险 time.sleep(interal) if user_verify: if 'unfreeze' in resp.url or is_403(page): crawler.warning('账号{}已经被冻结'.format(name_cookies[0])) freeze_account(name_cookies[0]) Cookies.delete_cookies(name_cookies[0]) count += 1 continue if not is_complete(page): count += 1 continue if is_404(page): crawler.warning('url为{url}的连接不存在'.format(url=url)) return '' except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e: crawler.warning('抓取{}出现异常,具体信息是{}'.format(url, e)) count += 1 time.sleep(excp_interal) else: Urls.store_crawl_url(url, 1) return page crawler.warning('抓取{}已达到最大重试次数,请在redis的失败队列中查看该url并检查原因'.format(url)) Urls.store_crawl_url(url, 0) return ''
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    # user-defined maximum number of pages to crawl
    max_page = 10
    # end

    while cur_page <= limit:
        # some accounts have two or three thousand photos; crawling everything wastes time, so crawling is simply
        # limited by the current page count. About 10 pages turned out to be a reasonable number: the desktop site
        # shows 45 weibo per page, so that is roughly 450 weibo per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)

        domain = public.get_userdomain(html)

        # only crawl photos of personal weibo users; skip non-personal accounts (government, organizations, etc.)
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('failed to crawl home weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # if weibo_pics is not empty, insert it into the database
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # set the flag after all pages have been traversed. Putting it here means every page has been visited,
    # not that every page was crawled successfully; this may be improved later, e.g. by setting it in a callback.
    set_seed_home_crawled(uid)