def crawl_praise_page(mid):
    # To get the result immediately, we call the helper locally here instead of dispatching a task
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    WbDataOper.set_weibo_praise_crawled(mid)

    if not ext_param:
        crawler.error(
            'fail to get praise page 2 ext_param, mid is {mid}'.format(mid=mid))
        return

    # Why we don't use app.send_task and fall back to sequential execution:
    # weibo praise pages now require a parameter called max_id, and a request
    # without it returns something different from what a normal browser gets.
    # should work after 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param (mainly max_id) is updated on each iteration and used for the next request
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return
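# A minimal sketch of the max_id-based helper that crawl_praise_page above chains on,
# assuming BASE_URL takes (mid, ext_param, timestamp) in that order and that
# praise.get_praise_list returns the parsed records plus the next ext_param; the URL
# argument order and parser signature are assumptions, not confirmed by this repo.
def crawl_praise_by_page_sketch(mid, ext_param):
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, ext_param, cur_time)  # hypothetical argument order
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, next_ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    # hand the next ext_param back so the caller can request the following page
    return html, praise_data, next_ext_param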
def get_redirect(name, data, post_url, session, proxy):
    logining_page = session.post(post_url, data=data, headers=headers, proxies=proxy)
    login_loop = logining_page.content.decode("GBK")

    # if the name or password is wrong, set the freeze value to 2
    if 'retcode=101' in login_loop:
        crawler.error(
            'invalid password for {}, please check your account and password'.format(name))
        LoginInfoOper.freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs a verification code to log in'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        matches = re.findall(pa, login_loop)
        # guard against a transitional page that lacks the redirect call
        return matches[0] if matches else ''
    else:
        return ''
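# A quick self-contained check of the redirect-extraction regex used above: weibo's
# transitional login page calls location.replace with the next URL, and the pattern
# captures that URL. The sample HTML below is illustrative, not a real response.
import re

sample = '<script>location.replace("https://example.com/ajaxlogin.php?retcode=0");</script>'
pa = r'location\.replace\([\'"](.*?)[\'"]\)'
matches = re.findall(pa, sample)
print(matches[0] if matches else '')  # https://example.com/ajaxlogin.php?retcode=0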
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Due to limits in weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans', args=(uid,),
                          queue='fans_followers', routing_key='for_fans_followers')
    # By adding '--soft-time-limit secs' when you start celery, the task is resent to the broker on timeout
    # e.g. celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos', args=(uid,),
                      queue='user_crawler', routing_key='for_user_info')
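# A minimal standalone sketch of how the soft time limit interacts with the task
# above, assuming a standard Celery app; the app name, broker URL, and task body
# here are illustrative, not the project's actual configuration.
from celery import Celery
from celery.exceptions import SoftTimeLimitExceeded

app = Celery('tasks.workers', broker='redis://localhost:6379/0')  # hypothetical broker URL

@app.task(soft_time_limit=10)  # equivalent to passing --soft-time-limit 10 on the CLI
def long_running_crawl():
    try:
        pass  # crawling work goes here
    except SoftTimeLimitExceeded:
        # Celery raises this inside the task when the soft limit expires,
        # which is what lets the except branches above re-enqueue the work
        long_running_crawl.apply_async(queue='user_crawler', routing_key='for_user_info')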
def time_limit(*args, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
            url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
        return ''
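# time_limit above is the inner wrapper of a decorator (note the free variable
# `func`); a plausible full shape, using print in place of the project's `crawler`
# logger so the sketch is self-contained, would be:
from functools import wraps
from traceback import format_tb

def catch_crawl_exceptions(func):  # hypothetical name for the enclosing decorator
    @wraps(func)
    def time_limit(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # args[0] is assumed to be the URL being crawled
            print('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
                url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
            return ''
    return time_limit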
def crawl_praise_by_page(mid, page_num):
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error("praise SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
            mid=mid, page_num=page_num))
        # retry and return that attempt's result; the original fell through here,
        # where praise_datas would be unbound
        return crawl_praise_by_page(mid, page_num)

    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas
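# A hypothetical driver for the page-numbered variant above: crawl the first few
# praise pages sequentially and stop early once a page comes back empty. The cap of
# 5 pages mirrors the comment in crawl_praise_page; the default is my assumption.
def crawl_praise_pages_sketch(mid, total_pages=5):
    for page_num in range(1, total_pages + 1):
        html, praise_datas = crawl_praise_by_page(mid, page_num)
        if not praise_datas:
            break  # no more likes to fetch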
def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error("comment SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
            mid=mid, page_num=page_num))
        # retry and return that attempt's result; the original fell through here,
        # where comment_datas would be unbound (and dropped the session argument)
        return crawl_comment_by_page(mid, page_num, session)

    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas
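# A hedged usage sketch for the session-taking variant above, assuming the project
# exposes a SQLAlchemy-style session factory; get_db_session and max_page are
# hypothetical names, not confirmed by this repo.
def crawl_all_comment_pages_sketch(mid, max_page):
    session = get_db_session()  # hypothetical factory returning a DB session
    try:
        for page_num in range(1, max_page + 1):
            html, comment_datas = crawl_comment_by_page(mid, page_num, session)
            if not comment_datas:
                break  # no more comments to fetch
    finally:
        session.close()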
def download_img_task(mid, urls):
    count = 0
    for img_url in urls:
        if IMG_TYPE == 'large':
            img_url = img_url.replace('thumbnail', 'large').replace('square', 'large')
        suffix = img_url[img_url.rfind('.') + 1:]
        # skip gif images, which are only used to show the loading animation
        if suffix != 'gif':
            count += 1
            try:
                image_response = requests.get(img_url, stream=True)
            except Exception as e:
                crawler.error('fail to download image {}, {} is raised'.format(img_url, e))
            else:
                with open(os.path.join(IMG_PATH, '{}-{}.{}'.format(mid, count, suffix)), 'wb') as out_file:
                    shutil.copyfileobj(image_response.raw, out_file)
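# The thumbnail-to-large rewrite above is pure string substitution; a standalone
# illustration with a made-up weibo-style image URL (not real data):
url = 'https://wx1.sinaimg.cn/thumbnail/abcdef.jpg'
large = url.replace('thumbnail', 'large').replace('square', 'large')
suffix = large[large.rfind('.') + 1:]
print(large)   # https://wx1.sinaimg.cn/large/abcdef.jpg
print(suffix)  # jpg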
def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error("comment SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
            mid=mid, page_num=page_num))
        app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num),
                      queue='comment_page_crawler', routing_key='comment_page_info')
        # bail out here: the resent task will crawl this page, and comment_datas
        # is unbound in this branch
        return '', []

    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas
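# For the send_task retry above to be picked up, a worker must consume from the
# comment_page_crawler queue. The declarations below sketch the binding implied by
# the queue/routing_key pair; the project's actual Celery config is an assumption.
from kombu import Queue

task_queues = (
    Queue('comment_page_crawler', routing_key='comment_page_info'),
)
task_routes = {
    'tasks.comment.crawl_comment_by_page': {
        'queue': 'comment_page_crawler',
        'routing_key': 'comment_page_info',
    },
}
# start a worker for this queue with:
#   celery -A tasks.workers -Q comment_page_crawler worker -l info -c 1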
def crawl_person_infos(uid):
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded uid={uid}".format(uid=uid))
        crawl_person_infos(uid)
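# The except branch above retries via unbounded recursion; a hedged sketch of the
# same idea with an explicit cap (the retry bound is my addition, not the project's
# behavior, and the helper names come from the function above):
def crawl_person_infos_bounded_sketch(uid, max_retries=3):
    for attempt in range(max_retries):
        try:
            user, is_crawled = get_profile(uid)
            if not is_crawled:
                crawl_follower_fans(uid)
            return
        except SoftTimeLimitExceeded:
            # loop again instead of recursing without a bound
            continue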