def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits in weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view enterprise users' followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid, domain),
                      queue='fans_followers', routing_key='for_fans_followers')

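# The is_404 helper called above is not shown in this section. A minimal
# sketch, assuming Weibo's error page can be recognized by a "404" marker in
# the returned html; the real implementation may check more carefully.
def is_404(html):
    """Return True when the fetched page looks like Weibo's 404 page."""
    return html is not None and '404' in html
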
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = home_url.format(uid, cur_page) html = get_page(url) weibo_datas = get_wbdata_fromweb(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return insert_weibo_datas(weibo_datas) domain = public.get_userdomain(html) cur_time = int(time.time()*1000) ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1)) if total_page < limit: limit = total_page cur_page += 1 app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler', routing_key='ajax_home_info') set_seed_home_crawled(uid)
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = HOME_URL.format(uid, cur_page) if cur_page == 1: html = get_page(url, auth_level=1) else: html = get_page(url, auth_level=2) weibo_datas = get_data(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return # Check whether weibo created after time in spider.yaml timeafter = time.mktime( time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S')) length_weibo_datas = len(weibo_datas) for i in range(0, len(weibo_datas)): weibo_time = time.mktime( time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M')) if weibo_time < timeafter: weibo_datas = weibo_datas[0:i] break WbDataOper.add_all(weibo_datas) # If the weibo isn't created after the given time, jump out the loop if i != length_weibo_datas - 1: break domain = public.get_userdomain(html) cur_time = int(time.time()*1000) ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2)) auth_level = 1 else: auth_level = 2 if total_page < limit: limit = total_page app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') cur_page += 1 SeedidsOper.set_seed_home_crawled(uid)
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the info;
    if the user is an enterprise or service account, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url, auth_level=1)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = BASE_URL.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
            samefollow_uid = get_samefollow_uid()
            if samefollow_uid.strip() != '':
                samefollow_uid = samefollow_uid.split(',')
                url = SAMEFOLLOW_URL.format(user_id)
                isFanHtml = get_page(url, auth_level=2)
                person.get_isFan(isFanHtml, samefollow_uid, user_id)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            UserOper.add_one(user)
            storage.info('Has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None

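# get_samefollow_uid above is not defined in this section. A hedged sketch of
# how it might read the comma-separated uid list out of spider.yaml; the file
# name and the 'samefollow_uid' key are assumptions based on the call site.
import yaml

def get_samefollow_uid(config_path='spider.yaml'):
    with open(config_path, encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}
    # Return '' when unset so that callers can safely use samefollow_uid.strip()
    return str(config.get('samefollow_uid', ''))
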
def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the info;
    if the user is an enterprise or service account, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None

def get_url_from_web(user_id):
    """
    Get user info according to user id. If the user's domain is 100505, the
    detailed profile is returned directly; if it is 103505 or 100306, one more
    request is needed, because base_url only lands on the user's home page
    rather than the detail page. Enterprise and service accounts are likewise
    redirected to their home page when visited through base_url, and since
    their detail pages are of little value, we don't request them.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise accounts by default
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        # Save the user info to the database
        save_user(user)
        storage.info('Successfully stored info of user {id}'.format(id=user_id))
        return user
    else:
        return None

def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If the user domain is 100505, the url is just 100505 + userid;
    if it is 103505 or 100306, we need one more request to get the info;
    if the user is an enterprise or service account, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        save_user(user)
        storage.info('Successfully stored info of user {id}'.format(id=user_id))
        return user
    else:
        return None

def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = home_url.format(uid, cur_page) html = get_page(url) weibo_datas = get_wbdata_fromweb(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return insert_weibo_datas(weibo_datas) domain = public.get_userdomain(html) page_id = public.get_pageid(html) cur_time = int(time.time() * 1000) ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time) ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1)) if total_page < limit: limit = total_page cur_page += 1 app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler', routing_key='ajax_home_info') set_seed_home_crawled(uid)
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = HOME_URL.format(uid, cur_page) if cur_page == 1: html = get_page(url, auth_level=1) else: html = get_page(url, auth_level=2) weibo_datas = get_data(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return WbDataOper.add_all(weibo_datas) domain = public.get_userdomain(html) cur_time = int(time.time()*1000) ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2)) auth_level = 1 else: auth_level = 2 if total_page < limit: limit = total_page app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler', routing_key='ajax_home_info') cur_page += 1 SeedidsOper.set_seed_home_crawled(uid)
def crawl_weibo_pics(uid):
    limit = get_max_home_page()
    cur_page = 1
    # Custom cap on the number of pages to crawl
    max_page = 20
    # end

    url = home_url.format(uid, cur_page)
    html = get_page(url)
    domain = public.get_userdomain(html)

    # Only crawl photos of personal weibo accounts; non-personal accounts
    # (government, organizations, etc.) are not crawled.
    if domain not in ['103505', '100306', '100505', '']:
        set_seed_home_crawled(uid, 2)
        return
    # end

    domain_uid = domain + uid
    page_domain = 'page_' + domain
    url = pic_url.format(domain_uid, page_domain)
    html = get_page(url)
    weibo_pics, next_ajax_url = get_wbdata_fromweb(html)

    if weibo_pics is None or next_ajax_url is None:
        crawler.warning('Failed to crawl the album of user {}, please check the reason'.format(uid))
        set_seed_home_crawled(uid, 3)
        return

    if not weibo_pics:
        crawler.warning('Failed to crawl the album of user {}, possibly because they have never posted picture weibos'.format(uid))
        set_seed_home_crawled(uid, 5)
        return

    insert_weibo_pics(weibo_pics)

    if not next_ajax_url:
        crawler.warning('Finished crawling the album of user {}'.format(uid))
        set_seed_home_crawled(uid, 4)
        return

    cur_page += 1

    while cur_page <= limit:
        # Some weibo accounts have as many as two or three thousand photos;
        # crawling them all wastes time, so for now we simply cap the crawl by
        # the current page number. Investigation suggests about 10 pages is a
        # reasonable figure.
        if cur_page > max_page:
            break
        # end

        cur_time = int(time.time() * 1000)
        ajax_call = 1
        page_id = domain_uid
        url = ajax_url.format(page_id, cur_page, ajax_call, cur_time) + '&' + next_ajax_url
        html = get_page(url, user_verify=False)

        weibo_pics, next_ajax_url = get_pic_data_byajax(html)

        if weibo_pics is None or next_ajax_url is None:
            crawler.warning('Failed to crawl the album of user {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        if not weibo_pics:
            crawler.warning('Failed to crawl the album of user {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        insert_weibo_pics(weibo_pics)

        if not next_ajax_url:
            crawler.warning('Finished crawling the album of user {}'.format(uid))
            set_seed_home_crawled(uid, 4)
            return

        cur_page += 1

    # After finishing the prescribed maximum number of pages, exit proactively
    # and set the flag to 4
    set_seed_home_crawled(uid, 4)
    return

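# The integer flags passed to set_seed_home_crawled in crawl_weibo_pics are
# not documented in this section. Based purely on the call sites above, a
# plausible reading is the following; the exact semantics are an assumption.
SEED_HOME_STATUS = {
    2: 'skipped: not a personal account',
    3: 'album crawl failed, needs investigation',
    4: 'album crawl finished (or page cap reached)',
    5: 'no picture weibos found',
}
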
def crawl_weibo_datas(uid): limit = get_max_home_page() cur_page = 1 while cur_page <= limit: url = HOME_URL.format(uid, cur_page) if cur_page == 1: html = get_page(url, auth_level=1) else: html = get_page(url, auth_level=2) weibo_datas = get_data(html) if not weibo_datas: crawler.warning("user {} has no weibo".format(uid)) return # Check whether weibo created after time in spider.yaml length_weibo_datas = len(weibo_datas) timeafter = time.mktime( time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S')) weibo_datas = [ weibo_datum for weibo_datum in weibo_datas if determine(weibo_datum, timeafter) ] WbDataOper.add_all(weibo_datas) # If the weibo isn't created after the given time, jump out the loop if len(weibo_datas) != length_weibo_datas: break domain = public.get_userdomain(html) cur_time = int(time.time() * 1000) ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time) ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100) if cur_page == 1: # here we use local call to get total page number total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2)) auth_level = 1 if total_page < limit: limit = total_page # Since the second ajax of page 1 has already been crawled # in the code above and has been stored in databse, # we only have to crawl the first ajax of page 1 crawl_ajax_page(ajax_url_0, auth_level) else: auth_level = 2 # Still the same as before # if total_page != limit: # limit = total_page # crawler.warning("total pagenum is {}".format(total_page)) crawl_ajax_page(ajax_url_0, auth_level) crawl_ajax_page(ajax_url_1, auth_level) cur_page += 1 SeedidsOper.set_seed_home_crawled(uid)
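# The determine() predicate used in the list comprehension above is not shown
# in this section. A minimal sketch consistent with its call site, assuming
# create_time uses the '%Y-%m-%d %H:%M' layout seen in the earlier loop-based
# version of the same filter:
import time

def determine(weibo_datum, timeafter):
    """Return True when the weibo was created at or after the given timestamp."""
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    return weibo_time >= timeafter
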
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        # if cur_page == 1:
        #     html = get_page(url, auth_level=1)
        # else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count += 1
                # time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return

        # Check whether weibo was created after the time in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo wasn't created after the given time, jump out of the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        # if cur_page == 1:
        #     # here we use a local call to get the total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        # if total_page < limit:
        #     limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page {}".format(uid))
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    # Custom cap on the number of pages to crawl
    max_page = 10
    # end
    while cur_page <= limit:
        # Some weibo accounts have as many as two or three thousand photos;
        # crawling them all wastes time, so for now we simply cap the crawl by
        # the current page number. Investigation suggests about 10 pages is a
        # reasonable figure; the desktop site shows 45 weibos per page, which
        # makes 450 weibos per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)
        domain = public.get_userdomain(html)

        # Only crawl photos of personal weibo accounts; non-personal accounts
        # (government, organizations, etc.) are not crawled.
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # If weibo_pics is not empty, insert it into the database as well
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

    # Set the flag after iterating over all pages. Placed here, it means every
    # page was visited, without guaranteeing that each visit succeeded; this
    # may need further optimization later, e.g. setting it inside a callback.
    set_seed_home_crawled(uid)

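# crawl_ajax_page is both called locally (to read the total page number) and
# dispatched as a Celery task throughout this section, but its body is not
# shown. A hedged sketch consistent with both uses: it must return the fetched
# html so get_total_page can parse it. The decorator arguments, the auth_level
# default, and the get_ajax_data helper name are assumptions, not the
# project's confirmed API.
@app.task(ignore_result=True)
def crawl_ajax_page(url, auth_level=2):
    ajax_html = get_page(url, auth_level)
    ajax_wbdatas = get_ajax_data(ajax_html)  # hypothetical parser for the ajax payload
    if not ajax_wbdatas:
        return ''
    insert_weibo_datas(ajax_wbdatas)
    return ajax_html
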