def test_get_url_from_web(self):
    from page_get import user as user_get
    normal_user = user_get.get_profile('1195908387')
    self.assertEqual(normal_user.name, '日_推')
    writer = user_get.get_profile('1191258123')
    self.assertEqual(writer.description, '韩寒')
    enterprise_user = user_get.get_profile('1839256234')
    self.assertEqual(enterprise_user.level, 36)
def test_get_url_from_web(self): """ test crawling different kind of users """ from page_get import user as user_get normal_user, _ = user_get.get_profile('1195908387') self.assertEqual(normal_user.name, '日_推') writer, _ = user_get.get_profile('1191258123') self.assertEqual(writer.description, '韩寒') enterprise_user, _ = user_get.get_profile('1839256234') self.assertEqual(enterprise_user.level, 36)
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = user_get.get_profile(uid)
    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # if the name lookup fails, fall back to the root user's uid and name
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
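IdNames above resolves a reposter's parent screen name to a uid. Its implementation is not shown in this section; a plausible sketch, assuming (not confirmed by this code) a Redis hash shared by all workers:

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)

class IdNames:
    KEY = 'id_names'  # hypothetical hash key

    @classmethod
    def store_id_name(cls, user_name, uid):
        r.hset(cls.KEY, user_name, uid)

    @classmethod
    def fetch_uid_by_name(cls, user_name):
        uid = r.hget(cls.KEY, user_name)
        # hget returns bytes or None; callers above treat '' as a miss
        return uid.decode('utf-8') if uid else ''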
def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo
    :param wb_mid: mid of the current weibo
    :return: repost count, weibo user id and user name, or None on failure
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts_count = parse_status.get_repostcounts(html)
    comments_count = parse_status.get_commentcounts(html)
    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid, reposts=reposts_count, comments=comments_count)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    root_user = user.get_profile(user_id)
    # persist the source weibo's related info
    spread_original_dao.save(root_user, wb_mid, post_time, device, reposts_count,
                             comments_count, root_url)
    crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
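A hedged usage sketch for _get_current_source(); the permalink and mid below are illustrative placeholders, not real data:

result = _get_current_source('http://weibo.com/1195908387/FzEvPqEIQ', '4175441325983343')
if result:
    reposts_count, user_id, user_name = result
    print('root poster {name} ({uid}), {counts} reposts'.format(
        name=user_name, uid=user_id, counts=reposts_count))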
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    wb_data.set_weibo_repost_crawled(mid)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user = user_get.get_profile(uid)
    if total_page < limit:
        limit = total_page + 1

    # todo: weigh whether a network (task queue) call is really necessary here
    for page_num in range(2, limit):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
        #               routing_key='comment_page_info')
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    # fill in parent_user_id so the spread can be visualized
    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # fall back to the root user's uid and name
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
def crawl_person_infos(uid): """ 根据用户id来爬取用户相关资料和用户的关注数和粉丝数(由于微博服务端限制,默认爬取前五页,企业号的关注和粉丝也不能查看) :param uid: 用户id :return: """ if not uid: return # 由于与别的任务共享数据表,所以需要先判断数据库是否有该用户信息,再进行抓取 user = user_get.get_profile(uid) # 不抓取企业号 if user.verify_type == 2: set_seed_other_crawled(uid) return seed = get_seed_by_id(uid) if seed.other_crawled == 0: rs = user_get.get_fans_or_followers_ids(uid, 1) rs.extend(user_get.get_fans_or_followers_ids(uid, 2)) datas = set(rs) # 重复数据跳过插入 if datas: insert_seeds(datas) set_seed_other_crawled(uid)
def crawl_person_infos(uid): """ Crawl user info and their fans and followers For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers. We also have no permissions to view enterprise's followers and fans info :param uid: current user id :return: None """ if not uid: return url = home_url.format(uid) html = get_page(url) if is_404(html): return None domain = public.get_userdomain(html) user, is_crawled = user_get.get_profile(uid, domain) # If it's enterprise user, just skip it if user and user.verify_type == 2: set_seed_other_crawled(uid) return # Crawl fans and followers if not is_crawled: app.send_task('tasks.user.crawl_follower_fans', args=(uid, domain), queue='fans_followers', routing_key='for_fans_followers')
def get_status_info(url, user_id, name, mid=''):
    soc = SpreadOtherCache()
    print('current repost weibo url: ' + url)
    repost_cont = get_page(url)

    if is_404(repost_cont):
        return None

    repost_user_id = parse_status.get_userid(repost_cont)
    if repost_user_id == '':
        return None

    repost_user_name = parse_status.get_username(repost_cont)
    soc.set_id(repost_user_id)
    soc.set_name(repost_user_name)

    so = SpreadOther()
    so.id = repost_user_id
    so.screen_name = repost_user_name
    so.upper_user_name = parse_status.get_upperusername(repost_cont, name)
    cur_user = user.get_profile(repost_user_id)
    try:
        so.province = cur_user.province
        so.city = cur_user.city
        so.location = cur_user.location
        so.description = cur_user.description
        so.domain_name = cur_user.domain_name
        so.blog_url = cur_user.blog_url
        so.gender = cur_user.gender
        so.headimg_url = cur_user.headimg_url
        so.followers_count = cur_user.followers_count
        so.friends_count = cur_user.friends_count
        so.status_count = cur_user.status_count
        so.verify_type = cur_user.verify_type
        so.verify_info = cur_user.verify_info
        so.register_time = cur_user.register_time
        if so.screen_name == name:
            so.id = user_id

        so.mid = parse_status.get_mid(repost_cont)
        so.status_post_time = parse_status.get_statustime(repost_cont)
        so.device = parse_status.get_statussource(repost_cont)
        if mid:
            so.original_status_id = mid
        else:
            so.original_status_id = parse_status.get_orignalmid(repost_cont)
        so.comments_count = parse_status.get_commentcounts(repost_cont)
        so.reposts_count = parse_status.get_repostcounts(repost_cont)
        so.like_count = parse_status.get_likecounts(repost_cont)
        so.status_url = url
    except AttributeError as e:
        # todo: track down the root cause of this failure
        logging.info('failed to parse {user_id}, traceback: {e}'.format(user_id=user_id, e=e))
        logging.info('source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
        return None

    return SpreadOtherAndCache(so, soc)
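The long attribute-copy block in get_status_info() could be table-driven; a behavior-preserving alternative sketch (the field names are taken from the code above, the helper itself is hypothetical):

PROFILE_FIELDS = (
    'province', 'city', 'location', 'description', 'domain_name', 'blog_url',
    'gender', 'headimg_url', 'followers_count', 'friends_count',
    'status_count', 'verify_type', 'verify_info', 'register_time',
)

def copy_profile_fields(so, cur_user):
    # raises AttributeError just like the hand-written block, so the caller's
    # except clause still applies
    for field in PROFILE_FIELDS:
        setattr(so, field, getattr(cur_user, field))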
def test_get_user_from_web(self):
    from wblogin.login import get_session
    from page_get.user import get_profile
    from headers import headers

    user_id = '2674334272'
    sc = get_session()
    if sc:
        session = sc.get('session', '')
        if session:
            # user already stored in the database
            user = get_profile(user_id, session, headers)
            self.assertNotEqual(user.description, '')
            # user not yet in the database
            user2 = get_profile('3614046244', session, headers)
            self.assertEqual(user2.status_count, 35)
    else:
        raise Exception('simulated login failed')
def crawl_person_infos(uid): """ 根据用户id来爬取用户相关资料和用户的关注数和粉丝数(由于微博服务端限制,默认爬取前五页,企业号的关注和粉丝也不能查看) :param uid: 用户id :return: """ if not uid: return # 由于与别的任务共享数据表,所以需要先判断数据库是否有该用户信息,再进行抓取 user = user_get.get_profile(uid) # 不抓取企业号 if user.verify_type == 2: set_seed_other_crawled(uid) return app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers', routing_key='for_fans_followers')
def crawl_person_infos(uid): """ Crawl user info and their fans and followers For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers. We also have no permissions to view enterprise's followers and fans info :param uid: current user id :return: None """ if not uid: return user, is_crawled = user_get.get_profile(uid) # If it's enterprise user, just skip it if user and user.verify_type == 2: set_seed_other_crawled(uid) return # Crawl fans and followers if not is_crawled: app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers', routing_key='for_fans_followers')
def crawl_person_infos(uid): """ Crawl user info and their fans and followers For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers. We also have no permissions to view enterprise's followers and fans info :param uid: current user id :return: None """ if not uid: return user = user_get.get_profile(uid) # If it's enterprise user, just skip it if user.verify_type == 2: set_seed_other_crawled(uid) return # Crawl fans and followers celery.send_task('celery_tasks.weibo.user.crawl_follower_fans', args=(uid, ), queue='fans_followers', routing_key='for_fans_followers')
def _get_current_reposts(url, session, weibo_mid):
    """
    Modified main crawling routine. Weibo's rate limits are quite strict,
    so for now we only crawl the current weibo and its child reposts, not
    the source weibo.
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    # check for 404 before parsing counts and touching the database
    if basic.is_404(html):
        crawler.info('{url} is a 404 page'.format(url=url))
        return

    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)
    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid, reposts=reposts, comments=comments)

    root_url = url
    mid = status_parse.get_mid(html)
    user_id = status_parse.get_userid(html)
    user_name = status_parse.get_username(html)
    post_time = status_parse.get_statustime(html)
    device = status_parse.get_statussource(html)
    comments_count = status_parse.get_commentcounts(html)
    reposts_count = status_parse.get_repostcounts(html)
    root_user = user.get_profile(user_id, session, headers)

    spread_original_dao.save(root_user, mid, post_time, device, reposts_count, comments_count, root_url)
    crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))

    if reposts_count > 0:
        base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
        soc = SpreadOtherCache()
        soc.set_id(user_id)
        soc.set_name(user_name)
        spread_other_caches.append(soc)

        page = 1
        ajax_url = base_url.format(mid=mid, currpage=page)
        source = get_page(ajax_url, session, headers, False)
        crawler.info('repost info url for this fetch: ' + ajax_url)

        try:
            repost_json = json.loads(source)
            total_page = int(repost_json['data']['page']['totalpage'])
        except Exception as why:
            parser.error('json parsing of repost info from {url} failed, details: {why}'.format(
                url=ajax_url, why=why))
        else:
            page = total_page
            page_counter = 0

            while page > 0 and page_counter < page_max:
                ajax_url = base_url.format(mid=mid, currpage=page)
                repost_info = get_page(ajax_url, session, headers, False)
                try:
                    repost_json = json.loads(repost_info)
                    repost_html = repost_json['data']['html']
                except Exception as why:
                    parser.error('json parsing of repost info from {url} failed, details: {why}'.format(
                        url=ajax_url, why=why))
                else:
                    repost_urls = status_parse.get_reposturls(repost_html)
                    # ordering logic for the repost nodes
                    for repost_url in repost_urls:
                        repost_cont = status.get_status_info(repost_url, session, user_id, user_name, headers, mid)
                        if repost_cont is not None:
                            spread_other_and_caches.append(repost_cont)
                    for soac in spread_other_and_caches:
                        if soac.get_so().id != '':
                            spread_others.append(soac.get_so())
                            spread_other_caches.append(soac.get_soc())
                finally:
                    print('currently on page {currpage}'.format(currpage=page))
                    page -= 1
                    page_counter += 1

            for so in spread_others:
                if so.verify_type == '':
                    so.verify_type = 0
                for i in spread_other_caches:
                    if so.upper_user_name == i.get_name():
                        so.upper_user_id = i.get_id()
                        break
                else:
                    so.upper_user_id = user_id

            spread_others = list(set(spread_others))
            spread_other_dao.save(spread_others)
            crawler.info('fetched {num} repost records in total; repost crawling for this weibo is done'.format(
                num=len(spread_others)))
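The totalpage/html extraction in _get_current_reposts() is written out twice; a small hedged helper (hypothetical, not part of the original module) would keep the json handling in one place:

import json

def parse_repost_ajax(source):
    """Return (total_page, html) from weibo's repost ajax payload, or (0, '') on failure."""
    try:
        data = json.loads(source)['data']
        return int(data['page']['totalpage']), data.get('html', '')
    except (ValueError, KeyError, TypeError):
        # malformed payload, missing keys, or a None response
        return 0, ''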