def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Due to the limits of weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)
    user, is_crawled = user_get.get_profile(uid, domain)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid, domain), queue='fans_followers',
                      routing_key='for_fans_followers')
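# Hypothetical sketch, not the project's actual code: get_userdomain() above is assumed to
# read the $CONFIG['domain'] value that weibo.com embeds in the home-page source, which the
# crawler then reuses to build ajax urls and to tell account types apart.
import re

def get_userdomain(html):
    match = re.search(r"\$CONFIG\['domain'\]='(\d+)'", html)
    return match.group(1) if match else ''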
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('The login simulation for this round is starting')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password))
        time.sleep(10)
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home weibo data for user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
def excute_home_task():
    # you can have many strategies to crawl user's home page, here we choose table seed_ids's uid
    # whose home_crawl is 0
    id_objs = get_home_ids()
    for id_obj in id_objs:
        app.send_task('tasks.home.crawl_weibo_datas', args=(id_obj.uid,), queue='home_crawler',
                      routing_key='home_info')
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    set_seed_home_crawled(uid)
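# Hypothetical sketch, not the project's actual implementation: crawl_ajax_page() is used
# both as a local call above (so total_page is available immediately) and as a Celery task,
# so a minimal version only needs to fetch the ajax url and return the content that
# get_total_page() parses. Parsing and persisting the weibo items found in the response is
# deliberately left out of this sketch.
def crawl_ajax_page(url):
    ajax_html = get_page(url)  # reuse the same page fetcher as the home-page crawl
    # ...parse the weibo entries embedded in ajax_html and insert them here...
    return ajax_html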
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No weibo related to keyword {} was fetched this time, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo already exists in the database; if it does, it has already
        # been crawled (results are sorted by time by default), so break out of the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All newly updated weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # For now we use a network call rather than a local call here; weigh the pros and cons of both
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} is complete'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def get_answer_comments():
    comment_ids = get_zhihu_answer_comment_not_crawled()
    for comment_id in comment_ids:
        # note the trailing comma: args must be a tuple, not a bare value
        app.send_task('tasks.answer_comment.get_one_answer_comments', args=(comment_id[0],),
                      queue='answer_comment_crawler', routing_key='for_search_info')
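# Note on the trailing comma above: parentheses alone do not create a tuple in Python, so
# args=(comment_id[0]) would pass the bare value itself rather than a one-element tuple,
# which is not what send_task expects for its positional arguments.
assert ("12345") == "12345"        # still a plain string
assert ("12345",) == ("12345",)    # a real one-element tuple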
def excute_search_task():
    keywords = get_search_keywords()
    for each in keywords:
        app.send_task('tasks.search.search_keyword', args=(each[0], each[1]), queue='search_crawler',
                      routing_key='for_search_info')
def excute_pic_task():
    # The strategy here is up to you: you can crawl home pages based on existing users or specify
    # some users yourself; here we simply use the uids from the seed database
    id_objs = get_home_ids(0)
    for id_obj in id_objs:
        app.send_task('tasks.pic.crawl_weibo_pics', args=(id_obj.uid,), queue='pic_crawler',
                      routing_key='pic_info')
        # crawl_weibo_pics(id_obj.uid)
def execute_start_request():
    meta = {}
    start_url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python"
    meta["start_url"] = start_url
    app.send_task("tasks.downloader.downloader", args=(start_url, meta), queue="downloader_queue",
                  routing_key="for_download")
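# Hypothetical sketch of the receiving side (assumed, not shown in this listing): a worker task
# registered under the name 'tasks.downloader.downloader' that accepts the (url, meta) pair
# dispatched above and carries meta through to later stages of the pipeline.
@app.task(name='tasks.downloader.downloader')
def downloader(url, meta):
    html = get_page(url)          # reusing get_page here is an assumption
    meta['page_len'] = len(html)  # illustrative bookkeeping only
    return meta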
def excute_comment_task():
    # weibo_datas = wb_data.get_weibo_comment_not_crawled()
    weibo_data = '4079144788308403'  # hardcoded weibo id, presumably for testing
    # for weibo_data in weibo_datas:
    #     app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data.weibo_id,), queue='comment_crawler',
    #                   routing_key='comment_info')
    app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data,), queue='comment_crawler',
                  routing_key='comment_info')
def execute_user_task():
    seeds, is_exists = SeedUser.get_seed_names()
    if is_exists:
        for seed in seeds:
            crawler.info(f"send task crawl_user_info {seed.name}")
            app.send_task("tasks.user.crawl_user_info", args=(seed.name,))
    else:
        crawler.info("no seed user found, abort")
def excute_repost_task():
    # Analyse the current weibo as the source weibo without tracing back to the root; if you need
    # to trace back, you have to check whether this weibo is the root weibo yourself
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('{} weibos need their repost info crawled in this round'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
def excute_repost_task():
    datas = weibosearch_dao.get_crawl_urls()
    crawler.info('Got {len} weibos to crawl in total'.format(len=len(datas)))
    # Distribute the crawl tasks to the worker machines
    for data in datas:
        app.send_task('tasks.repost.get_current_reposts', args=(data['url'], data['mid']))
    crawler.info('Task distribution for this round is complete')
def excute_user_task():
    seeds = get_seed_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,), queue='user_crawler',
                          routing_key='for_user_info')
def excute_comment_task():
    # Only root comments are parsed; replies under root comments are not crawled.
    # Adjust this if you need them.
    weibo_datas = wb_data.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.comment.crawl_comment_page', args=(weibo_data.weibo_id,), queue='comment_crawler',
                      routing_key='comment_info')
def excute_repost_task():
    # regard current weibo url as the original url, you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('The login simulation for this round is starting')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password, info.need_verify),
                      queue='login_queue', routing_key='for_login')
        time.sleep(10)
def excute_home_task():
    # The strategy here is up to you: you can crawl home pages based on existing users or specify
    # some users yourself; here we simply use the uids from the seed database
    uids = get_home_ids()
    for uid in uids:
        app.send_task('tasks.home.crawl_weibo_datas', args=(uid[0],), queue='home_crawler',
                      routing_key='home_info')
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each login round
    Cookies.check_login_task()
    log.crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password), queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each login round
    Cookies.check_login_task()
    log.crawler.info('The login simulation for this round is starting')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password), queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # Use a local call here so that the result is available immediately
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
                      routing_key='comment_page_info')
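# Worked example of the limit arithmetic above (illustrative numbers only): with
# conf.get_max_comment_page() == 50, limit starts at 51; if the weibo only has 3 comment
# pages, total_page == 3 < 51, so limit becomes 4 and range(2, 4) dispatches pages 2 and 3,
# page 1 having already been crawled locally.
assert list(range(2, 3 + 1)) == [2, 3]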
def get_people_and_follows(people_id, selector):
    try:
        people = People()
        people.people_id = people_id
        people.name = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/h1/text()')[0].strip()
        people.desc = "".join(
            selector.xpath(
                '//div[@class="aw-user-center"]/div[1]/div/span/text()'))
        if selector.xpath('//i[contains(@class,"i-user-locate")]'):
            user_locate = selector.xpath(
                '//i[contains(@class,"i-user-locate")]')[0].getparent()
            people.province = "".join(user_locate.xpath('a[1]/text()'))
            people.city = "".join(user_locate.xpath('a[2]/text()'))
        if selector.xpath('//i[contains(@class,"i-user-post")]'):
            user_post = selector.xpath(
                '//i[contains(@class,"i-user-post")]')[0].getparent()
            people.post = "".join(user_post.xpath('text()')).strip()
        if selector.xpath('//i[contains(@class,"i-user-visits")]'):
            user_visits = selector.xpath(
                '//i[contains(@class,"i-user-visits")]')[0].getparent()
            user_visits_str = "".join(user_visits.xpath('text()'))
            people.home_visit_num = re.findall(r'(\d+)', user_visits_str)[0]
        people_type_spans = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/p[3]/span')
        people.user_type = people_type_spans[0].xpath(
            'a/em/text()')[0].replace("»", "").strip()
        people.weiwang_num = people_type_spans[1].xpath('em/text()')[0]
        people.agree_num = people_type_spans[2].xpath('em/text()')[0]
        people.thanks_num = people_type_spans[3].xpath('em/text()')[0]
        people.gold_num = people_type_spans[4].xpath('em/text()')[0]
        if '+' in people.gold_num:
            people.gold_num = 100
        if selector.xpath('//span[contains(text(),"最后活跃")]'):
            last_active_time_str = selector.xpath(
                '//span[contains(text(),"最后活跃")]')[0].getparent().getnext().xpath('text()')[0]
            people.last_active_time = str2datetime(last_active_time_str)
        CommonOper.add_one(people)
        CommonOper.add_filter_key("people_id", people_id)
    except Exception as e:
        jsl_log.warning(
            "get people info error, people_id:{}, here are details {}".format(
                people_id, format_tb(e.__traceback__)[0]))
    app.send_task("tasks.people.do_follow", args=(people_id, 0), queue="people_queue",
                  routing_key="people")
def crawl_user_info(name):
    """Crawl the info on the user's home page

    :param name: user name
    :return: None
    """
    if not name:
        return None
    crawler.info(f"received task crawl_user_info {name}")
    user, other_crawled = get_profile(name)
    if not other_crawled:
        crawler.info(f"send task crawl_follower_fans {user.name}")
        app.send_task("tasks.user.crawl_follower_fans", args=(user.name,))
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('The login simulation for this round is starting')
    for info in infos:
        try:
            rs = Cookies.check_login_task(info.name)
        except KeyError:
            log.crawler.warning('Please check whether the worker has been started and login_queue has been specified')
        else:
            if not rs:
                app.send_task('tasks.login.login_task', args=(info.name, info.password, info.need_verify),
                              queue='login_queue', routing_key='for_login')
def get_douban_subject_id_list(tag, sort, page_start, page_limit):
    """
    Get the douban movie id list by subject
    :param tag:
    :param sort:
    :param page_start:
    :param page_limit:
    :return:
    """
    app.send_task('tasks.movie.douban_get_subject_id_list', args=(tag, sort, page_start, page_limit))
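# Usage illustration (the tag and sort values below are made-up examples, not taken from the
# project's configuration): dispatch the first 20 ids of a subject listing to the movie worker.
get_douban_subject_id_list(tag='热门', sort='recommend', page_start=0, page_limit=20)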
def task_filter(task_type, param):
    if task_type == 'question':
        if not CommonOper.is_exist("question_id", param):
            app.send_task('tasks.question.do_question', args=(param,), queue='question_queue',
                          routing_key='question')
        else:
            jsl_log.info("The question already exists, question_id:{}".format(param))
    elif task_type == 'people':
        if not CommonOper.is_exist("people_id", param):
            app.send_task('tasks.people.do_people', args=(param,), queue='people_queue',
                          routing_key='people')
        else:
            jsl_log.info("The people record already exists, people_id:{}".format(param))
def excute_start_crawler(parameter):
    crawler_info.info("Task started!")
    result = app.send_task('tasks.start_task.start_crawler', args=(parameter["id"],), queue='crawler_queue',
                           routing_key='for_crawler')
    crawler_info.info(result.task_id)
    return result.task_id
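# The returned task_id can later be used to check on the dispatched task through Celery's
# standard AsyncResult API; how this project actually consumes the id is not shown here.
task_id = excute_start_crawler({'id': 42})   # illustrative parameter only
result = app.AsyncResult(task_id)
if result.ready():
    crawler_info.info(result.get())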
def crawl_person_infos(uid):
    """
    Crawl the user's profile and the numbers of their followings and fans by user id
    (due to weibo's backend limits, only the first five pages are crawled by default,
    and an enterprise account's followings and fans cannot be viewed either).
    :param uid: user id
    :return:
    """
    if not uid:
        return

    # The data table is shared with other tasks, so check whether the user info already exists
    # in the database before crawling
    user = user_get.get_profile(uid)
    # Skip enterprise accounts
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                  routing_key='for_fans_followers')
def start_crawler(parameter):
    parameters = {}
    parameters["id"] = parameter
    params = spider_task.start_task(parameters)
    for item in params:
        crawler = Crawleruning()
        crawler.set_parameter(item)
        crawler.start()
        target_url = crawler.process()
        for sub_url in target_url:
            item["url"] = sub_url
            app.send_task('tasks.start_task.parse_url', args=(item,), queue='crawler_queue',
                          routing_key='for_crawler')
    parameters["status"] = 0
    spider_task.update_status(parameters)
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Due to the limits of weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user, is_crawled = user_get.get_profile(uid)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                      routing_key='for_fans_followers')
def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # todo: no test data for video has been found yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # todo: normalize the date format; raw values include noise like "今日..." (today ...) and "...分钟前" (... minutes ago)
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None

    return wb_data, is_all_cont
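# Hypothetical sketch of the caller side (assumed; the attribute used to select the feed cards
# is a guess and not confirmed by this listing): parse_search.get_search_info() is expected to
# split the search page into cards and feed each one to get_weibo_info() above.
from bs4 import BeautifulSoup

def get_search_info(html):
    soup = BeautifulSoup(html, 'html.parser')
    weibo_list = []
    for each in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        rs = get_weibo_info(each, html)
        if rs:
            weibo_list.append(rs[0])   # rs is (wb_data, is_all_cont)
    return weibo_list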