def excute_search_task():
    keywords = get_search_keywords()
    for each in keywords:
        celery.send_task('celery_tasks.weibo.search.search_keyword',
                         args=(each[0], each[1]), queue='search_crawler',
                         routing_key='for_search_info')
def excute_user_task():
    seeds = get_seed_ids()
    if seeds:
        for seed in seeds:
            celery.send_task('celery_tasks.weibo.user.crawl_person_infos',
                             args=(seed.uid, ), queue='user_crawler',
                             routing_key='for_user_info')
def excute_comment_task():
    # Only root comments are parsed here; replies under root comments are not
    # crawled. Adjust this if you need the nested replies as well.
    weibo_datas = wb_data.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        celery.send_task('celery_tasks.weibo.comment.crawl_comment_page',
                         args=(weibo_data.weibo_id, ), queue='comment_crawler',
                         routing_key='comment_info')
def excute_home_task():
    # The crawling strategy is up to you: crawl home pages of users you have
    # already stored, or specify your own user list. Here the uids are taken
    # directly from the seed database.
    id_objs = get_home_ids()
    for id_obj in id_objs:
        celery.send_task('celery_tasks.weibo.home.crawl_weibo_datas',
                         args=(id_obj.uid, ), queue='home_crawler',
                         routing_key='home_info')
def excute_user_personal_adver_task(adv_message):
    seeds = get_seed()
    if seeds:
        for seed in seeds:
            sleep(random.randint(1, 6))
            print(seed.uid)
            celery.send_task('celery_tasks.weibo.user.excute_personal_adver',
                             args=(seed.uid, adv_message), queue='personal_adver',
                             routing_key='for_adver')
def excute_login_task():
    infos = login_info.get_login_info()
    # Clear all stacked login tasks before each login round
    Cookies.check_login_task()
    log.crawler.info('The excute_login_task is starting...')
    for info in infos:
        celery.send_task('celery_tasks.weibo.login.login_task',
                         args=(info.name, info.password, info.source),
                         queue='login_task', routing_key='login_task')
        time.sleep(10)
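# A minimal sketch of how these excute_* dispatchers could be driven on a
# timer with Celery beat. The beat_schedule API is standard Celery, but the
# task names and intervals below are assumptions for illustration; use the
# names under which your dispatchers are actually registered.
from celery.schedules import crontab

celery.conf.beat_schedule = {
    'login-every-hour': {
        # assumed registered name of excute_login_task
        'task': 'celery_tasks.weibo.login.excute_login_task',
        'schedule': crontab(minute=0),   # at the top of every hour
    },
    'search-every-30-minutes': {
        # assumed registered name of excute_search_task
        'task': 'celery_tasks.weibo.search.excute_search_task',
        'schedule': 30 * 60,             # interval in seconds
    },
}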
def excute_repost_task():
    # Regard the current weibo url as the original url; you can also analyse
    # reposts starting from the root url.
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(
        len(weibo_datas)))
    for weibo_data in weibo_datas:
        celery.send_task('celery_tasks.weibo.repost.crawl_repost_page',
                         args=(weibo_data.weibo_id, weibo_data.uid),
                         queue='repost_crawler', routing_key='repost_info')
def crawl_comment_page(mid):
    limit = conf.get_max_comment_page() + 1
    # The first page is crawled with a local call so that its result is
    # available immediately for reading the total page count.
    first_page = crawl_comment_by_page(mid, 1)
    total_page = comment.get_total_page(first_page)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        celery.send_task('celery_tasks.weibo.comment.crawl_comment_by_page',
                         args=(mid, page_num), queue='comment_page_crawler',
                         routing_key='comment_page_info')
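# For reference, a minimal sketch of what the page-level task
# crawl_comment_by_page could look like. It is not shown in this listing, so
# the url template (comment_base_url), the parser call
# (comment.get_comment_list) and the storage helper (insert_comments) below
# are assumptions for illustration only.
def crawl_comment_by_page(mid, page_num):
    # assumed comment page url template with mid and page number placeholders
    cur_url = comment_base_url.format(mid, page_num)
    html = get_page(cur_url)
    # parse the comments out of the page and persist them (hypothetical helpers)
    comment_datas = comment.get_comment_list(html, mid)
    if comment_datas:
        insert_comments(comment_datas)
    # return the raw page so the local call in crawl_comment_page can read
    # the total page count from it
    return html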
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of the limits of weibo's backend, we can only crawl 5 pages of
    fans and followers, and we have no permission to view the followers and
    fans of enterprise accounts.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user = user_get.get_profile(uid)
    # If it's an enterprise user, just skip it
    if user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    celery.send_task('celery_tasks.weibo.user.crawl_follower_fans',
                     args=(uid, ), queue='fans_followers',
                     routing_key='for_fans_followers')
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Because the search results are sorted by time, if any result has been
        # stored in mysql, we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                celery.send_task('celery_tasks.weibo.user.crawl_person_infos',
                                 args=(wb_data.uid, ), queue='user_crawler',
                                 routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
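# Note: 'limit' and 'url' used in search_keyword above are module-level values
# that are not shown in this listing. A minimal sketch of how they might be
# defined; both the get_max_search_page() helper and the url template are
# assumptions for illustration only:
limit = get_max_search_page() + 1
url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}'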
def jd_seckill_timer_relogin(self):
    task_id = self.request.id
    logger = cl.getLogger('jd_seckill_timer_relogin')
    logger.info('Re-login task started, task_id: ' + task_id)
    ppool = ProxyStore.get_proxyPoolstores()
    mongdb_conn = DBStore.get_datastores()
    mydb = mongdb_conn['JD']
    jd_users = mydb.Users.find({"status": 1})
    for jd_user in jd_users:
        fetch_result = celery.send_task(
            'celery_tasks.jd_seckill.jd_seckill.jd_seckill_relogin_task',
            queue='jd_seckill_presell',
            args=(jd_user["username"], jd_user["password"], 4099139,
                  ppool.getProxy()))
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data of user {}, '
                            'please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page,
                                     cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page,
                                     cur_time + 100)

        if cur_page == 1:
            # Read the total page count from the first ajax page so that we do
            # not request more pages than actually exist
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        celery.send_task('celery_tasks.weibo.home.crawl_ajax_page',
                         args=(ajax_url_0, ), queue='ajax_home_crawler',
                         routing_key='ajax_home_info')
        celery.send_task('celery_tasks.weibo.home.crawl_ajax_page',
                         args=(ajax_url_1, ), queue='ajax_home_crawler',
                         routing_key='ajax_home_info')
# coding:utf-8
from apps.celery_init import celery
from celery_tasks.weibo import login

if __name__ == '__main__':
    # You should execute this file, because the celery timer will execute the
    # login task with a delay
    # login.excute_login_task()
    celery.send_task('apps.celery_init.start_add_task', args=(1, 2),
                     queue='start_add_task', routing_key='start_add_task')
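# A possible way to run the pieces above, assuming the Celery app lives in
# apps.celery_init as imported here. The queue names are the ones used by the
# send_task calls in this listing; start only the queues you actually need:
#
#   celery -A apps.celery_init worker -l info -Q login_task,user_crawler,search_crawler
#   celery -A apps.celery_init beat -l info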