def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of the user identified by ``uid``.

    Walks the user's home pages up to the configured page limit, stores the
    parsed posts, and dispatches the per-page ajax crawls as Celery tasks.
    The page-1 ajax response is fetched locally so the real total page count
    can shrink the limit.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            # Runtime log text (project-facing, Chinese) intentionally unchanged.
            crawler.warning('用户id为{}的用户主页微博数据未采集成功,请检查原因'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        # The two ajax urls differ only in the pagebar index (0 / 1).
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            # Synchronous local call so the real page count is known up front.
            # NOTE(review): this also inserts page-1 ajax data locally; the same
            # url is dispatched again below — presumably the DB layer dedupes.
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
def crawl_ajax_page(url):
    """Crawl one home-page ajax url and store the weibo posts it contains.

    :param url: user home ajax url
    :return: the raw ajax response text, so callers (e.g. the local page-1
             call in ``crawl_weibo_datas``) can derive the total page count
             via ``get_total_page``; ``''`` when no weibo data was parsed
    """
    ajax_html_0 = get_page(url)
    ajax_wbdatas_0 = get_home_wbdata_byajax(ajax_html_0)

    if not ajax_wbdatas_0:
        # Signal "no data" with an empty string instead of the original
        # implicit None, which broke get_total_page(crawl_ajax_page(...)).
        return ''

    insert_weibo_datas(ajax_wbdatas_0)
    # Bug fix: the original fell off the end and returned None even on
    # success, so the caller could never read the total page count.
    return ajax_html_0
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of the user identified by ``uid``.

    Walks the user's home pages up to the configured page limit, stores the
    parsed posts, dispatches the per-page ajax crawls as Celery tasks, and
    finally marks the seed as crawled.  The page-1 ajax response is fetched
    locally so the real total page count can shrink the limit.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        # Millisecond timestamp mimics the browser's cache-busting parameter;
        # the second url gets +100 so the two requests differ.
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # All pages visited (successfully or not): flag the seed as crawled.
    set_seed_home_crawled(uid)
def crawl_ajax_page(url):
    """Fetch one home ajax page, persist its weibo data, and return the raw html.

    Returns '' when the page yields no parseable weibo data.
    """
    page_text = get_page(url, user_verify=False)
    parsed_items = get_home_wbdata_byajax(page_text)
    if not parsed_items:
        return ''
    insert_weibo_datas(parsed_items)
    return page_text
def test_add_search_cont(self):
    """Test bulk insertion of weibo data parsed from a local search-page fixture.

    :return: None
    """
    from db.wb_data import insert_weibo_datas
    from page_parse import search

    with open('tests/search.html', encoding='utf-8') as fixture:
        page_html = fixture.read()
    insert_weibo_datas(search.get_search_info(page_html))
def crawl_ajax_page(url):
    """Crawl a single user-home ajax url and store the parsed weibo data.

    :param url: user home ajax url
    :return: resp.text
    """
    raw_html = get_page(url, user_verify=False)
    home_items = get_home_wbdata_byajax(raw_html)
    if not home_items:
        return ''
    insert_weibo_datas(home_items)
    return raw_html
def test_add_search_cont(self):
    """Test bulk insertion of weibo data fetched from the test HTTP server.

    :return: None
    """
    from db.wb_data import insert_weibo_datas
    from page_parse import search

    resp = requests.get(TEST_SERVER + 'search.html')
    resp.encoding = 'utf-8'
    insert_weibo_datas(search.get_search_info(resp.text))
def crawl_ajax_page(url):
    """Crawl one home ajax page and persist its weibo data.

    The return value mainly serves the first, local call (which needs the
    raw html to compute the total page count); networked task invocations
    ignore it.

    :param url: user home ajax url
    :return: raw ajax html, or '' when no weibo data was parsed
    """
    html_text = get_page(url, user_verify=False)
    wb_items = get_home_wbdata_byajax(html_text)
    if not wb_items:
        return ''
    insert_weibo_datas(wb_items)
    return html_text
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts of the user identified by ``uid``.

    Walks the user's home pages up to the configured page limit, stores the
    parsed posts, dispatches the per-page ajax crawls as Celery tasks, and
    finally marks the seed as crawled.  Unlike earlier variants, the ajax
    urls are built from the page id parsed out of the html rather than the
    domain alone.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        # Millisecond timestamp mimics the browser's cache-busting parameter;
        # the second url gets +100 so the two requests differ.
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # All pages visited: flag the seed as crawled.
    set_seed_home_crawled(uid)
def crawl_weibo_datas(uid):
    """Crawl the home-page weibo posts (and picture records) of user ``uid``.

    Photo-oriented variant: caps crawling at a hard page limit, skips
    non-personal accounts, and additionally stores weibo picture rows.

    :param uid: weibo user id whose home timeline should be crawled
    :return: None
    """
    limit = get_max_home_page()
    cur_page = 1
    # Custom cap on the number of pages to crawl.
    max_page = 10
    while cur_page <= limit:
        # Some accounts have two to three thousand photos; crawling them all
        # wastes time, so we crudely cap by page count.  Investigation showed
        # ~10 pages is a reasonable number: the desktop site shows 45 posts
        # per page, i.e. about 450 posts per account.
        if cur_page > max_page:
            break

        url = home_url.format(uid, cur_page)
        html = get_page(url)
        domain = public.get_userdomain(html)

        # Only crawl photos of personal accounts; skip non-personal ones
        # (government bodies, organisations, etc.).  The flag value 2 marks
        # the seed as deliberately skipped.
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            # Runtime log text (project-facing, Chinese) intentionally unchanged.
            crawler.warning('用户id为{}的用户主页微博数据未采集成功,请检查原因'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # Persist picture records when any were parsed.
        if weibo_pics:
            insert_weibo_pics(weibo_pics)

        # Millisecond timestamp mimics the browser's cache-busting parameter;
        # the second url gets +100 so the two requests differ.
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # Synchronous local call so the real page count is known up front.
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # Set the flag after iterating all pages.  Placed here it only means every
    # page was visited, not that each visit succeeded; may later be moved into
    # a callback for stronger guarantees.
    set_seed_home_crawled(uid)