def fetch_all_url(kw, begin, count, name_cookies):
    # Do an initial search first and save the results; it also returns the total count
    max_num = get_total(kw, begin, 1, name_cookies)
    while max_num > begin:
        search_url = get_search_biz_url(query=kw, begin=begin, count=count)
        logger.info(search_url)
        keyword.excute_user_crawl_task(search_url)
        begin += count
def search_keyword(kw):
    # Fetch an account cookie from the queue
    name_cookies = Cookies.fetch_cookies()
    if len(name_cookies) == 0:
        logger.error("No available cookie.")
    else:
        token = get_token(name_cookies)
        fetch_all(kw, 340, 5, name_cookies, token)
        logger.info('Finished searching official accounts =========> keyword: ' + kw)
        # Mark the keyword as disabled so it is not searched again
        KeywordDao.set_enable(kw, False)
def fetch_all(kw, begin, count, name_cookies, token):
    # Do an initial search first and save the results; it also returns the total count
    max_num = get_total(kw, begin, count, name_cookies, token)
    while max_num > begin:
        time.sleep(random.randint(10, 15))
        logger.info('Paging ################### begin=' + str(begin))
        try:
            begin += count
            search_by_page(kw, begin, count, name_cookies, token)
        except Exception:
            logger.error('Crawl error!!!!!!!')
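# get_total is used by fetch_all above but is not defined in this section. The
# sketch below is only an assumption of what the keyword-search variant does,
# inferred from how fetch_all calls it: run one page of the searchbiz query via
# search_by_page (which returns the parsed JSON) and read the overall result
# count. The 'total' field name is an assumption about the searchbiz response.
def get_total(kw, begin, count, name_cookies, token):
    # Reuse the paged search and pull the overall result count from the reply
    result = search_by_page(kw, begin, count, name_cookies, token)
    return int(result.get('total', 0))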
def search_by_page(wechat_biz, begin, count, name_cookies):
    logger.info('Start searching official account =========>: ' + str(begin) + ':' + wechat_biz)
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    token = cookies['token']
    search_url = get_search_wechat_url(fakeid=wechat_biz, begin=begin, count=count, token=token)
    logger.info(search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    return search_response
def search_by_page(kw, begin, count, name_cookies):
    logger.info('Start searching keyword =========>: ' + str(begin) + ':' + kw)
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    token = cookies['token']
    search_url = get_search_biz_url(query=kw, begin=begin, count=count, token=token)
    logger.info(search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    return search_response
def get_article_url_list(search_url):
    # Fetch an account cookie from the queue
    name_cookies = get_cookie()
    while name_cookies is None:
        logger.info('Cookie is empty, retrying in 30s')
        time.sleep(30)
        name_cookies = get_cookie()
    cookies = name_cookies[1]
    token = cookies['token']
    search_url = search_url + '&token=' + token
    logger.info("searching article ==========>: " + search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    lists = search_response.json().get('app_msg_list')
    if not os.path.exists(url_save_path):
        os.mkdir(url_save_path)
    urls_file = open(url_save_path + '/urls.txt', 'a+', encoding='utf-8')
    for item in lists:
        json_str = json.dumps(item, ensure_ascii=False)
        logger.info(json_str)
        try:
            link = item.get('link')
            # Append the article URL to the file
            urls_file.write(link + '\n')
            # Dispatch a WeChat article crawl task
            wechat_crawl.excute_wechat_crawl_task(link)
        except Exception:
            logger.info("Save failed: " + json_str)
    urls_file.close()
    random_time = random.randint(15, 30)
    time.sleep(random_time)
def search_keyword(kw):
    # Fetch an account cookie from the queue
    name_cookies = get_cookie()
    while name_cookies is None:
        logger.info('Cookie is empty while searching keyword ' + kw + ', fetching again')
        name_cookies = get_cookie()
    user_crawl.fetch_all_url(kw, start, page_size, name_cookies)
    logger.info('Official account search task finished =========> keyword: ' + kw)
    # Mark the keyword as disabled so it is not searched again
    KeywordDao.set_enable(kw, False)


# if __name__ == '__main__':
#     search_keyword('datangleiyin')
def fetch_user_all_url(wechat_biz):
    begin = 0
    count = ARTICLE_PAGE_LIMIT
    name_cookies = get_cookie()
    # Do an initial search first to get the total number of articles
    max_num = get_total(wechat_biz, begin, 1, name_cookies)
    while max_num > begin:
        search_url = get_search_wechat_url(fakeid=wechat_biz, begin=begin, count=count)
        logger.info(search_url)
        # Dispatch a search-URL crawl task
        wechat_crawl.excute_wechat_url_crawl_task(search_url)
        begin += count
    WeChatUserDao.set_history_crawled(wechat_biz)
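# get_search_biz_url and get_search_wechat_url are called above but not defined
# in this section. The sketch below is an assumption of how they could build the
# request URLs, mirroring the query parameters used verbatim in search_by_page
# and get_wechat_user_urls; the parameter defaults (empty token, appended later
# by the callers) are illustrative, not confirmed by the source.
import random
from urllib.parse import urlencode


def get_search_biz_url(query, begin, count, token=''):
    # Keyword search against the official-account directory (searchbiz endpoint)
    params = {
        'action': 'search_biz', 'token': token, 'lang': 'zh_CN', 'f': 'json',
        'ajax': '1', 'random': random.random(),
        'query': query, 'begin': begin, 'count': count,
    }
    return 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + urlencode(params)


def get_search_wechat_url(fakeid, begin, count, token=''):
    # Article listing for one official account (appmsg endpoint, action=list_ex)
    params = {
        'action': 'list_ex', 'token': token, 'lang': 'zh_CN', 'f': 'json',
        'ajax': '1', 'random': random.random(),
        'begin': begin, 'count': count, 'query': '', 'fakeid': fakeid, 'type': '9',
    }
    return 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + urlencode(params)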
def search_by_page(kw, begin, count, name_cookies, token):
    logger.info('Start searching keyword =========>: ' + str(begin) + ':' + kw)
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    query_id = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': kw,
        'begin': str(begin),
        'count': count,
    }
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
    lists = search_response.json().get('list')
    for item in lists:
        json_str = json.dumps(item, ensure_ascii=False)
        logger.info(json_str)
        try:
            WeChatUserDao.create_by_json(item)
        except Exception:
            logger.info("Save failed: " + json_str)
    return search_response.json()
def get_user_list(search_url):
    # Fetch an account cookie from the queue
    name_cookies = get_cookie()
    while name_cookies is None:
        name_cookies = get_cookie()
    cookies = name_cookies[1]
    token = cookies['token']
    search_url = search_url + '&token=' + token
    logger.info("searching ==========>: " + search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    lists = search_response.json().get('list')
    for item in lists:
        json_str = json.dumps(item, ensure_ascii=False)
        logger.info(json_str)
        try:
            WeChatUserDao.create_by_json(item)
            # TODO: trigger the task that crawls this account's article URLs,
            # keyed by the account's fakeid (i.e. its biz)
            logger.info("Start crawling official account: " + item.get('nickname'))
            wechat_crawl.excute_wechat_user_crawl_task(item.get('fakeid'))
        except Exception as e:
            logger.error("Failed to save official account: " + json_str)
            logger.error(e)
    # Rest for a while after finishing
    random_time = random.randint(20, 50)
    time.sleep(random_time)
def put(self, item):
    json_str = json.dumps(item, ensure_ascii=False, cls=CJsonEncoder)
    crawler.info(json_str)
    return item
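# CJsonEncoder is referenced by put() above but not defined in this section.
# A minimal sketch, assuming its purpose is to make datetime/date fields
# JSON-serialisable (plain json.dumps cannot handle them); the exact output
# format is an assumption.
import datetime
import json


class CJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        if isinstance(obj, datetime.date):
            return obj.strftime('%Y-%m-%d')
        return json.JSONEncoder.default(self, obj)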
def get_article(article_url):
    is_crawled = Urls.is_crawled_url(article_url)
    if is_crawled == 1:
        logger.info("ignore crawled page : " + article_url)
        return
    logger.info("crawling page : " + article_url)
    response = requests.get(article_url, headers=header_wechat, timeout=(3, 5))
    soup = BeautifulSoup(response.content, 'lxml')
    html_str = str(soup)
    try:
        # article = soup.find('div', class_='rich_media')
        # title = article.find('h2').get_text()
        # meta_content = soup.find(id='meta_content')
        # time_str = meta_content.find(id='post-date').get_text()
        # nickname = meta_content.find_all('em')[1].get_text()
        # Extract the shared content first; if absent, fall back to the main body
        # (for a shared article the content lives in js_share_content and js_content is empty)
        content_div = soup.find(id='js_share_content')
        if content_div is None:
            content_div = soup.find(id='js_content')
        # content = content_div.get_text()
        content = str(content_div)
        msg_title = re.search('(var msg_title = ")(.*)"', html_str).group(2)
        nickname = re.search('(var nickname = ")(.*)"', html_str).group(2)
        alias = re.search('(var user_name = ")(.*)"', html_str).group(2)
        publish_timestamp = re.search('(var ct = ")(.*)"', html_str).group(2)
        publish_time = timestamp_datetime(publish_timestamp, type='s')
        # publish_time = time.localtime(int(publish_time_long))
        # publish_time = (re.search('(var publish_time = ")(.*)" ', html_str).group(2))
        round_head_img = re.search('(var round_head_img = ")(.*)"', html_str).group(2)
        ori_head_img_url = re.search('(var ori_head_img_url = ")(.*)"', html_str).group(2)
        msg_desc = re.search('(var msg_desc = ")(.*)"', html_str).group(2)
        msg_source_url = re.search('(var msg_source_url = \')(.*)\'', html_str).group(2)
        if msg_title:
            item = WeChatData()
            item.url = response.url
            item.title = msg_title
            item.nickname = nickname
            item.alias = alias
            item.pub_time = publish_time
            item.round_head_img = round_head_img
            item.ori_head_img_url = ori_head_img_url
            item.msg_desc = msg_desc
            item.msg_source_url = msg_source_url
            item.content = content
            # Process the article
            try:
                dic = class_to_dict(item)
                del dic['_state']
                dic['date_modified'] = dic['date_created'] = timestamp_datetime(time.time(), type='s')
                # json_str = json.dumps(dic, ensure_ascii=False)
                # Send to Kafka first (it must go to Kafka before saving,
                # otherwise the datetime format makes the send fail)
                MyKafkaProducer.get_instance().send(dic)
                # Save to the database
                WeChatData.save(item)
                Urls.store_crawled_url(article_url)
            except Exception as err:
                logger.error("Exception while saving WeChat article:")
                logger.error(err)
                Urls.store_save_failed_url(article_url)
            # dic = class_to_dict(item)
            # urls_file = open(url_save_path + '/articles.txt', 'a+', encoding='utf-8')
            # urls_file.writelines(str(dic) + '\n')
            # urls_file.close()
    except Exception as e:
        Urls.store_crawl_failed_url(article_url)
        logger.error(e)


# if __name__ == '__main__':
#     share_article_url = "https://mp.weixin.qq.com/s?__biz=MzA5MzIwNDk5Ng==&mid=2649719243&idx=2&sn=175caa81361cfea8acbb5ead4211b5b6&chksm=887a46a0bf0dcfb6d4c6c4e12eea067446d7e6035e45404d9a0666ab66822e0f7027e37cbb3e#rd"
#     article_url = 'http://mp.weixin.qq.com/s?__biz=MzA3NjcwNjgyOQ==&mid=204671295&idx=3&sn=731238aecce2c638c3c1157092650e18#rd'
#     get_article(article_url)
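# timestamp_datetime is used by get_article above but not defined in this
# section. A minimal sketch of the assumed behaviour: convert a Unix timestamp
# (int, float or numeric string; interpreted as seconds when type='s') into a
# 'YYYY-MM-DD HH:MM:SS' string. The handling of other 'type' values is a guess.
def timestamp_datetime(value, type='s'):
    seconds = float(value)
    if type != 's':
        # Assume milliseconds when the value is not given in seconds
        seconds = seconds / 1000.0
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(seconds))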
def get_wechat_user_urls(wechat_user_id, name_cookies):
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    response = requests.get(url=url, cookies=cookies)
    response_list = re.findall(r'token=(\d+)', str(response.url))
    if len(response_list):
        token = response_list[0]
        query = wechat_user_id
        crawler.info('Using account [' + login_user + '] to crawl official account =========>: ' + query)
        urls_file = open(url_save_path + '/urls_' + query + '.txt', 'a+', encoding='utf-8')
        query_id = {
            'action': 'search_biz',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'query': query,
            'begin': '0',
            'count': '5',
        }
        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
        lists = search_response.json().get('list')
        if len(lists) == 0:
            crawler.info("Official account not found: " + query)
        else:
            fakeid = lists[0].get('fakeid')
            query_id_data = {
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'action': 'list_ex',
                'begin': '0',
                'count': '5',
                'query': '',
                'fakeid': fakeid,
                'type': '9'
            }
            appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
            appmsg_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
            max_num = appmsg_response.json().get('app_msg_cnt')
            num = int(int(max_num) / 5)
            begin = 0
            while num + 1 > 0:
                query_id_data = {
                    'token': token,
                    'lang': 'zh_CN',
                    'f': 'json',
                    'ajax': '1',
                    'random': random.random(),
                    'action': 'list_ex',
                    'begin': str(begin),
                    'count': '5',
                    'query': '',
                    'fakeid': fakeid,
                    'type': '9'
                }
                crawler.info('Paging ################### begin=' + str(begin))
                try:
                    query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
                    fakeid_list = query_fakeid_response.json().get('app_msg_list')
                    for item in fakeid_list:
                        crawler.info(item.get('link'))
                        urls_file.write(item.get('link') + '\n')
                        urls_file.flush()
                    num -= 1
                    begin = int(begin)
                    begin += 5
                    time.sleep(random.randint(10, 15))
                except Exception:
                    crawler.error('Crawl error!!!!!!!')
            urls_file.close()
            crawler.info('Finished crawling official account =========>: ' + query)
            WeChatUserDao.set_history_crawled(query)
    else:
        crawler.info(login_user + ' cookie expired, please log in again; deleting it...')
        Cookies.delete_cookies(login_user)
        crawler.info(login_user + ' cookie deleted.')