def deal_comment(self, req_url, text):
    data = tools.get_json(text)

    __biz = tools.get_param(req_url, '__biz')
    comment_id = tools.get_param(req_url, 'comment_id')  # associates the comments with the article

    elected_comment = data.get('elected_comment', [])
    comment_datas = [
        dict(
            __biz=__biz,
            comment_id=comment_id,
            nick_name=comment.get('nick_name'),
            logo_url=comment.get('logo_url'),
            content=comment.get('content'),
            create_time=tools.timestamp_to_date(comment.get('create_time')),
            content_id=comment.get('content_id'),
            like_num=comment.get('like_num'),
            is_top=comment.get('is_top'),
            spider_time=tools.get_current_date()
        )
        for comment in elected_comment
    ]

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
def deal_comment(reply):
    if not reply:
        return

    comment_id = reply.get('id')
    pre_id = reply.get('replyId')
    content = reply.get('content')
    article_id = reply.get('mainContentId')
    release_time = reply.get('addTime')
    release_time = tools.timestamp_to_date(release_time)
    head_url = reply.get('userInfo', {}).get('icon')
    consumer = reply.get('userInfo', {}).get('uname')
    gender = int(reply.get('userInfo', {}).get('gender'))
    up_count = reply.get('likes')
    emotion = random.randint(0, 2)  # TODO
    hot_id = comment_id

    log.debug('''
        评论id:   %s
        父id      %s
        文章id    %s
        发布人:   %s
        头像地址  %s
        性别      %s
        内容:     %s
        点赞量    %s
        发布时间  %s
        ''' % (comment_id, pre_id, article_id, consumer, head_url, gender,
               content, up_count, release_time))

    return self_base_parser.add_comment(comment_id, pre_id, article_id, consumer,
                                        head_url, gender, content, up_count,
                                        release_time, emotion, hot_id)
def deal_comment(self, req_url, text):
    """
    Parse comments
    :param req_url:
    :param text:
    :return:
    """
    data = tools.get_json(text)

    __biz = tools.get_param(req_url, "__biz")
    comment_id = tools.get_param(req_url, "comment_id")  # associates the comments with the article

    elected_comment = data.get("elected_comment", [])
    comment_datas = [
        dict(
            __biz=__biz,
            comment_id=comment_id,
            nick_name=comment.get("nick_name"),
            logo_url=comment.get("logo_url"),
            content=comment.get("content"),
            create_time=tools.timestamp_to_date(comment.get("create_time")),
            content_id=comment.get("content_id"),
            like_num=comment.get("like_num"),
            is_top=comment.get("is_top"),
            spider_time=tools.get_current_date(),
        )
        for comment in elected_comment
    ]

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
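All of these examples lean on a project-level helper, tools.timestamp_to_date, to turn a Unix timestamp into a date string. Its actual implementation is not shown here; the following is only a minimal sketch of what such a helper might look like, assuming it takes a timestamp in seconds plus an optional format string (the default format and the None handling are assumptions, not the library's real behavior).

import time

def timestamp_to_date(timestamp, time_format='%Y-%m-%d %H:%M:%S'):
    # Hypothetical sketch: convert a Unix timestamp (seconds) to a formatted date string.
    if timestamp is None:
        return None
    return time.strftime(time_format, time.localtime(int(timestamp)))

# e.g. timestamp_to_date(1511354167) -> '2017-11-22 20:36:07' when run in a UTC+8 timezone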
def is_have_new_article(self, account_id='', account=''):
    '''
    @summary: Check whether the official account has published an article today
    ---------
    @param account_id:
    @param account:
    ---------
    @result:
    '''
    account_block = self.__get_account_blocks(account_id, account)
    if account_block == constance.VERIFICATION_CODE:
        return constance.VERIFICATION_CODE

    regex = "timeConvert\('(\d*?)'\)"
    release_time = tools.get_info(account_block, regex, fetch_one=True)
    if release_time:
        release_time = int(release_time)
        release_time = tools.timestamp_to_date(release_time)
        log.debug("最近发文时间 %s" % release_time)

        if release_time >= tools.get_current_date('%Y-%m-%d'):
            return constance.UPDATE
        else:
            return constance.NOT_UPDATE
    else:
        return constance.ERROR
def __open_next_page(self):
    '''
    @summary: Jump to the history-articles page
    ---------
    @param __biz:
    @param pass_ticket:
    @param appmsg_token:
    @param offset:
    ---------
    @result:
    '''
    is_done = False  # whether one full round is finished
    is_all_done = False  # whether everything is done (today's posts from all accounts have been collected)
    if WechatAction._todo_urls:
        url = WechatAction._todo_urls.popleft()
    else:
        # one account finished; update its article count
        WechatAction._wechat_service.update_account_article_num(
            WechatAction._current_account_biz)

        # jump to the next account
        account_id, __biz, is_done, is_all_done = WechatAction._wechat_service.get_next_account()
        WechatAction._account_info[__biz] = account_id or ''

        # url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=%s#wechat_webview_type=1&wechat_redirect' % __biz
        url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
        log.debug('''
            下一个公众号 : %s
            ''' % url)

    # inject a js snippet to redirect automatically
    if is_all_done:  # today's articles are all crawled; continue the next day
        # sleep until the next day
        sleep_time = self.get_next_day_time_interval()
    elif is_done:  # one round finished; take a rest
        sleep_time = self.get_wait_time()
    elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date(
            "%Y-%m-%d") + ' ' + SPIDER_START_TIME:
        # only today's articles are wanted and the current time is before the configured start
        # time, so rest instead of crawling, since accounts rarely publish in the small hours
        sleep_time = self.get_spider_start_time_interval()
    else:  # one article finished; wait a while
        sleep_time = self.get_sleep_time()

    log.debug('''
        next_page_url : %s
        is_done: %s
        is_all_done: %s
        sleep_time: %s
        next_start_time %s
        ''' % (url, is_done, is_all_done,
               tools.seconds_to_h_m_s(sleep_time / 1000),
               tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)))

    next_page = "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        url, sleep_time)

    return next_page
def deal_article(self, req_url, text):
    """
    Parse an article
    :param req_url:
    :param text:
    :return:
    """
    sn = tools.get_param(req_url, "sn")

    if not text:
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    content = selector.xpath(
        '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
    ).extract_first(default="")
    title = (selector.xpath('//h2[@class="rich_media_title"]/text()')
             .extract_first(default="").strip())
    account = (selector.xpath('//a[@id="js_name"]/text()')
               .extract_first(default="").strip())
    author = (selector.xpath(
        '//span[@class="rich_media_meta rich_media_meta_text"]//text()')
        .extract_first(default="").strip())

    publish_timestamp = selector.re_first('n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = (tools.timestamp_to_date(publish_timestamp)
                    if publish_timestamp else None)

    biz = tools.get_param(req_url, "__biz")

    text = remove_tags(content).strip()
    spider_name = 'wechat'
    collection_mode = 'spider'
    data_source_type = '微信公众号'

    article_data = {
        "data_type": account,
        "title": title,
        "data_address": req_url,
        "author": author,
        "publish_time": publish_time,
        "__biz": biz,
        "text": text,
        "spider_name": spider_name,
        "collection_mode": collection_mode,
        "data_source_type": data_source_type,
        "sn": sn,
        "collection_time": tools.get_current_date(),
    }

    # save to the database
    if article_data and data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def get_task(self, url=None, tip=''):
    """
    Get a task
    :param url: when a url is given, return a task wrapped around that url; otherwise take an
                account task first, then an article task. If neither exists, sleep for a while
                before trying again.
    :return:
    """
    sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max)

    if not url:
        account_task = self.get_account_task()
        if account_task:
            __biz = account_task.get('__biz')
            last_publish_time = account_task.get('last_publish_time')
            self.record_last_article_publish_time(__biz, last_publish_time)
            tip = '正在抓取列表'
            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz)
        else:
            article_task = self.get_article_task()
            if article_task:
                tip = '正在抓取详情'
                url = article_task.get('article_url')
            else:
                sleep_time = config.get('spider').get('no_task_sleep_time')
                log.info('暂无任务 休眠 {}s'.format(sleep_time))
                tip = '暂无任务 '

    if url:
        next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
            tip=tip and tip + ' ',
            sleep_time=sleep_time,
            begin_spider_time=tools.timestamp_to_date(
                tools.get_current_timestamp() + sleep_time),
            url=url,
            sleep_time_msec=sleep_time * 1000)
    else:
        next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
            tip=tip and tip + ' ',
            sleep_time=sleep_time,
            begin_spider_time=tools.timestamp_to_date(
                tools.get_current_timestamp() + sleep_time),
            sleep_time_msec=sleep_time * 1000)

    return next_page
def __open_next_page(self):
    '''
    @summary: Jump to the history-articles page
    ---------
    @param __biz:
    @param pass_ticket:
    @param appmsg_token:
    @param offset:
    ---------
    @result:
    '''
    is_done = False  # whether one full round is finished
    url = None

    while WechatAction._todo_urls:
        result = WechatAction._todo_urls.popleft()
        if callable(result):  # a callback marking the account as finished
            result()  # run the callback
        else:
            url = result
            break

    if not url:
        # jump to the next account
        account = WechatAction._wechat_service.get_next_account()
        if account:
            account_id, __biz = account
            WechatAction._account_info[__biz] = account_id or ''

            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
            log.debug('''
                下一个公众号 : %s
                ''' % url)
        else:
            is_done = True

    # inject a js snippet to redirect automatically
    if is_done:  # one round finished; take a rest
        sleep_time = self.get_wait_time()
    elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
        # only today's articles are wanted and the current time is before the configured start
        # time, so rest instead of crawling, since accounts rarely publish in the small hours
        sleep_time = self.get_spider_start_time_interval()
    else:  # one article finished; wait a while
        sleep_time = self.get_sleep_time()

    tip_sleep_time = tools.seconds_to_h_m_s(sleep_time / 1000)
    tip_next_start_time = tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)

    if not url:
        url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(tip_sleep_time, tip_next_start_time)

    log.debug('''
        next_page_url : %s
        is_done: %s
        sleep_time: %s
        next_start_time %s
        ''' % (url, is_done, tip_sleep_time, tip_next_start_time))

    next_page = "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        tip_sleep_time, tip_next_start_time, url, sleep_time)

    return next_page
def parser_next_page_article(video_id, wall_id, feed_id, sns_time, url):
    article_json_url = 'http://api-t.iqiyi.com/feed/get_feeds?authcookie=&device_id=pc_web&m_device_id=a11e6ea94270eaaa0b46be30af84fc54&agenttype=118&wallId={wall_id}&feedTypes=1%2C7%2C8%2C9&count=20&top=1&hasRecomFeed=1&feedId={feed_id}&needTotal=1&notice=1&version=1&upOrDown=1&snsTime={sns_time}&_={timestamp_m}'.format(
        wall_id=wall_id, feed_id=feed_id, sns_time=sns_time,
        timestamp_m=int(tools.get_current_timestamp() * 1000))
    print(article_json_url)

    article_json = tools.get_json_by_requests(article_json_url)
    wall_id = article_json.get('data', {}).get('wallId')

    # feed list
    feeds = article_json.get('data', {}).get('feeds', [])
    for feed in feeds:
        article_id = feed.get('commentId')
        head_url = feed.get('icon')
        name = feed.get('name')
        release_time = feed.get('releaseDate')
        release_time = tools.timestamp_to_date(release_time)
        title = feed.get('feedTitle')
        content = feed.get('description')
        image_urls = ','.join([img.get('url') for img in feed.get('pictures', [])])  # comma separated
        watch_count = feed.get('uvCount')
        up_count = feed.get('agreeCount')
        comment_count = feed.get('commentCount')

        log.debug('''
            id:       %s
            节目id    %s
            头像地址: %s
            名字:     %s
            发布时间: %s
            标题:     %s
            内容:     %s
            图片地址: %s
            观看量:   %s
            点赞量:   %s
            评论量:   %s
            ''' % (article_id, video_id, head_url, name, release_time, title,
                   content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time, title,
                                        content, image_urls, watch_count, up_count,
                                        comment_count, program_id=video_id,
                                        gender=random.randint(0, 1), url=url, info_type=3,
                                        emotion=random.randint(0, 2), collect=0,
                                        source='爱奇艺'):
            # parse the comments
            parser_comment(article_id, wall_id)
        else:
            break
    else:
        if feeds:
            feed_id = feeds[-1].get('feedId')
            sns_time = feeds[-1].get('snsTime')
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)
def get_article_release_time(self, account_id='', account=''):
    account_block = self.__get_account_blocks(account_id, account)
    if account_block == constance.VERIFICATION_CODE:
        return constance.VERIFICATION_CODE

    regex = "timeConvert\('(\d*?)'\)"
    release_time = tools.get_info(account_block, regex, fetch_one=True)
    if release_time:
        release_time = int(release_time)
        release_time = tools.timestamp_to_date(release_time)

    return release_time
def deal_article(self, req_url, text):
    sn = tools.get_param(req_url, 'sn')

    if not text:
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    content = selector.xpath('//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]')
    title = selector.xpath('//h2[@class="rich_media_title"]/text()').extract_first(default='').strip()
    account = selector.xpath('//a[@id="js_name"]/text()').extract_first(default='').strip()
    author = selector.xpath('//span[@class="rich_media_meta rich_media_meta_text"]//text()').extract_first(default='').strip()

    publish_timestamp = selector.re_first('n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None

    pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()
    biz = tools.get_param(req_url, '__biz')
    digest = selector.re_first('var msg_desc = "(.*?)"')
    cover = selector.re_first('var cover = "(.*?)";') or selector.re_first('msg_cdn_url = "(.*?)"')
    source_url = selector.re_first("var msg_source_url = '(.*?)';")
    content_html = content.extract_first(default='')
    comment_id = selector.re_first('var comment_id = "(\d+)"')

    article_data = {
        'account': account,
        'title': title,
        'url': req_url,
        'author': author,
        'publish_time': publish_time,
        '__biz': biz,
        'digest': digest,
        'cover': cover,
        "pics_url": pics_url,
        "content_html": content_html,
        "source_url": source_url,
        "comment_id": comment_id,
        "sn": sn,
        "spider_time": tools.get_current_date()
    }

    # save to the database
    if article_data and data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def get_url(time_lenght=60):
    '''
    @summary:
    ---------
    @param time_lenght: time window, in minutes
    ---------
    @result:
    '''
    current_date = tools.get_current_date()
    per_date = tools.read_file(STO_PER_SYNC_TIME) or tools.timestamp_to_date(
        tools.get_current_timestamp() - time_lenght * 60)
    tools.write_file(STO_PER_SYNC_TIME, current_date)

    root_url = 'http://192.168.60.38:8001/hotspot_al/interface/getCluesDataSearchInfo?pageNo=%d&pageSize=100&updateSTime={per_date}&updateETime={current_date}&sort=5&isDesc=0'.format(
        per_date=per_date, current_date=current_date)
    return root_url
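get_url keeps its incremental window by persisting the last sync time to a checkpoint file through tools.read_file and tools.write_file. Those helpers are not shown in these examples; below is only a minimal sketch of what they could plausibly do, assuming a plain text file holding the last sync date (the encoding and the None-on-missing behavior are assumptions).

import os

def read_file(path):
    # Hypothetical sketch of the assumed tools.read_file: return the file content, or None if absent/empty.
    if not os.path.exists(path):
        return None
    with open(path, encoding='utf-8') as f:
        return f.read().strip() or None

def write_file(path, content):
    # Hypothetical sketch of the assumed tools.write_file: overwrite the checkpoint file.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(content))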
def parse_article_info(article_info, comm_msg_info):
    if not article_info:
        return

    # log.debug(tools.dumps_json(article_info))

    title = article_info.get("title")
    digest = article_info.get("digest")
    url = article_info.get("content_url").replace("\\", "").replace("amp;", "")
    source_url = article_info.get("source_url").replace("\\", "")  # link of the cited source article
    cover = article_info.get("cover").replace("\\", "")
    subtype = article_info.get("subtype")
    is_multi = article_info.get("is_multi")
    author = article_info.get("author")
    copyright_stat = article_info.get("copyright_stat")
    duration = article_info.get("duration")
    del_flag = article_info.get("del_flag")
    type = comm_msg_info.get("type")
    publish_time = tools.timestamp_to_date(comm_msg_info.get("datetime"))
    sn = tools.get_param(url, "sn")

    if sn:
        # cache the article info
        article_data = {
            "title": title,
            "digest": digest,
            "url": url,
            "source_url": source_url,
            "cover": cover,
            "subtype": subtype,
            "is_multi": is_multi,
            "author": author,
            "copyright_stat": copyright_stat,
            "duration": duration,
            "del_flag": del_flag,
            "type": type,
            "publish_time": publish_time,
            "sn": sn,
            "__biz": __biz,
            "spider_time": tools.get_current_date(),
        }

        return article_data
def is_have_new_article(self, __biz):
    '''
    @summary: Check whether the official account has published an article today
    ---------
    @param __biz:
    ---------
    @result:
    '''
    log.debug('search keywords ' + __biz)

    url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
    params = {
        "lang": "zh_CN",
        "token": TOOKEN,
        "query": "",
        "f": "json",
        "count": "5",
        "action": "list_ex",
        "ajax": "1",
        "type": "9",
        "fakeid": __biz,
        "random": str(random.random()) + str(random.randint(1, 9)),
        "begin": "0"
    }
    articles_json = tools.get_json_by_requests(url, params=params, headers=HEADERS)
    # print(articles_json)
    # when the token has expired the API returns {'base_resp': {'err_msg': 'invalid csrf token', 'ret': 200040}}

    article_list = articles_json.get('app_msg_list', [])
    for article in article_list:
        release_time = article.get('update_time')
        release_time = tools.timestamp_to_date(release_time)
        log.debug("最近发文时间 %s" % release_time)

        if release_time >= tools.get_current_date('%Y-%m-%d'):
            return constance.UPDATE
        else:
            return constance.NOT_UPDATE
    else:
        return constance.ERROR
def get_wait_check_account(self):
    '''
    @summary:
    ---------
    @param :
    ---------
    @result:
    '''
    # Take accounts that have been crawled and whose latest publish time is more than
    # two hours ago, then check again whether they have published new articles
    before_tow_hours = tools.timestamp_to_date(
        tools.get_current_timestamp() - 60 * 60 * 2)
    sql = '''
        select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
        from TAB_IOPM_SITE t
        where t.biz is not null
              and mointor_status = 701
              and t.spider_status = 603
              and (t.last_article_release_time is null or t.last_article_release_time <= to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
    '''.format(before_tow_hours)

    accounts = self._oracledb.find(sql)

    # If no account has finished crawling and redis holds no crawl task, the non-603 tasks in
    # the database may be lost tasks and need to be dispatched again
    if not accounts and not self._redisdb.sget_count('wechat:account'):
        sql = '''
            select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
            from TAB_IOPM_SITE t
            where t.biz is not null
                  and mointor_status = 701
                  and t.spider_status != 603
        '''
        accounts = self._oracledb.find(sql)

    return accounts
def parse_article_info(article_info, comm_msg_info):
    if not article_info:
        return

    # log.debug(tools.dumps_json(article_info))

    title = article_info.get('title')
    digest = article_info.get('digest')
    url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
    source_url = article_info.get('source_url').replace('\\', '')  # link of the cited source article
    cover = article_info.get('cover').replace('\\', '')
    subtype = article_info.get('subtype')
    is_multi = article_info.get('is_multi')
    author = article_info.get('author')
    copyright_stat = article_info.get('copyright_stat')
    duration = article_info.get('duration')
    del_flag = article_info.get('del_flag')
    type = comm_msg_info.get('type')
    publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
    sn = tools.get_param(url, 'sn')

    if sn:
        # cache the article info
        article_data = {
            'title': title,
            'digest': digest,
            'url': url,
            'source_url': source_url,
            'cover': cover,
            'subtype': subtype,
            'is_multi': is_multi,
            'author': author,
            'copyright_stat': copyright_stat,
            'duration': duration,
            'del_flag': del_flag,
            'type': type,
            'publish_time': publish_time,
            'sn': sn,
            '__biz': __biz,
            'spider_time': tools.get_current_date()
        }

        return article_data
def parser(url_info):
    # url = 'http://user.xiaoyouzb.net/v3/vod/small_recommend?nwtime=1571816563&sign=883f96aee2655d8885e7815de3423df7&type=1&cateId=13&pageNum=0&isFirst=N&_u=edac2c15598946bd9ba7bda78a83489c&version=4.7.0&platform=android&appx=yuntu&apppn=org.fungo.fungolive&enterprise=0&channel=tencent&market=32&os_version=8.0.0&device_model=MIX%25202&device_code=780493075490198&udid=77e2cb72797f20afdcaaa6265872cea9&androidId=220240afd2e0e640&source=android'
    root_url = url_info['url']
    cname = url_info['remark']["category_name"]
    headers = {
        "User-Agent": "yuntutv/4.7.0 (Android 8.0.0)",
        "Host": "user.xiaoyouzb.net"
    }
    json_data = tools.get_json_by_requests(root_url, headers=headers)
    data_infos = json_data["data"]
    for data_info in data_infos:
        publishTime = data_info["publishTime"]
        release_time = tools.timestamp_to_date(str(publishTime)[:-3])
        title = data_info["content"]
        content = data_info["content"]
        video_url = data_info["videoUrl"]
        img_url = data_info["coverUrl"]

        base_parser.save_info('content_info',
                              site_id=SITE_ID,
                              url=video_url,
                              title=title,
                              site_name=NAME,
                              content=content,
                              release_time=release_time,
                              image_url=img_url,
                              video_url=video_url,
                              is_out_link=1,
                              download_image=False,
                              is_debug=False,
                              )

    base_parser.update_url('urls', root_url, Constance.DONE)
def monitor_cookies(self):
    '''
    @summary: Monitor and manage cookies
     1. delete useless cookies: unavailable more than the maximum allowed number of times
     2. mark cookies that have been idle for 24 hours as available again
    ---------
    ---------
    @result:
    '''
    # delete useless cookies
    sql = 'delete from sogou_cookies where un_available_times > %d' % MAX_UN_AVAILABLE_TIMES
    self._sqlite3db.delete(sql)

    # mark cookies that have been idle for 24 hours as available again
    sql = '''
        update sogou_cookies set is_available = 1
        where un_available_time < '%s'
    ''' % (tools.timestamp_to_date(tools.get_current_timestamp() - 24 * 60 * 60))
    self._sqlite3db.update(sql)
def parser(url_info):
    root_url = url_info['url']
    para = url_info["remark"]["para_template"]
    headers = url_info["remark"]["header_template"]
    response = requests.get(root_url, params=para, headers=headers)
    time.sleep(2)
    json_info = response.json()
    cate = url_info["remark"]["cate_name"]
    data_jsons = jsonpath(json_info, "$..items..data")
    if cate != '':
        for data_info in data_jsons:
            data_json = json.loads(data_info)
            title = jsonpath(data_json, "$..title")[0]
            img_str = glom(data_json, "coverUrl")
            img_json = json.loads(img_str)
            img_url = img_json["L"][0]
            content = jsonpath(data_json, "$..summary")[0]
            updateTime = jsonpath(data_json, "$..updateTime")[0]
            video_str = glom(data_json, "videoUrl")
            video_json = json.loads(video_str)
            video_url = video_json["source"]["hd"]
            release_time = tools.timestamp_to_date(str(updateTime)[:-3])

            base_parser.save_info(
                'content_info',
                site_id=SITE_ID,
                url=video_url,
                title=title,
                site_name=NAME,
                content=content,
                release_time=release_time,
                image_url=img_url,
                video_url=video_url,
                is_out_link=1,
                download_image=False,
                is_debug=False,
            )

    base_parser.update_url('urls', root_url, Constance.DONE)
def extract_info(json_data):
    try:
        data_infos = json_data["Data"]["List"]
        for data_info in data_infos:
            data = data_info["Data"]
            title = data["Title"]
            content = data["Summary"]
            img_url = data["Photo"]
            url = data["Href"]
            print(url)
            video_url = data["VideoUrl"]
            time_regx = r"/(\d+).shtml"
            time_str = tools.get_info(url, time_regx, fetch_one=True)
            release_time = tools.timestamp_to_date(time_str[:-3])
            like_count = data_info["LikeCount"]
            comment_cnt = data_info["CommentCnt"]
            # print(title)
            # print(release_time)
            # print(content)
            # print(img_url)
            # print(video_url)
            # print(like_count)
            # print(comment_cnt)
            data_info = {
                "url": url,
                "title": title,
                "release_time": release_time,
                "content": content,
                "img_url": img_url,
                "video_url": video_url,
                "like_count": like_count,
                "comment_cnt": comment_cnt
            }
            save_info(data_info)
    except Exception as e:
        print(e)
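Several of these parsers receive millisecond-precision timestamps and normalize them before calling tools.timestamp_to_date, either by slicing off the last three digits of the string form (str(publishTime)[:-3]) or by dividing by 1000. A small self-contained sketch of that normalization is shown below; the helper name is made up for illustration and is not part of the original codebase.

def to_seconds(timestamp):
    # Hypothetical helper: normalize a 13-digit millisecond timestamp to seconds.
    timestamp = int(timestamp)
    if timestamp > 10 ** 12:  # 13 digits -> value is in milliseconds
        timestamp //= 1000
    return timestamp

# to_seconds(1571816563000) == to_seconds(1571816563) == 1571816563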
def __parse_article_list(self, article_list, __biz, is_first_page=False):
    '''
    @summary: Parse the article list
    ---------
    @param article_list: article list info, str
    ---------
    @result: True / None (True: keep crawling further back; None: stop crawling)
    '''
    # log.debug(tools.dumps_json(article_list))

    # parse the article info contained in the json
    def parse_article_info(article_info, comm_msg_info):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get('title')
        digest = article_info.get('digest')
        url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
        source_url = article_info.get('source_url').replace('\\', '')  # link of the cited source article
        cover = article_info.get('cover').replace('\\', '')
        subtype = article_info.get('subtype')
        is_multi = article_info.get('is_multi')
        author = article_info.get('author')
        copyright_stat = article_info.get('copyright_stat')
        duration = article_info.get('duration')
        del_flag = article_info.get('del_flag')
        type = comm_msg_info.get('type')
        publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
        sn = tools.get_param(url, 'sn')

        if sn:
            # cache the article info
            article_data = {
                'title': title,
                'digest': digest,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'subtype': subtype,
                'is_multi': is_multi,
                'author': author,
                'copyright_stat': copyright_stat,
                'duration': duration,
                'del_flag': del_flag,
                'type': type,
                'publish_time': publish_time,
                'sn': sn,
                '__biz': __biz,
                'spider_time': tools.get_current_date()
            }

            return article_data

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)

    article_list_data = []
    publish_time = None
    is_need_get_more = True

    article_list = article_list.get('list', [])
    is_first_article = True
    for article in article_list:
        comm_msg_info = article.get('comm_msg_info', {})

        publish_timestamp = comm_msg_info.get('datetime')
        publish_time = tools.timestamp_to_date(publish_timestamp)

        # record the latest publish time
        if is_first_page and is_first_article:
            self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
            is_first_article = False

            if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):
                # the first page carries the latest article; if nothing has been published
                # recently, treat the account as a zombie account
                log.info('公众号 {} 为僵尸账号 不再监控'.format(__biz))
                self._task_manager.sign_account_is_zombie(__biz, publish_time)
                is_need_get_more = False
                break

        # compare times; if the publish time from the previous run has been reached, stop
        is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
        if is_reach:
            log.info('采集到上次发布时间 公众号 {} 采集完成'.format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif is_reach is None:
            log.info('公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号'.format(__biz))
            return

        article_type = comm_msg_info.get('type')
        if article_type != 49:
            # type 49 is the common image-text message; text, voice and video messages
            # have inconsistent formats and are not collected here
            continue

        # check whether the publish time falls inside the crawl time range
        publish_time_status = self._task_manager.is_in_crawl_time_range(publish_time)
        if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
            log.info('公众号 {} 超过采集时间范围 采集完成'.format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
            log.info('公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集'.format(__biz, publish_time))
            continue

        # inside the time range
        # an official account can publish several image-text messages at once
        # the first image-text message
        app_msg_ext_info = article.get('app_msg_ext_info', {})
        article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
        if article_data:
            article_list_data.append(article_data)

        # image-text messages attached on the same day
        multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
        for multi_app_msg_item in multi_app_msg_item_list:
            article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

    if article_list_data:
        data_pipeline.save_article_list(article_list_data)

    if is_need_get_more:
        return publish_time
def monitor_task():
    task_manager = TaskManager()
    total_time = 0

    task_count = 0
    begin_time = None
    end_time = None
    spend_hours = None

    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            total_time = 0
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # one round is finished; gather some statistics
            if begin_time:
                # elapsed time
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # url counts
                depth_count_info = task_manager.get_ever_depth_count(5)

                # article counts
                article_count_msg = statistic_article_count.get_article_count_msg(begin_time, end_time)

                log.info('''
                    ------- 已做完一轮 --------
                    \r开始时间:%s
                    \r结束时间:%s
                    \r耗时:%s
                    \r网站数量:%s
                    \rurl数量信息:%s
                    \r文章数量信息:%s
                    ''' % (begin_time, end_time, spend_hours, task_count,
                           tools.dumps_json(depth_count_info), article_count_msg))

            # delete the url fingerprints
            log.info('删除url指纹...')
            task_manager.clear_task()

            log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' % (total_time, MAX_NULL_TASK_TIME))

            # fetch tasks
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
            else:
                log.error('未从oracle中取到任务')
def parser(url_info):
    url = url_info['url']

    list_datas = tools.get_json_by_requests(url)
    list_datas = list_datas['list']
    for list_data in list_datas:
        title = list_data['title']
        watched_count = list_data['playsCounts']
        image_url = list_data['coverLarge']
        comment_count = list_data['commentsCount']
        charge_type = list_data['priceTypeId']
        is_finished = list_data['isFinished']
        article_type = list_data['tags']
        origin = list_data['provider']
        episodes = list_data['tracks']
        # uid = list_data['uid']
        author = list_data['nickname']
        album_id = list_data['albumId']
        abstract = list_data['intro']
        score = tools.get_json_value(list_data, 'score')
        # id = list_data['id']

        new_url_2 = 'http://mobile.ximalaya.com/mobile/v1/album/rich?albumId=%s' % album_id
        list_datas_2 = tools.get_json_by_requests(new_url_2)
        content = tools.get_json_value(list_datas_2, 'data.album.intro')
        release_time = tools.get_json_value(list_datas_2, 'data.album.createdAt')
        release_time = tools.timestamp_to_date(release_time / 1000)
        update_time = tools.get_json_value(list_datas_2, 'data.album.lastUptrackAt')
        update_time = tools.timestamp_to_date(update_time / 1000)
        subscribe_count = tools.get_json_value(list_datas_2, 'data.album.subscribeCount')

        new_url_3 = 'http://mobile.ximalaya.com/mobile/v1/album/track?albumId=%s&device=android&isAsc=true&pageId=1&' \
                    'pageSize=5000&pre_page=1' % album_id
        list_datas_3 = tools.get_json_by_requests(new_url_3)
        lists = tools.get_json_value(list_datas_3, 'data.list')

        log.debug('''
            书名:           %s
            作品类型:       %s
            集数:           %s
            评分:           %s (免费作品均无评分)
            订阅数:         %s
            作者:           %s
            创建时间:       %s
            最近更新日期:   %s
            贴图:           %s
            播放次数:       %s
            评论数:         %s (免费作品均无评论)
            收费类型:       %s (0:免费,1:单期购买, 2:全集购买)
            是否完结:       %s (0、1:未完结, 2:完结)
            提供者:         %s
            简介:           %s
            完整介绍:       %s
            ''' % (title, article_type, episodes, score, subscribe_count, author,
                   release_time, update_time, image_url, watched_count, comment_count,
                   charge_type, is_finished, origin, abstract, content))

        content_id = base_parser.add_wp_content_info(
            'WP_content_info', SITE_ID, title=title, article_type=article_type,
            episodes=episodes, score=score, subscribe_count=subscribe_count,
            author=author, release_time=release_time, update_time=update_time,
            image_url=image_url, watched_count=watched_count,
            comment_count=comment_count, charge_type=charge_type,
            is_finished=is_finished, origin=origin, abstract=abstract,
            content=content, data_type=DATA_TYPE)

        for list in lists:
            title = list['title']
            download_url = list['playPathAacv164']
            watched_count = list['playtimes']
            play_length = list['duration']
            comments_count = list['comments']
            create_time = list['createdAt']
            create_time = tools.timestamp_to_date(create_time / 1000)

            # log.debug('''
            #     书名:       %s
            #     下载链接:   %s
            #     播放次数:   %s
            #     播放时长:   %s
            #     评论数:     %s
            #     创建时间:   %s
            #     ''' % (title, download_url, watched_count, play_length, comments_count, create_time))

            base_parser.add_wp_content_episode_info(
                'WP_content_episode_info', content_id=content_id, title=title,
                video_url=download_url, watched_count=watched_count,
                play_length=play_length, comments_count=comments_count,
                release_time=create_time, data_type=DATA_TYPE)

    base_parser.update_url('WP_urls', url, Constance.DONE)
def __parse_article_list(self, article_list, __biz, is_first_page=False):
    """
    @summary: Parse the article list
    ---------
    @param article_list: article list info, str
    ---------
    @result: True / None (True: keep crawling further back; None: stop crawling)
    """
    # log.debug(tools.dumps_json(article_list))

    # parse the article info contained in the json
    def parse_article_info(article_info, comm_msg_info):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get("title")
        digest = article_info.get("digest")
        url = article_info.get("content_url").replace("\\", "").replace("amp;", "")
        source_url = article_info.get("source_url").replace("\\", "")  # link of the cited source article
        cover = article_info.get("cover").replace("\\", "")
        subtype = article_info.get("subtype")
        is_multi = article_info.get("is_multi")
        author = article_info.get("author")
        copyright_stat = article_info.get("copyright_stat")
        duration = article_info.get("duration")
        del_flag = article_info.get("del_flag")
        type = comm_msg_info.get("type")
        publish_time = tools.timestamp_to_date(comm_msg_info.get("datetime"))
        sn = tools.get_param(url, "sn")

        if sn:
            # cache the article info
            article_data = {
                "title": title,
                "digest": digest,
                "url": url,
                "source_url": source_url,
                "cover": cover,
                "subtype": subtype,
                "is_multi": is_multi,
                "author": author,
                "copyright_stat": copyright_stat,
                "duration": duration,
                "del_flag": del_flag,
                "type": type,
                "publish_time": publish_time,
                "sn": sn,
                "__biz": __biz,
                "spider_time": tools.get_current_date(),
            }

            return article_data

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)

    article_list_data = []
    publish_time = None
    is_need_get_more = True

    article_list = article_list.get("list", [])
    is_first_article = True
    for article in article_list:
        comm_msg_info = article.get("comm_msg_info", {})

        publish_timestamp = comm_msg_info.get("datetime")
        publish_time = tools.timestamp_to_date(publish_timestamp)

        # record the latest publish time
        if is_first_page and is_first_article:
            self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
            is_first_article = False

            if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):
                # the first page carries the latest article; if nothing has been published
                # recently, treat the account as a zombie account
                log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                self._task_manager.sign_account_is_zombie(__biz, publish_time)
                is_need_get_more = False
                break

        # compare times; if the publish time from the previous run has been reached, stop
        is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
        if is_reach:
            log.info("采集到上次发布时间 公众号 {} 采集完成".format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif is_reach is None:
            log.info("公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号".format(__biz))
            return

        article_type = comm_msg_info.get("type")
        if article_type != 49:
            # type 49 is the common image-text message; text, voice and video messages
            # have inconsistent formats and are not collected here
            continue

        # check whether the publish time falls inside the crawl time range
        publish_time_status = self._task_manager.is_in_crawl_time_range(__biz, publish_time)
        if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
            log.info("公众号 {} 超过采集时间范围 采集完成".format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
            log.info("公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集".format(__biz, publish_time))
            continue

        # inside the time range
        # an official account can publish several image-text messages at once
        # the first image-text message
        app_msg_ext_info = article.get("app_msg_ext_info", {})
        article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
        if article_data:
            article_list_data.append(article_data)

        # image-text messages attached on the same day
        multi_app_msg_item_list = app_msg_ext_info.get("multi_app_msg_item_list")
        for multi_app_msg_item in multi_app_msg_item_list:
            article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

    if article_list_data:
        data_pipeline.save_article_list(article_list_data)

    if is_need_get_more:
        return publish_time
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    column_id = remark

    headers = {
        'Host': 'is.snssdk.com',
        'Accept': ' */*',
        'X-SS-Cookie': '_ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'tt-request-time': '1489990271848',
        'Cookie': ' _ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'User-Agent': 'News/6.0.1 (iPhone; iOS 10.2.1; Scale/3.00)',
        'Accept-Language': ' zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': ' keep-alive'
    }

    json = tools.get_json_by_requests(root_url)
    if not json:
        base_parser.update_url('VAApp_urls', root_url, Constance.EXCEPTION)
        return

    datas = json['data']
    for data in datas:
        data = tools.get_json_value(data, 'content')

        title = tools.get_json_value(data, 'title')
        # check whether it already exists in the database; if so, skip it
        if db.find('VAApp_content_info', {'title': title}):
            continue

        abstract = tools.get_json_value(data, 'abstract')
        abstract = abstract and abstract or tools.get_json_value(data, 'content')

        img_url = tools.get_json_value(data, 'image_list.url')
        img_url = img_url and img_url or tools.get_json_value(data, 'middle_image.url')
        img_url = img_url and img_url or tools.get_json_value(data, 'large_image_list.url')
        img_url = img_url and img_url.replace('.webp', '.jpg') or img_url

        original_url = tools.get_json_value(data, 'article_url')
        original_url = original_url and original_url or tools.get_json_value(data, 'share_url')

        release_time = tools.get_json_value(data, 'publish_time')
        release_time = release_time and release_time or tools.get_json_value(data, '1481012423')
        release_time = release_time and tools.timestamp_to_date(release_time) or release_time

        video_msg = tools.get_json_value(data, 'video_play_info')  # needs further processing
        video_main_url = tools.get_json_value(video_msg, 'video_list.video_2.main_url')
        video_main_url = video_main_url and video_main_url or tools.get_json_value(
            video_msg, 'video_list.video_1.main_url')
        parse_video_url = tools.compile_js(PARSE_VIDEO_URL_JSFUNC)
        video_url = parse_video_url('base64decode', video_main_url)

        html = tools.get_html_auto_deal_code(original_url)
        regexs = ['class="article-content">(.*?)<div class="article-actions">',
                  '<div class="content">(.*?)<div class="suggestion-list-con"',
                  '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
                  'class="yi-content-text">(.*?)<div class="yi-normal"',
                  '<p.*?>(.*?)</p>']

        if video_url:
            content = abstract
        else:
            content = ''.join(tools.get_info(html, regexs))
            content = tools.del_html_tag(content)
            if len(content) < len(abstract):
                content = abstract

        # sensitive events
        sensitive_id = ''
        sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
        for sensitive_event_info in sensitive_event_infos:
            _id = sensitive_event_info[0]
            keyword1 = sensitive_event_info[3].split(' ') if sensitive_event_info[3] else []
            keyword2 = sensitive_event_info[4].split(' ') if sensitive_event_info[4] else []
            keyword3 = sensitive_event_info[5].split(' ') if sensitive_event_info[5] else []

            if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                sensitive_id = _id

        # violation events
        violate_id = ''
        vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
        for vioation_knowledge_info in vioation_knowledge_infos:
            _id = vioation_knowledge_info[0]
            keyword1 = vioation_knowledge_info[2].split(' ') if vioation_knowledge_info[2] else []
            keyword2 = vioation_knowledge_info[3].split(' ') if vioation_knowledge_info[3] else []
            keyword3 = vioation_knowledge_info[4].split(' ') if vioation_knowledge_info[4] else []

            if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                violate_id = _id

        log.debug('''
            title:          %s
            abstract :      %s
            img_url :       %s
            original_url:   %s
            release_time :  %s
            video_main_url: %s
            video_url:      %s
            content :       %s
            column_id:      %d
            sensitive_id:   %d
            violate_id:     %d
            ''' % (title, abstract, img_url, original_url, release_time, video_main_url,
                   video_url, content, column_id, sensitive_id and sensitive_id or 0,
                   violate_id and violate_id or 0))

        # if this is the video column and the item contains no sensitive or violating content, do not download it
        if column_id == VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # download
        base_path = FILE_LOCAL_PATH
        is_download = 0

        # download the image
        img_name = ''
        if img_url:
            img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + \
                       tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(img_url, base_path, img_name)
            if not is_download:
                img_name = ''

        # download the video
        video_name = ''
        if video_url:
            video_name = 'videos/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + \
                         tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4'
            is_download = tools.download_file(video_url, base_path, video_name)
            if not is_download:
                video_name = ''

        if original_url:
            base_parser.add_va_app_content_info(
                'VAApp_content_info', SITE_ID, title, abstract, img_url, img_name,
                original_url, release_time, video_url, video_name, content, column_id,
                is_download, sensitive_id, violate_id, STORAGE_ID)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
def deal_article(self, req_url, text):
    """
    Parse an article
    :param req_url:
    :param text:
    :return:
    """
    sn = tools.get_param(req_url, "sn")

    if not text:
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    content = selector.xpath(
        '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
    )
    title = (selector.xpath('//h2[@class="rich_media_title"]/text()')
             .extract_first(default="").strip())
    account = (selector.xpath('//a[@id="js_name"]/text()')
               .extract_first(default="").strip())
    author = (selector.xpath(
        '//span[@class="rich_media_meta rich_media_meta_text"]//text()')
        .extract_first(default="").strip())

    publish_timestamp = selector.re_first('n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = (tools.timestamp_to_date(publish_timestamp)
                    if publish_timestamp else None)

    pics_url = content.xpath(".//img/@src|.//img/@data-src").extract()
    biz = tools.get_param(req_url, "__biz")
    digest = selector.re_first('var msg_desc = "(.*?)"')
    cover = selector.re_first('var cover = "(.*?)";') or selector.re_first('msg_cdn_url = "(.*?)"')
    source_url = selector.re_first("var msg_source_url = '(.*?)';")
    content_html = content.extract_first(default="")
    comment_id = selector.re_first('var comment_id = "(\d+)"')

    article_data = {
        "account": account,
        "title": title,
        "url": req_url,
        "author": author,
        "publish_time": publish_time,
        "__biz": biz,
        "digest": digest,
        "cover": cover,
        "pics_url": pics_url,
        "content_html": content_html,
        "source_url": source_url,
        "comment_id": comment_id,
        "sn": sn,
        "spider_time": tools.get_current_date(),
    }

    # save to the database
    if article_data and data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('取文章链接' + check_info)

    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', "&")
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers the captcha, so no proxy for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)

    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        取文章详细内容 %s
        url %s
        request.headers %s
        ''' % (check_info, account_url, request.headers))

    # print(html)
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # download the image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', "&")

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # take the images inside content, download them, then replace the original image addresses
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.' + (
                image[image.find('wx_fmt=') + len('wx_fmt='):(
                    image.find('&', image.find('wx_fmt=') + len('wx_fmt='))
                    if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1
                    else None)] if 'wx_fmt=' in image else 'jpg')
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []

                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            ''' % (title, summary, image_url, article_url, release_time, content,
                   local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title,
                                            summary, image_url, article_url, release_time, content,
                                            video_url='', local_image_url=local_image_url,
                                            violate_status=violate_id, sensitive_id=sensitive_id,
                                            sexy_image_url=sexy_image_url)

        # image-text messages published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # download the image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', "&")

            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # take the images inside content, download them, then replace the original image addresses
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.' + (
                    image[image.find('wx_fmt=') + len('wx_fmt='):(
                        image.find('&', image.find('wx_fmt=') + len('wx_fmt='))
                        if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1
                        else None)] if 'wx_fmt=' in image else 'jpg')
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []

                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                标题         %s
                简介         %s
                图片地址     %s
                文章地址     %s
                发布时间     %s
                内容         %s
                本地贴图地址 %s
                违规状态     %s
                敏感事件     %s
                图片鉴别地址 %s
                ''' % (title, summary, image_url, article_url, release_time, content,
                       local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title,
                                                summary, image_url, article_url, release_time, content,
                                                video_url='', local_image_url=local_image_url,
                                                violate_status=violate_id, sensitive_id=sensitive_id,
                                                sexy_image_url=sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # parse
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]
    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # content
            content = tools.get_tag(news, name='p', attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # view count
            watched_count = ''

            # source
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # date
            release_time = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # check whether there is a video, judged by the play icon
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)
        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'], remark['search_keyword3'])

        log.debug('''
            标题:     %s
            内容:     %s
            来源:     %s
            原文url:  %s
            图片url:  %s
            观看数:   %s
            日期:     %s
            有视频:   %d
            关键词:   %s
            关键词数: %s
            ''' % (title, content, origin, url, image, watched_count, release_time,
                   play_icon and True or False, contained_key, contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title, content,
                                     image_url=image, release_time=release_time, origin=origin,
                                     watched_count=watched_count, search_type=SEARCH_TYPE,
                                     keyword=contained_key, keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def __parse_article_list(self, article_list):
    '''
    @summary: Parse the article list
    ---------
    @param article_list: article list info, str
     {
         "list":[
             {
                 "comm_msg_info":{
                     "id":1000000513,
                     "type":49,
                     "datetime":1511354167,
                     "fakeid":"3082125093",
                     "status":2,
                     "content":""
                 },
                 "app_msg_ext_info":{
                     "title":"Python 内存优化",
                     "digest":"实际项目中,pythoner更加关注的是Python的性能问题。本文,关注的是Python的内存优化,一般说来,如果不发生内存泄露,运行在服务端的Python代码不用太关心内存,但是如果运行在客户端,那还是有优化的必要。",
                     "content":"",
                     "fileid":505083208,
                     "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&mid=2652566858&idx=1&sn=d2a76f4a601f94d8acc7b436d18e9648&chksm=8464dd00b313541684c14f974325ea6ae725ffc901fd9888cc00d1acdd13619de3297a5d9a35&scene=27#wechat_redirect",
                     "source_url":"http:\/\/www.cnblogs.com\/xybaby\/p\/7488216.html",
                     "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQVbRHy3FhzwMHEvCvtzXVicHTaPEu8jZ2pgkCAgBqEHugYMvzg3tpoww\/0?wx_fmt=jpeg",
                     "subtype":9,
                     "is_multi":1,
                     "multi_app_msg_item_list":[
                         {
                             "title":"面向对象:With the wonder of your love, the sun above always shines",
                             "digest":"With the wonder of your love, the sun above always shines",
                             "content":"",
                             "fileid":505083209,
                             "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&mid=2652566858&idx=2&sn=97f223783da7748080f8103654447c99&chksm=8464dd00b313541601938565a41487ea76209331fd6f4c8996a2ff5572f4fd465de9fa4cbaac&scene=27#wechat_redirect",
                             "source_url":"https:\/\/mp.weixin.qq.com\/s\/_uD9jY4nXQQ6CtA__dsN8w?scene=25#wechat_redirect",
                             "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQ5ukvwH1GPq5zlWxv05WvRiaw6BiaeyGRD1w17nAPGTlQgEvvDuZnB9HA\/0?wx_fmt=jpeg",
                             "author":"",
                             "copyright_stat":101,
                             "del_flag":1
                         }
                     ],
                     "author":"",
                     "copyright_stat":100,
                     "del_flag":1
                 }
             }
         ]
     }
    ---------
    @result:
    '''
    # log.debug(tools.dumps_json(article_list))

    # parse the article info contained in the json
    def parse_article_info(article_info, release_time):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get('title')
        summary = article_info.get('digest')
        url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
        source_url = article_info.get('source_url').replace('\\', '')  # link of the cited source article
        cover = article_info.get('cover').replace('\\', '')
        author = article_info.get('author')

        if url and url.startswith('http://mp.weixin.qq.com/'):
            # articles deleted by the publisher have no url or other info, so mid cannot be taken
            # and they are not stored; mall-type urls are not stored either
            mid = tools.get_param(url, 'mid') or tools.get_param(url, 'appmsgid')  # image-text message id; messages published on the same day share the same id
            idx = tools.get_param(url, 'idx') or tools.get_param(url, 'itemidx')  # position of the image-text message, starting from 1

            # concatenate mid and idx to identify one unique article,
            # e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
            article_id = mid + idx

            # check whether the article already exists in the database
            if WechatAction._wechat_service.is_exist('wechat_article', article_id) or (
                    ONLY_TODAY_MSG and release_time < tools.get_current_date('%Y-%m-%d')):
                self._is_need_get_more = False
                return  # do not continue; discard the remaining articles

            __biz = tools.get_param(url, '__biz')  # used to associate the official account

            # cache the article info
            WechatAction._article_info[article_id] = {
                'article_id': int(article_id),
                'title': title,
                'summary': summary,
                'release_time': release_time,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'account': '',
                'author': author,
                '__biz': __biz,
                'read_num': None,
                'like_num': None,
                'content': '',
                'comment': [],
                'record_time': tools.get_current_date()
            }

            # add the article url to the todo queue
            WechatAction._todo_urls.append(url)

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)

    article_list = article_list.get('list', [])
    for article in article_list:
        article_type = article.get('comm_msg_info', {}).get('type')
        if article_type != 49:
            # type 49 is the common image-text message; text, voice and video messages
            # have inconsistent formats and are not collected here
            continue

        release_time = article.get('comm_msg_info', {}).get('datetime')
        release_time = tools.timestamp_to_date(release_time)

        # an official account can publish several image-text messages at once
        # the first image-text message
        app_msg_ext_info = article.get('app_msg_ext_info', {})
        parse_article_info(app_msg_ext_info, release_time)

        if not self._is_need_get_more:
            break

        # image-text messages attached on the same day
        multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
        for multi_app_msg_item in multi_app_msg_item_list:
            parse_article_info(multi_app_msg_item, release_time)

            if not self._is_need_get_more:
                break
def parser_comment_article(html, video_id, program_id, url, page=1):
    '''
    @summary: comment section, e.g. http://www.iqiyi.com/a_19rrhcvhph.html
    ---------
    @param html:
    @param video_id:
    ---------
    @result:
    '''
    regex = 'data-qitancomment-tvid="(.*?)"'
    tvid = tools.get_info(html, regex, fetch_one=True)

    regex = 'data-qitancomment-qitanid="(.*?)"'
    aid = tools.get_info(html, regex, fetch_one=True)

    if not tvid and not aid:
        return

    comment_url = 'http://api-t.iqiyi.com/qx_api/comment/get_video_comments?aid={aid}&albumid={video_id}&categoryid=15&cb=fnsucc&escape=true&is_video_page=true&need_reply=true&need_subject=true&need_total=1&page={page}&page_size=10&page_size_reply=3&qitan_comment_type=1&qitanid={aid}&qypid=01010011010000000000&reply_sort=hot&sort=add_time&tvid={tvid}'.format(
        aid=aid, video_id=video_id, tvid=tvid, page=page)

    comment_json = tools.get_json_by_requests(comment_url)
    comments = comment_json.get('data', {}).get('comments', [])
    for comment in comments:
        article_id = comment.get('contentId')
        title = comment.get('title')
        content = comment.get('content')
        image_urls = None
        release_time = comment.get('addTime')
        release_time = tools.timestamp_to_date(release_time)
        up_count = comment.get('counterList', {}).get('likes')
        watch_count = comment.get('counterList', {}).get('reads')
        comment_count = comment.get('counterList', {}).get('replies')
        name = comment.get('userInfo', {}).get('uname')
        head_url = comment.get('userInfo', {}).get('icon')
        gender = int(comment.get('userInfo', {}).get('gender'))

        log.debug('''
            id:       %s
            节目id    %s
            头像地址: %s
            名字:     %s
            发布时间: %s
            标题:     %s
            内容:     %s
            图片地址: %s
            观看量:   %s
            点赞量:   %s
            评论量:   %s
            ''' % (article_id, video_id, head_url, name, release_time, title,
                   content, '', watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time, title,
                                        content, image_urls, watch_count, up_count,
                                        comment_count, program_id=program_id,
                                        gender=gender, url=url, info_type=3,
                                        emotion=random.randint(0, 2), collect=0,
                                        source='爱奇艺'):
            # parse the replies
            reply_list = comment.get('replyList') or []
            parser_relpy_comment(reply_list)
        else:
            break
    else:
        if comments:
            parser_comment_article(html, video_id, program_id, url, page + 1)