示例#1
0
    def load_task(self):
        '''
        @summary: Load the next page of site tasks into the task ring buffer
        A round starts whenever TaskService._offset is 1: the round statistics
        are reset and the redis url tables are cleared. When a page comes back
        empty the round is finished; the offset is reset and, provided the
        round actually produced tasks, the first page of the next round is
        loaded immediately.
        ---------
        ---------
        @result: None
        '''
        if TaskService._offset == 1:
            log.info('开始新的一轮抓取')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # Clear the url tables before the new round
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')

        # rownum paging: the inner query keeps rows below offset + TASK_BUFFER_SIZE,
        # the outer query drops the rows before offset.
        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size = TaskService._offset + TASK_BUFFER_SIZE, offset = TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)

        if tasks:
            TaskService._total_task_size += len(tasks)
            TaskService._task_ring_buff.put_data(tasks)
            return

        # Empty page: the round is over
        TaskService._spider_end_timestamp = tools.get_current_timestamp()
        round_task_size = TaskService._total_task_size
        log.info('已做完一轮,共处理网站%s个 耗时%s'%(round_task_size, tools.seconds_to_h_m_s(TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))
        TaskService._offset = 1

        # Only restart right away when the finished round found something;
        # otherwise the query would come back empty again and the method would
        # recurse until the interpreter's recursion limit is hit.
        if round_task_size:
            self.load_task()
示例#2
0
    def get_download_url(url):
        '''
        @summary: Resolve the download address of the video shown on a play page
        ---------
        @param url: address of the play page to inspect
        ---------
        @result: the address extracted from the "l" field of the final
                 response, or '' when any step of the lookup fails
        '''
        html, _ = tools.get_html_by_requests(url)
        page = str(html)

        # The last match wins, as before; fall back to '' when nothing matches
        # so that building the request url below cannot fail on a list.
        tvids = re.findall(r'player-tvid="(\d{4,11})"', page)
        if not tvids:
            tvids = re.findall(r'list-tvid="(\d{4,11})"', page)
        tvid = tvids[-1] if tvids else ''

        # Try the known album id markers in order and keep the first hit
        album_id = ''
        for album_pattern in (r'player-albumid="(\d{4,11})"',
                              r'list-albumid="(\d{4,11})"',
                              r'albumId: ?(\d{4,11}),',
                              r'param\[\'albumId\'\] ?= ?"(\d{4,11})"'):
            album_id = ''.join(re.findall(album_pattern, page))
            if album_id:
                break

        current_time = str(tools.get_current_timestamp() * 1000)

        url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time
        json_ = tools.get_json_by_requests(url, headers=DOWNLOAD_HEADER)

        try:
            # The response is searched in its str() form: entry '1', then its 'url'
            entry = ''.join(re.findall(r"'1': {(.+?)},", str(json_)))
            redirect_url = ''.join(re.findall(r"'url': ?'(.+?)'", str(entry)))
            redirect_body, _ = tools.get_html_by_requests(redirect_url)
            video_download_url = ''.join(re.findall(r'"l":"(.+?)"', str(redirect_body)))
        except Exception:
            # Best effort: any failure in the lookup chain yields an empty address
            video_download_url = ''
        return video_download_url
示例#3
0
def parser_comment(content_id, wall_id, page=1):
    '''
    @summary: Crawl the comments of one article, page by page
    Paging continues while a page contains replies and deal_comment accepts
    every one of them; it stops at the first reply (or quoted source reply)
    for which deal_comment returns a falsy value, or at the first empty page.
    ---------
    @param content_id: id of the article whose comments are fetched
    @param wall_id: wall id sent along with the request
    @param page: page number to start from
    ---------
    @result: None
    '''
    # Pages are walked in a loop rather than by recursion so that long comment
    # threads cannot exhaust the interpreter's recursion limit.
    while True:
        log.debug('正在爬取第 %s 页文章评论 content_id = %s' % (page, content_id))
        flow_comment_url = 'http://sns-comment.iqiyi.com/v2/comment/get_comments.action?contentid={content_id}&page={page}&authcookie=null&page_size=40&wallId={wall_id}&agenttype=117&t={timestamp_m}'.format(
            content_id=content_id,
            page=page,
            wall_id=wall_id,
            timestamp_m=int(tools.get_current_timestamp() * 1000))

        # Tolerate a missing response as well as null "data" / "replies" fields
        comment_json = tools.get_json_by_requests(flow_comment_url) or {}
        data = comment_json.get('data') or {}
        replies = data.get('replies') or []

        for reply in replies:
            # The quoted source reply is handled before the reply itself
            reply_source = reply.get("replySource", {})
            if not deal_comment(reply_source):
                return

            if not deal_comment(reply):
                return

        if not replies:
            return

        page += 1
示例#4
0
    def __open_next_page(self):
        '''
        @summary: Build the script snippet that sends the browser to the next page
        The next url is taken from WechatAction._todo_urls; when that queue is
        empty, the article count of the current account is updated and the
        history page of the next account is used instead.
        ---------
        ---------
        @result: a <script> tag that redirects to the next url after a delay;
                 the delay is handed to setTimeout unchanged
        '''
        round_finished = False  # a whole round has been completed
        everything_finished = False  # every account's articles of today are collected

        if not WechatAction._todo_urls:
            # The current account is finished: refresh its article count
            WechatAction._wechat_service.update_account_article_num(
                WechatAction._current_account_biz)

            # Move on to the next account
            next_account = WechatAction._wechat_service.get_next_account()
            account_id, biz, round_finished, everything_finished = next_account
            WechatAction._account_info[biz] = account_id or ''

            target_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % biz
            log.debug('''
                下一个公众号 : %s
                ''' % target_url)
        else:
            target_url = WechatAction._todo_urls.popleft()

        # Choose how long the injected script waits before redirecting
        if everything_finished:
            # Everything published today is crawled: wait for the next day
            delay = self.get_next_day_time_interval()
        elif round_finished:
            # A round is done: take a break
            delay = self.get_wait_time()
        elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
            # Only today's articles are wanted and the configured start time
            # has not been reached yet: wait until then
            delay = self.get_spider_start_time_interval()
        else:
            # Ordinary pause between two pages
            delay = self.get_sleep_time()

        readable_delay = tools.seconds_to_h_m_s(delay / 1000)
        resume_at = tools.timestamp_to_date(tools.get_current_timestamp() + delay / 1000)
        log.debug('''
            next_page_url : %s
            is_done:        %s
            is_all_done:    %s
            sleep_time:     %s
            next_start_time %s
            ''' % (target_url, round_finished, everything_finished, readable_delay, resume_at))

        return "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (target_url, delay)
示例#5
0
    def get_task(self, url=None, tip=''):
        """
        Build the page snippet that drives the browser to the next task.

        :param url: when given, the snippet redirects to this url. Otherwise an
                    account task is tried first, then an article task; when
                    neither exists the snippet reloads the current page after
                    the configured no-task sleep time.
        :param tip: text shown in front of the sleep notice; replaced when a
                    task is picked here
        :return: text plus a <script> tag that redirects or reloads after the
                 sleep time
        """

        sleep_time = random.randint(self._spider_interval_min,
                                    self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                biz = account_task.get('__biz')
                self.record_last_article_publish_time(
                    biz, account_task.get('last_publish_time'))
                tip = '正在抓取列表'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(
                    biz)
            else:
                article_task = self.get_article_task()
                if not article_task:
                    # Nothing to do: use the configured idle sleep time instead
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('暂无任务 休眠 {}s'.format(sleep_time))
                    tip = '暂无任务 '
                else:
                    tip = '正在抓取详情'
                    url = article_task.get('article_url')

        # Values shared by both page templates
        shared_fields = dict(
            tip=tip + ' ' if tip else tip,
            sleep_time=sleep_time,
            begin_spider_time=tools.timestamp_to_date(
                tools.get_current_timestamp() + sleep_time),
            sleep_time_msec=sleep_time * 1000)

        if url:
            # Redirect to the task url once the sleep time has passed
            return "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                url=url, **shared_fields)

        # No url: reload the current page once the sleep time has passed
        return "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
            **shared_fields)
示例#6
0
    def __open_next_page(self):
        '''
        @summary: Build the page snippet that sends the browser to the next page
        Entries of WechatAction._todo_urls are consumed in order: callables are
        executed on the spot, the first non-callable entry becomes the next
        url. Without a queued url the history page of the next account is
        used; without a next account a local waiting page is shown.
        ---------
        ---------
        @result: notice text plus a <script> tag that redirects after a delay;
                 the delay is handed to setTimeout unchanged
        '''
        round_finished = False  # a whole round has been completed
        target_url = None

        while WechatAction._todo_urls:
            entry = WechatAction._todo_urls.popleft()
            if not callable(entry):
                target_url = entry
                break
            # Callable entries are callbacks queued for accounts that are done
            entry()

        if not target_url:
            # Move on to the next account
            account = WechatAction._wechat_service.get_next_account()
            if not account:
                round_finished = True
            else:
                account_id, biz = account
                WechatAction._account_info[biz] = account_id or ''

                target_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect'%biz
                log.debug('''
                    下一个公众号 : %s
                    '''%target_url)

        # Choose how long the injected script waits before redirecting
        if round_finished:
            # A round is done: take a break
            delay = self.get_wait_time()
        elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
            # Only today's articles are wanted and the configured start time
            # has not been reached yet: wait until then
            delay = self.get_spider_start_time_interval()
        else:
            # Ordinary pause between two pages
            delay = self.get_sleep_time()

        readable_delay = tools.seconds_to_h_m_s(delay / 1000)
        resume_at = tools.timestamp_to_date(tools.get_current_timestamp() + delay / 1000)
        if not target_url:
            # No page to visit: show the local waiting page
            target_url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(readable_delay, resume_at)

        log.debug('''
            next_page_url : %s
            is_done:        %s
            sleep_time:     %s
            next_start_time %s
            '''%(target_url, round_finished, readable_delay, resume_at))

        return "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>"%(readable_delay, resume_at, target_url, delay)
示例#7
0
def add_root_url(parser_params={}):
    '''
    @summary: Queue the feed urls for the local-news and video categories
    Starting at the current time, max_behot_time is stepped back by
    ONE_PAGE_TIME_INTERVAL until it passes the point 30 days ago; one url per
    category is added for every step.
    ---------
    @param parser_params: only written to the debug log here
    ---------
    @result: None
    '''
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    base_url = 'http://is.snssdk.com/api/news/feed/v51/'

    # NOTE(review): `泸州` is used as a bare name and is not defined in this
    # block; presumably the request-parameter mapping was lost here - confirm.
    params = 泸州

    page_span = ONE_PAGE_TIME_INTERVAL
    oldest_wanted = tools.get_current_timestamp() - 86400 * 30  # 30 days back
    refresh_stamp = tools.get_current_timestamp()

    behot_cursor = refresh_stamp
    while behot_cursor >= oldest_wanted:
        behot_cursor -= page_span

        refresh_stamp = refresh_stamp + random.randint(60, 300)

        # Local news first, then video; both share the same time parameters
        for category, remark in (('news_local', NEWS_LOCAL), ('video', VIDEO)):
            params['category'] = category
            params['last_refresh_sub_entrance_interval'] = refresh_stamp
            params['max_behot_time'] = behot_cursor

            url = tools.joint_url(base_url, params)
            base_parser.add_url('WWA_app_urls', SITE_ID, url, remark=remark)
示例#8
0
def get_url(time_lenght = 60):
    '''
    @summary: Build the clue-search url for the window since the last sync
    The window starts at the time stored in STO_PER_SYNC_TIME, or
    time_lenght minutes ago when nothing is stored, and ends now. The current
    time is then written back to STO_PER_SYNC_TIME.
    ---------
    @param time_lenght: length of the fallback window, in minutes
    ---------
    @result: the url; its pageNo placeholder (%d) is left unfilled
    '''

    window_end = tools.get_current_date()

    window_start = tools.read_file(STO_PER_SYNC_TIME)
    if not window_start:
        window_start = tools.timestamp_to_date(tools.get_current_timestamp() - time_lenght * 60)

    # Remember where this window ended
    tools.write_file(STO_PER_SYNC_TIME, window_end)

    url_template = 'http://192.168.60.38:8001/hotspot_al/interface/getCluesDataSearchInfo?pageNo=%d&pageSize=100&updateSTime={per_date}&updateETime={current_date}&sort=5&isDesc=0'
    return url_template.format(per_date = window_start, current_date = window_end)
示例#9
0
    def get_next_day_time_interval(self):
        '''
        @summary: Time left until the crawler starts again tomorrow
        The start point is tomorrow's date combined with SPIDER_START_TIME.
        ---------
        ---------
        @result: the difference to the current timestamp, multiplied by 1000
                 (seconds converted to milliseconds)
        '''
        start_text = tools.get_tomorrow() + ' ' + SPIDER_START_TIME
        now_timestamp = tools.get_current_timestamp()
        start_timestamp = tools.date_to_timestamp(start_text)

        # seconds -> milliseconds
        return (start_timestamp - now_timestamp) * 1000
示例#10
0
    def get_wait_check_account(self):
        '''
        @summary: Select the accounts that should be checked for new articles
        First choice are monitored accounts with spider_status 603 whose last
        article release time is empty or at least two hours old. When none are
        found and the redis set 'wechat:account' is empty as well, the
        monitored accounts whose spider_status is not 603 are returned instead.
        ---------
        ---------
        @result: the rows returned by the oracle query
        '''
        two_hours_ago = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)

        # Accounts already crawled whose latest article is two hours old or more
        finished_sql = '''
            select t.id,
                   t.domain,
                   t.name,
                   to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                   t.biz
              from TAB_IOPM_SITE t
             where t.biz is not null
               and mointor_status = 701
               and t.spider_status = 603
               and (t.last_article_release_time is null or
                   t.last_article_release_time <=
                   to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(two_hours_ago)

        accounts = self._oracledb.find(finished_sql)
        if accounts or self._redisdb.sget_count('wechat:account'):
            return accounts

        # Nothing finished and nothing queued in redis: tasks that are not in
        # state 603 may have been lost, so hand them out again
        lost_sql = '''
                select t.id,
                       t.domain,
                       t.name,
                       to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                       t.biz
                  from TAB_IOPM_SITE t
                 where t.biz is not null
                   and mointor_status = 701
                   and t.spider_status != 603
            '''

        return self._oracledb.find(lost_sql)
示例#11
0
    def get_spider_start_time_interval(self):
        '''
        @summary: Time left until today's configured crawl start
        The start point is today's date combined with SPIDER_START_TIME.
        ---------
        ---------
        @result: the difference to the current timestamp, multiplied by 1000
                 (seconds converted to milliseconds)
        '''

        start_text = tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME
        now_timestamp = tools.get_current_timestamp()
        start_timestamp = tools.date_to_timestamp(start_text)

        # seconds -> milliseconds
        return (start_timestamp - now_timestamp) * 1000
示例#12
0
def add_root_url(parser_params = {}):
    '''
    @summary: Queue the recommendation urls of every category, pages 0 to 60
    ---------
    @param parser_params: only written to the debug log here
    ---------
    @result: None
    '''
    log.debug('''
        添加根url
        parser_params : %s
        '''%str(parser_params))

    endpoint = 'http://user.xiaoyouzb.net/v3/vod/small_recommend?'
    category_infos = [(-1, "推荐"), (10, "体育"), (11, "资讯"), (12, "影视"), (13, "娱乐"), (17,"社会")]

    for cid, cname in category_infos:
        # Every category gets its own timestamp and matching signature
        nwtime = tools.get_current_timestamp()
        sign = get_md5_sign(nwtime)

        # NOTE(review): "MIX%202" already contains a percent escape and is
        # passed through urlencode below - confirm that this is intended.
        query = {
            "nwtime": str(nwtime),
            "sign": str(sign),
            "type": "1",
            "cateId": str(cid),
            "pageNum": "0",
            "isFirst": "N",
            "_u": "edac2c15598946bd9ba7bda78a83489c",
            "version": "4.7.0",
            "platform": "android",
            "appx": "yuntu",
            "apppn": "org.fungo.fungolive",
            "enterprise": "0",
            "channel": "tencent",
            "market": "32",
            "os_version": "8.0.0",
            "device_model": "MIX%202",
            "device_code": "780493075490198",
            "udid": "77e2cb72797f20afdcaaa6265872cea9",
            "androidId": "220240afd2e0e640",
            "source": "android",
        }
        first_page_url = endpoint + urlencode(query)

        # Page 0
        base_parser.add_url('urls', SITE_ID, first_page_url, remark={"category_name": cname}, depth=0)

        # Pages 1..60 reuse the page 0 url with only the page number swapped
        page_template = first_page_url.replace("pageNum=0", "pageNum={}")
        for page in range(1, 61):
            base_parser.add_url('urls', SITE_ID, page_template.format(page), remark={"category_name": cname}, depth=0)
示例#13
0
    def monitor_cookies(self):
        '''
        @summary: Housekeeping for the sogou_cookies table
        1. delete the cookies whose un_available_times exceeds MAX_UN_AVAILABLE_TIMES
        2. mark the cookies whose un_available_time is more than 24 hours old as available
        ---------
        ---------
        @result: None
        '''

        # 1. drop the cookies that failed too often
        purge_sql = 'delete from sogou_cookies where un_available_times > %d'%MAX_UN_AVAILABLE_TIMES
        self._sqlite3db.delete(purge_sql)

        # 2. re-enable the cookies that have rested for 24 hours
        rested_before = tools.timestamp_to_date(tools.get_current_timestamp() - 24 * 60 * 60 )
        revive_sql = '''
            update sogou_cookies set
                is_available = 1
            where un_available_time < '%s'
        '''%rested_before

        self._sqlite3db.update(revive_sql)
def juji_parser(url, remark):
    '''
    @summary: Parse one episode page and store the episode information
    The play page supplies the ids, the title and the cover image; the mixer
    interface supplies play count, duration and episode number; the download
    interface supplies the download address. The url is marked as DONE at the
    end.
    ---------
    @param url: address of the episode page
    @param remark: id of the program the episode belongs to
    ---------
    @result: None
    '''
    program_id = remark

    html, res = tools.get_html_by_requests(url)

    tvid = tools.get_info(
        html, [r'player-tvid="(\d{4,11})"', r'list-tvid="(\d{4,11})"'],
        fetch_one=True)
    pcInfo_url = "http://mixer.video.iqiyi.com/jp/mixin/videos/" + str(tvid)
    html2, res = tools.get_html_by_requests(pcInfo_url)

    album_id = tools.get_info(html, [
        r'player-albumid="(\d{4,11})', r'list-albumid="(\d{4,11})"',
        r'albumId: ?(\d{4,11}),', r'param\[\'albumId\'\] ?= ?"(\d{4,11})"'
    ],
                              fetch_one=True)

    episode_name = tools.get_info(html, ['meta.+?"irTitle" content="(.+?)"'],
                                  fetch_one=True)

    image_url = tools.get_info(html,
                               ['<meta property="og:image" content="(.+?)"/>'],
                               fetch_one=True)
    # Switch to the 160x90 variant of the cover image
    image_url = image_url.replace('.jpg', '_160_90.jpg')

    play_count = tools.get_info(html2, ['"playCount":(.+?),'], fetch_one=True)
    time_length = tools.get_info(html2, [r'"duration":\s*(.+?),'],
                                 fetch_one=True)
    episode_num = tools.get_info(html2, [r'"order":\s*(.+?),'], fetch_one=True)

    current_time = tools.get_current_timestamp() * 1000
    current_time = str(current_time)

    # Both ids go through str(), matching how tvid is used for pcInfo_url above
    download_json_url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + str(album_id) + '&tvid=' + str(tvid) + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time
    json_ = tools.get_json_by_requests(download_json_url,
                                       headers=download_header)
    # The interface returns an intermediate url; its response carries the
    # final address in the "l" field
    download_url = tools.get_json_value(json_, 'video.mp4_res.1.url')
    download_url, res = tools.get_html_by_requests(download_url)
    download_url = tools.get_info(download_url, ['"l":"(.+?)"'],
                                  fetch_one=True)

    log.debug('''
                                        节目id:            %s
                                        当前集数:          %s
                                        本集时长:          %s
                                        播放次数:          %s
                                        节目名称:          %s
                                        下载地址:          %s
                                        节目链接:          %s
                                        图片地址:          %s
                                        ''' %
              (program_id, episode_num, time_length, play_count, episode_name,
               download_url, url, image_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info',
                                         SITE_ID,
                                         program_id=program_id,
                                         episode_num=episode_num,
                                         time_length=time_length,
                                         episode_name=episode_name,
                                         download_url=download_url,
                                         episode_url=url,
                                         image_url=image_url,
                                         play_count=play_count)

    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)
示例#15
0
class WechatService():
    '''
    Service layer of the WeChat crawler: hands out accounts to crawl, checks
    whether an account has published new articles, and stores article and
    account information. All state is kept on the class, so it is shared by
    every instance.
    '''
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # a round has been completed
    _is_all_done = False  # everything the accounts published today has been crawled

    # Sogou WeChat: availability flag and the time it was last marked or retried
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # WeChat public platform: availability flag and the time it was last marked or retried
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        '''
        @summary: Fetch one account record from the redis set 'wechat:account'
                  and append it to the todo queue
        '''
        accounts = WechatService._redisdb.sget('wechat:account', count=1)

        for account in accounts:
            # SECURITY NOTE(review): eval() executes whatever text is stored in
            # redis. If that data can come from an untrusted writer, switch to
            # a safe parser (e.g. ast.literal_eval) after confirming the
            # stored format.
            account = eval(account)
            WechatService._todo_accounts.append(account)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has published new articles
        Sogou WeChat is asked first. When it is disabled it is only retried
        once more than TIME_INTERVAL has passed since it was last marked or
        retried. When Sogou gives no result or answers with
        VERIFICATION_CODE, the WeChat public platform is asked under the same
        enable/retry rules.
        ---------
        @param account_id: account id passed to the Sogou check
        @param account_name: account name passed to the Sogou check
        @param __biz: biz value passed to the public platform check
        ---------
        @result: the constance.* value of the last check performed, or ''
                 when no check could be made
        '''

        result = ''
        if WechatService._wechat_sogou_enable:  # Sogou WeChat is usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.VERIFICATION_CODE:
                # Blocked: disable Sogou and remember when that happened
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
                )

        elif tools.get_current_timestamp(
        ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Sogou has been disabled long enough: give it another try
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # It works again
                WechatService._wechat_sogou_enable = True

            # Restart the waiting period whatever the outcome was
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
            )

        # No usable answer from Sogou: fall back to the WeChat public platform
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:  # platform is usable
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.ERROR:
                    # Request failed: disable the platform and remember when
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                    )

            elif tools.get_current_timestamp(
            ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                # The platform has been disabled long enough: give it another try
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # It works again
                    WechatService._wechat_public_platform_enable = True

                # Restart the waiting period whatever the outcome was
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                )

        return result

    def get_next_account(self):
        '''
        @summary: Take the next account to crawl and set its spider_status to 602
        ---------
        ---------
        @result: (account_id, biz) of the next account, or None when no
                 account is available
        '''

        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            return None

        # Record layout: oracle id, account id, account name, last release time, biz
        _, account_id, _, _, biz = WechatService._todo_accounts.popleft()

        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            biz)
        WechatService._db.update(sql)

        return account_id, biz

    def update_account_article_num(self, __biz):
        '''
        @summary: Write the account's article counts back to TAB_IOPM_SITE
                  and set its spider_status to 603
        today_msg counts the articles recorded today; total_msg counts all
        articles of the account and is only written when it is non-zero.
        ---------
        @param __biz: biz value of the account
        ---------
        @result: None
        '''
        # Articles recorded today
        today = tools.get_current_date('%Y-%m-%d')
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": today + ' 00:00:00',
                                "lte": today + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # All articles ever recorded for the account
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        log.debug(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        '''
        @summary: Tell whether a document with this id is found in the table
        ---------
        @param table: used both as table and as doc_type for the lookup
        @param data_id: id of the document
        ---------
        @result: True / False
        '''
        return bool(
            WechatService._es.get(table, data_id=data_id, doc_type=table))

    def add_article_info(self, article_info):
        '''
        @summary: Log an article and add it to 'wechat_article'
        ---------
        @param article_info: mapping with at least title, release_time,
               author, account and url; article_id is used as document id
        ---------
        @result: None
        '''

        log.debug('''
            -----文章信息-----
            标题     %s
            发布时间 %s
            作者     %s
            公众号   %s
            url      %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        '''
        @summary: Log an account and add it to 'wechat_account'
        ---------
        @param account_info: mapping describing the account; its '__biz'
               value is used as document id
        ---------
        @result: None
        '''
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
示例#16
0
 def is_zombie_account(self, last_publish_timestamp):
     """
     Tell whether the account counts as a zombie account.

     :param last_publish_timestamp: timestamp of the account's last published article
     :return: True when more than self._zombie_account_not_publish_article_days
              days (86400 seconds each) have passed since that timestamp
     """
     silent_seconds = tools.get_current_timestamp() - last_publish_timestamp
     allowed_seconds = self._zombie_account_not_publish_article_days * 86400
     return silent_seconds > allowed_seconds
示例#17
0
def parser_next_page_article(video_id, wall_id, feed_id, sns_time, url):
    '''
    @summary: Crawl the articles (feeds) of a program, page by page
    Every feed is stored through self_base_parser.add_article and, when that
    succeeds, its comments are crawled. Paging stops at the first feed that
    add_article rejects, at the first empty page, or when the paging cursor
    no longer moves.
    ---------
    @param video_id: id of the program the feeds belong to
    @param wall_id: wall id used in the request
    @param feed_id: id of the last feed of the previous page
    @param sns_time: snsTime of the last feed of the previous page
    @param url: address of the program page, stored with every article
    ---------
    @result: None
    '''
    # Pages are walked in a loop rather than by recursion so that long feeds
    # cannot exhaust the interpreter's recursion limit.
    while True:
        article_json_url = 'http://api-t.iqiyi.com/feed/get_feeds?authcookie=&device_id=pc_web&m_device_id=a11e6ea94270eaaa0b46be30af84fc54&agenttype=118&wallId={wall_id}&feedTypes=1%2C7%2C8%2C9&count=20&top=1&hasRecomFeed=1&feedId={feed_id}&needTotal=1&notice=1&version=1&upOrDown=1&snsTime={sns_time}&_={timestamp_m}'.format(
            wall_id=wall_id,
            feed_id=feed_id,
            sns_time=sns_time,
            timestamp_m=int(tools.get_current_timestamp() * 1000))
        log.debug(article_json_url)

        # Tolerate a missing response as well as null "data" / "feeds" fields
        article_json = tools.get_json_by_requests(article_json_url) or {}
        data = article_json.get('data') or {}

        # Keep the wall id we already have when the response does not carry one
        wall_id = data.get('wallId') or wall_id
        feeds = data.get('feeds') or []

        for feed in feeds:
            article_id = feed.get('commentId')
            head_url = feed.get('icon')
            name = feed.get('name')
            release_time = tools.timestamp_to_date(feed.get('releaseDate'))
            title = feed.get('feedTitle')
            content = feed.get('description')

            # Comma separated; pictures without a url are left out
            image_urls = ','.join(
                img.get('url') for img in (feed.get('pictures') or [])
                if img.get('url'))

            watch_count = feed.get('uvCount')
            up_count = feed.get('agreeCount')
            comment_count = feed.get('commentCount')

            log.debug('''
            id:       %s
            节目id     %s
            头像地址: %s
            名字:     %s
            发布时间: %s
            标题:     %s
            内容:     %s
            图片地址: %s
            观看量:   %s
            点赞量:   %s
            评论量:   %s
            ''' % (article_id, video_id, head_url, name, release_time, title,
                   content, image_urls, watch_count, up_count, comment_count))

            stored = self_base_parser.add_article(article_id,
                                                  head_url,
                                                  name,
                                                  release_time,
                                                  title,
                                                  content,
                                                  image_urls,
                                                  watch_count,
                                                  up_count,
                                                  comment_count,
                                                  program_id=video_id,
                                                  gender=random.randint(0, 1),
                                                  url=url,
                                                  info_type=3,
                                                  emotion=random.randint(0, 2),
                                                  collect=0,
                                                  source='爱奇艺')
            if not stored:
                return

            # Crawl the comments of the article just stored
            parser_comment(article_id, wall_id)

        if not feeds:
            return

        # The last feed of this page is the cursor for the next request
        next_feed_id = feeds[-1].get('feedId')
        next_sns_time = feeds[-1].get('snsTime')
        if (next_feed_id, next_sns_time) == (feed_id, sns_time):
            # The cursor did not move; asking again would return the same page
            return
        feed_id, sns_time = next_feed_id, next_sns_time
示例#18
0
def add_root_url(parser_params=None):
    """Queue paged feed URLs (local news and video) covering the last day.

    ``max_behot_time`` is walked backwards from the current time in steps of
    ``ONE_PAGE_TIME_INTERVAL`` until it is more than 86400 seconds old. For
    every step one ``news_local`` and one ``video`` feed URL is built and
    registered in ``VAApp_urls``.

    Args:
        parser_params: Optional parameters supplied by the caller. They are
            only logged here. Defaults to an empty dict.
    """
    # A ``{}`` default would be one dict shared by every call; create it here.
    if parser_params is None:
        parser_params = {}

    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    base_url = 'http://is.snssdk.com/api/news/feed/v51/'
    params = {
        'version_code': '6.0.3',
        'app_name': 'news_article',
        'vid': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
        'device_id': '34633749953',
        'channel': 'App Store',
        'resolution': '1242*2208',
        'aid': 13,
        'ab_version':
        '117912,112577,101786,117787,115757,101533,117646,118765,110341,113607,118273,114108,113114,106784,113608,101558,105475,112401,117714,105610,118581,118607,105821,112578,115570,118604,118850,116615,31210,118530,118216,114338',
        'ab_feature': 'z1',
        'openudid': '7064ff7d773ef8efeb5d6a25f62cd3d85035674f',
        'live_sdk_version': '1.6.5',
        # NOTE(review): differs from 'vid' only by "85_b1" vs "85B1"; looks
        # like an accidental rename artefact - confirm the intended value.
        'idfv': 'B0DB5DD0-FF94-4773-85_b1-EFC11132C2A4',
        'ac': 'WIFI',
        'os_version': '10.2.1',
        'ssmix': 'a',
        'device_platform': 'iphone',
        'iid': 8954368598,
        'ab_client': 'a1,f2,f7,e1',
        'device_type': 'iPhone 7 Plus',
        'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        'LBS_status': 'deny',
        'category': 'news_local',
        'city': '',
        'concern_id': '',
        'count': 20,
        'cp': '5089F85eBd4BDq1',
        'detail': 1,
        'image': 1,
        'language': 'zh-Hans-CN',
        'last_refresh_sub_entrance_interval': 1482077184,
        'loc_mode': 0,
        'max_behot_time': 1481063762,
        'refer': 1,
        'strict': 0,
        'tt_from': 'load_more',
        'user_city': '泸州'
    }

    time_interval = ONE_PAGE_TIME_INTERVAL
    content_released_time = tools.get_current_timestamp() - 86400  # one day
    current_timestamp = tools.get_current_timestamp()

    # (category request parameter, remark stored with the queued url)
    feeds = (
        ('news_local', NEWS_LOCAL),  # local (Luzhou) articles
        ('video', VIDEO),  # videos
    )

    max_behot_time = current_timestamp
    while max_behot_time >= content_released_time:
        max_behot_time -= time_interval

        # Shift the reported refresh time forward by a random 1-5 minutes
        # for every page.
        current_timestamp = current_timestamp + random.randint(60, 300)

        for category, remark in feeds:
            params['category'] = category
            params['last_refresh_sub_entrance_interval'] = current_timestamp
            params['max_behot_time'] = max_behot_time

            url = tools.joint_url(base_url, params)
            base_parser.add_url('VAApp_urls', SITE_ID, url, remark=remark)
示例#19
0
sys.path.append('../../')

import init
import base.base_parser as base_parser
import base.constance as Constance
import utils.tools as tools
from utils.log import log

# Required: site id
SITE_ID = 1
# Required: site name
NAME = '映客'

# Used when assembling request parameters
SECRET_KEY = "8D2E##1[5$^(38#%#d3z96;]35q#MD28"
# Evaluated once, when this module is imported
CURRENT_TIMESTAMP = tools.get_current_timestamp()
S_SG = tools.get_md5(SECRET_KEY + str(CURRENT_TIMESTAMP))  # signature: md5 of the fixed secret key + timestamp


@tools.run_safe_model(__name__)
# Required: registers this site's info
def add_site_info():
    """Register this site (``SITE_ID`` / ``NAME``) in ``LiveApp_site_info``."""
    log.debug('添加网站信息')
    base_parser.add_website_info(
        'LiveApp_site_info',  # table
        SITE_ID,
        'http://www.inke.cn/hotlive_list.html',  # site url
        NAME)

def monitor_task():
    """Watch the task pool forever and refill it from Oracle when it runs dry.

    The pool is polled every ``CHECK_HAVE_TASK_SLEEP_TIME`` seconds. Once it
    has been empty for more than ``MAX_NULL_TASK_TIME`` seconds the current
    round is considered finished: a summary of the round is logged (if one
    was started by this function), the task/url fingerprints are cleared and
    a new batch of tasks is loaded from Oracle into redis.

    This function never returns.
    """
    task_manager = TaskManager()
    total_time = 0  # seconds the pool has been continuously empty

    begin_time = None  # start date of the running round, None if no round
    round_task_count = 0  # number of tasks queued when the round started

    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            total_time = 0

        tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # A round has finished: collect some statistics
            if begin_time:
                # Elapsed time; the idle wait is not part of the round
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(
                    end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # Url counts
                depth_count_info = task_manager.get_ever_depth_count(5)

                # Article counts
                article_count_msg = statistic_article_count.get_article_count_msg(
                    begin_time, end_time)

                # The live pool count is always empty at this point, so
                # report the number of tasks queued at the start of the round.
                log.info(
                    '''
                    ------- 已做完一轮 --------
                    \r开始时间:%s
                    \r结束时间:%s
                    \r耗时:%s
                    \r网站数量:%s
                    \rurl数量信息:%s
                    \r文章数量信息:%s
                    ''' %
                    (begin_time, end_time, spend_hours, round_task_count,
                     tools.dumps_json(depth_count_info), article_count_msg))

                # The round is reported; do not report it again if the
                # refill below fails and this branch is re-entered.
                begin_time = None

            # Clear url fingerprints
            log.info('删除url指纹...')
            task_manager.clear_task()

            log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' %
                     (total_time, MAX_NULL_TASK_TIME))
            # Fetch the next batch of tasks
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    round_task_count = task_count
                    log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
            else:
                log.error('未从oracle中取到任务')
示例#21
0
    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
        Sogou WeChat is asked first; the WeChat public platform is used as a
        fallback when Sogou gave no answer or asked for a verification code.
        A checker that has been disabled is retried once more than
        TIME_INTERVAL has passed since its recorded timestamp.
        ---------
        @param account_id: account id passed to the Sogou checker
        @param account_name: account name passed to the Sogou checker
        @param __biz: biz value passed to the public platform checker
        ---------
        @result: the last checker's answer (compared against the constance
        codes), or '' when no checker was consulted
        '''

        service = WechatService
        result = ''

        # ---- Sogou WeChat ----
        if service._wechat_sogou_enable:
            result = service._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.VERIFICATION_CODE:
                # Blocked: disable the checker and remember when it happened
                service._wechat_sogou_enable = False
                service._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        elif tools.get_current_timestamp() - service._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Disabled for longer than TIME_INTERVAL: give it another try
            result = service._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                service._wechat_sogou_enable = True

            # Restart the waiting period whatever the outcome was
            service._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # Sogou produced a usable answer: nothing more to do
        if result and result != constance.VERIFICATION_CODE:
            return result

        # ---- WeChat public platform (fallback) ----
        if service._wechat_public_platform_enable:
            result = service._wechat_public_platform.is_have_new_article(__biz)
            if result == constance.ERROR:
                # Request failed: disable the checker and remember when
                service._wechat_public_platform_enable = False
                service._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        elif tools.get_current_timestamp() - service._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
            # Public platform disabled for longer than TIME_INTERVAL: retry
            result = service._wechat_public_platform.is_have_new_article(__biz)
            if result == constance.UPDATE:
                service._wechat_public_platform_enable = True

            # Restart the waiting period whatever the outcome was
            service._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result
示例#22
0
class WechatService():
    '''
    Shared state and helpers for crawling WeChat official accounts: pages
    accounts out of TAB_IOPM_SITE, checks for new articles through Sogou
    WeChat / the WeChat public platform, and stores articles and account
    info in ES. All state below lives on the class, so it is shared by
    every instance.
    '''

    _db = OracleDB()
    _es = ES()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    # Accounts loaded from the db and waiting to be handed out
    _todo_accounts = collections.deque()
    # Row number at which the next page of accounts starts
    _rownum = 1

    _is_done = False  # a full round over the accounts has been completed
    _is_all_done = False  # messages published today have been crawled for all accounts

    # wechat_sogou: enabled flag and the timestamp compared against
    # TIME_INTERVAL before a disabled checker is retried
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # wechat_public_platform: enabled flag and the timestamp compared against
    # TIME_INTERVAL before a disabled checker is retried
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        # Nothing per-instance to set up; all state is class-level
        pass

    def __load_todo_account(self):
        '''
        @summary: Refill the todo queue with the next page of accounts from
        TAB_IOPM_SITE. Does nothing while the queue still holds accounts.
        While _is_all_done is False only accounts without messages recorded
        today are selected; once it is True every monitored account is.
        Running past the last page sets _is_done and wraps to the first page;
        an empty first page of the filtered query sets _is_all_done.
        ---------
        ---------
        @result: None. Raises RuntimeError when even the unfiltered first
        page is empty, i.e. there is no monitored account to crawl.
        '''

        # Loop instead of recursing: with no accounts in the table the
        # recursive form repeats the same query until RecursionError.
        while not WechatService._todo_accounts:
            if WechatService._is_all_done:
                # Everything published today has been crawled: take all accounts
                today_filter = ''
            else:
                today_filter = ' and (today_msg is null or today_msg = 0)'

            sql = '''
                select *
                   from (select rownum r, t.id, t.domain, t.biz, t.name
                           from TAB_IOPM_SITE t
                          where t.biz is not null and mointor_status = 701{today_filter} and rownum < {size})
                  where r >= {rownum}
                '''.format(today_filter=today_filter,
                           rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)

            log.debug(sql)
            results = WechatService._db.find(sql)
            if results:
                WechatService._todo_accounts = collections.deque(
                    results)  # hand the page out as a queue
                WechatService._rownum += SIZE
                return

            if WechatService._rownum != 1:
                # Ran past the last page: one round is done, start over
                WechatService._is_done = True
                WechatService._rownum = 1
            elif not WechatService._is_all_done:
                # No account is left without messages today; flag it (used to
                # set the sleep time) and retry without the today filter
                WechatService._is_all_done = True
            else:
                # The unfiltered first page is empty as well. RuntimeError is
                # the parent of the RecursionError raised previously.
                raise RuntimeError(
                    'no monitored wechat account found in TAB_IOPM_SITE')

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
        Sogou WeChat is asked first; the WeChat public platform is used as a
        fallback when Sogou gave no answer or asked for a verification code.
        A checker that has been disabled is retried once more than
        TIME_INTERVAL has passed since its recorded timestamp.
        ---------
        @param account_id: account id passed to the Sogou checker
        @param account_name: account name passed to the Sogou checker
        @param __biz: biz value passed to the public platform checker
        ---------
        @result: the last checker's answer (compared against the constance
        codes), or '' when no checker was consulted
        '''

        service = WechatService
        result = ''

        # ---- Sogou WeChat ----
        if service._wechat_sogou_enable:
            result = service._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.VERIFICATION_CODE:
                # Blocked: disable the checker and remember when it happened
                service._wechat_sogou_enable = False
                service._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        elif tools.get_current_timestamp() - service._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Disabled for longer than TIME_INTERVAL: give it another try
            result = service._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                service._wechat_sogou_enable = True

            # Restart the waiting period whatever the outcome was
            service._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # Sogou produced a usable answer: nothing more to do
        if result and result != constance.VERIFICATION_CODE:
            return result

        # ---- WeChat public platform (fallback) ----
        if service._wechat_public_platform_enable:
            result = service._wechat_public_platform.is_have_new_article(__biz)
            if result == constance.ERROR:
                # Request failed: disable the checker and remember when
                service._wechat_public_platform_enable = False
                service._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        elif tools.get_current_timestamp() - service._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
            # Public platform disabled for longer than TIME_INTERVAL: retry
            result = service._wechat_public_platform.is_have_new_article(__biz)
            if result == constance.UPDATE:
                service._wechat_public_platform_enable = True

            # Restart the waiting period whatever the outcome was
            service._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result

    def get_next_account(self):
        '''
        @summary: Take accounts off the todo queue (reloading it when empty)
        until one should be crawled, then return it. An account is skipped
        only when the new-article check answers NOT_UPDATE and a full round
        has not been completed yet.
        ---------
        ---------
        @result: tuple (account_id, biz, is_done, is_all_done), where the two
        flags are the class flags as they were when the account was taken
        '''

        service = WechatService

        while True:
            if not service._todo_accounts:
                self.__load_todo_account()

            row = service._todo_accounts.popleft()
            account_id, account_biz, account_name = row[2], row[3], row[4]

            next_account = (account_id, account_biz, service._is_done,
                            service._is_all_done)

            sogou_down = not service._wechat_sogou_enable
            platform_down = not service._wechat_public_platform_enable

            if sogou_down:
                log.debug('搜狗微信不可用')

            if platform_down:
                log.debug('微信公众平台不可用')

            # New-article checking is switched off: take the account as is
            if not CHECK_NEW_ARTICLE:
                break

            # Neither checker is usable: take the account as is
            if sogou_down and platform_down:
                break

            # Skip the check one time in five and crawl through the client
            # directly, so Sogou WeChat is hit less often and is less likely
            # to ask for a verification code
            if random.randint(1, 5) == 1:
                log.debug('跳出 防止搜狗微信被封')
                break

            # Was anything published today?
            result = self.is_have_new_article(account_id, account_name,
                                              account_biz)

            # Move on to the next account only for "not updated", and only
            # while a full round is not done yet (otherwise this would spin
            # forever when no account has been updated). Every other answer
            # hands the account to the caller.
            if result == constance.NOT_UPDATE and not service._is_done:
                continue
            break

        # Reset the round flags; their values travel in the returned tuple
        service._is_done = False
        service._is_all_done = False

        return next_account

    def update_account_article_num(self, __biz):
        '''
        @summary: Count the account's articles in ES (recorded today and in
        total) and write the counts to TAB_IOPM_SITE. total_msg is only
        written when the total is non-zero.
        ---------
        @param __biz: biz value identifying the account
        ---------
        @result: None
        '''

        def count_articles(query):
            # size 0: only the hit total is needed, not the documents
            result = WechatService._es.search('wechat_article', {
                "size": 0,
                "query": query
            })
            return result.get('hits', {}).get('total', 0)

        # Read the date once so both range bounds always fall on the same
        # day, even when this runs across midnight
        today = tools.get_current_date('%Y-%m-%d')

        # Articles recorded today
        today_msg = count_articles({
            "filtered": {
                "filter": {
                    "range": {
                        "record_time": {
                            "gte": today + ' 00:00:00',
                            "lte": today + ' 23:59:59'
                        }
                    }
                },
                "query": {
                    'match': {
                        "__biz": __biz
                    }
                }
            }
        })

        # Articles recorded overall
        total_msg = count_articles({
            "filtered": {
                "query": {
                    'match': {
                        "__biz": __biz
                    }
                }
            }
        })

        # NOTE(review): __biz is interpolated straight into the statement;
        # switch to bind parameters if the db wrapper supports them.
        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d where t.biz = '%s'" % (
                today_msg, __biz)
        log.debug(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        '''
        @summary: Tell whether ES returns something for data_id in table
        (table is used as both the index and the doc type)
        ---------
        @param table:
        @param data_id:
        ---------
        @result: True / False
        '''
        found = WechatService._es.get(table, data_id=data_id, doc_type=table)
        return bool(found)

    def add_article_info(self, article_info):
        '''
        @summary: Log the article's key fields and store it in the
        wechat_article index of ES under its article_id
        ---------
        @param article_info: dict; title, release_time, author, account and
        url must be present, article_id is read with get()
        ---------
        @result: None
        '''

        logged_keys = ('title', 'release_time', 'author', 'account', 'url')
        logged_values = tuple(article_info[key] for key in logged_keys)

        log.debug('''
            -----文章信息-----
            标题     %s
            发布时间 %s
            作者     %s
            公众号   %s
            url      %s
            ''' % logged_values)

        article_id = article_info.get('article_id')
        WechatService._es.add('wechat_article', article_info, article_id)

    def add_account_info(self, account_info):
        '''
        @summary: Log the account info as json and store it in the
        wechat_account index of ES under its __biz value
        ---------
        @param account_info: dict; __biz is read with get()
        ---------
        @result: None
        '''

        account_json = tools.dumps_json(account_info)
        log.debug('''
            -----公众号信息-----
            %s''' % account_json)

        biz = account_info.get('__biz')
        WechatService._es.add('wechat_account', account_info, biz)