Example #1
    def deal_null_biz(self):
        sql = 'select id, name, domain from TAB_IOPM_SITE t where classify = 2 and t.biz is null'
        accounts_info = self._db.find(sql)

        for account_info in accounts_info:
            print(account_info)
            _id = account_info[0]
            account = account_info[1]
            account_id = account_info[2]

            account_info = self.get_account_info(account_id, account)
            log.debug(tools.dumps_json(account_info))

            if account_info.get('__biz'):
                account = account or account_info.get('account')
                account_id = account_id or account_info.get('account_id')
                __biz = account_info.get('__biz') or ''

                sql = "update TAB_IOPM_SITE set name = '%s', domain = '%s', biz = '%s' where id = %s"%(account, account_id, __biz, _id)
                log.debug(sql)
                self._db.update(sql)

            elif not account_info.get('check_info'):
                log.debug('查无此公众号 :%s'% account)

            tools.delay_time(60)
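Every example on this page throttles itself with tools.delay_time, but the helper itself is not shown here. A minimal sketch of what it presumably does (a thin wrapper around time.sleep; the default interval is an assumption) would be:

import time

def delay_time(sleep_time=60):
    # assumption: simply block the calling thread for sleep_time seconds
    time.sleep(sleep_time)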
Example #2
    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
                tools.delay_time(1)
            except Exception as e:
                log.error(e)
Example #3
    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)

            log.debug('缓存中文章数量 %s'%len(self._articles_deque))
            tools.delay_time(1)
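Examples #2 and #3 override run() on what appears to be a threading.Thread subclass that polls a self._thread_stop flag. A minimal sketch of the base class those methods assume (the class name and the stop() method are hypothetical; only _thread_stop appears in the source) could look like:

import threading

class StoppableThread(threading.Thread):  # hypothetical base class
    def __init__(self):
        super().__init__()
        self._thread_stop = False

    def stop(self):
        # signal the run() loop to exit on its next pass
        self._thread_stop = True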
Example #4
    def run(self):
        while True:
            tools.delay_time(60 * 60)  # update the clue weights once an hour
            print('更新线索权重...')
            self.load_clues_weight()
            self.load_classify_weight()
            self.load_related_factor()

            print('更新线索权重完毕')
Example #5
    def run(self):
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    print('无数据 休眠...')
                elif self.add_data_to_es(datas):
                    self._sync_count += len(datas)
                    tools.print_one_line('已同步 %d 条数据' % self._sync_count)
                tools.delay_time(1)
            except Exception as e:
                log.error(e)
Example #6
def main():
    while True:
        if task_status.is_doing:
            log.debug('正在做 不取任务')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # fetch a task
        get_task_url = MASTER_ADDRESS + '/task/get_task'
        print(get_task_url)
        update_task_url = MASTER_ADDRESS + '/task/update_task'
        data = tools.get_json_by_requests(get_task_url)
        # tasks = [[209690, '百度新闻', 11, 'http://news.baidu.com/?tn=news',  3]]
        print(data)
        tasks = data.get('tasks', [])
        parser_count = data.get('thread_count')

        def begin_callback():
            log.info('\n********** news begin **********')
            # update the task status to "doing"

            data = {'tasks': str(tasks), 'status': 602}

            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('更新任务状态 正在做...')

        def end_callback():
            log.info('\n********** news end **********')
            task_status.is_doing = False

            data = {'tasks': str(tasks), 'status': 603}

            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('更新任务状态 已做完!')

        # configure the spider
        spider = Spider(tab_urls='news:news_urls',
                        parser_count=parser_count,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params=tasks,
                        delete_tab_urls=False)

        # register the parser
        spider.add_parser(news_parser)

        spider.start()
Example #7
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    keywords = parser_params['keywords']

    for keyword in keywords:
        if keyword:
            url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=1&ie=utf8' % keyword
            if mongodb.find('WWA_wechat_account_url', {'url': url}):
                continue

            headers = {
                "Upgrade-Insecure-Requests": "1",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Cookie":
                "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
                "Host": "weixin.sogou.com"
            }

            html, r = tools.get_html_by_requests(url, headers=headers)
            # check whether any official account exists
            not_page_tip = '/new/pc/images/bg_404_2.png'
            if not_page_tip in html:
                continue

            # get the number of result pages
            regex = r'id="pagebar_container">.*>(\d*?)</a>.*?<a id="sogou_next"'
            page_num = tools.get_info(html, regex, fetch_one=True)
            page_num = int(page_num) if page_num else 1

            for page in range(1, page_num + 1):
                url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=%d&ie=utf8' % (
                    keyword, page)
                base_parser.add_url('WWA_wechat_account_url', SITE_ID, url)

            tools.delay_time()
Example #8
    def run(self):
        is_show_tip = False
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    if not is_show_tip:
                        print('\n{time} 无数据 休眠...    '.format(
                            time=tools.get_current_date()))
                        is_show_tip = True
                elif self.add_data_to_es(datas):
                    is_show_tip = False
                    self._sync_count += len(datas)
                    tools.print_one_line('已同步 %d 条数据' % self._sync_count)
                tools.delay_time(1)
            except Exception as e:
                log.error(e)
Example #9
    def run(self):
        while True:
            tools.delay_time(60 * 60)
            print('更新事件知识库...')
            self._event_knowledges = self.load_event_knowledges()
            print('更新事件知识库完毕')
Example #10
import sys

sys.path.append('..')
import init

import utils.tools as tools
from utils.log import log
from db.oracledb import OracleDB
from base.wechat_public_platform import WechatPublicPlatform
from base.wechat_sogou import WechatSogou

if __name__ == '__main__':
    db = OracleDB()
    # wechat_public_platform =  WechatPublicPlatform()
    wechat_sogou = WechatSogou()
    # fetch the WeChat account names
    # sql = 'select t.name, t.keyword2 from TAB_IOPM_CLUES t where t.zero_id = 7 and t.first_id = 137 and t.second_id = 183'
    # accounts = db.find(sql)
    accounts = ['骨朵网络影视']
    for account in accounts:
        account_id = ''
        account_name = account
        biz = wechat_sogou.get_biz(account_id=account_id, account=account_name)
        if biz:
            sql = "insert into TAB_IOPM_SITE t (t.id, t.name, t.position, t.classify, t.mointor_status, t.biz, t.priority) values (seq_iopm_site.nextval, '{name}', 1, 2, 701, '{biz}', 1)".format(
                name=account_name, biz=biz)
            print(sql)
            db.add(sql)
        tools.delay_time(10)
        # break
Example #11
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    weibo_id = url_info['remark']['search_keyword']
    monitor_type = url_info['remark']['monitor_type']

    for i in range(1, 100):
        weibo_content_url = root_url + '&page=%d' % i

        # proxy / request headers
        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        proxies = base_parser.get_proxies()
        headers["User-Agent"] = base_parser.get_user_agent()
        proxies = {}
        html = tools.get_json_by_requests(weibo_content_url,
                                          headers=headers,
                                          proxies=proxies)

        cards = tools.get_json_value(html, 'cards')
        if len(cards) < 2:
            base_parser.update_url('WWA_weibo_info_urls', root_url,
                                   Constance.DONE)
            return

        tools.delay_time(10)
        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')

            # proxy / request headers
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            proxies = base_parser.get_proxies()
            headers["User-Agent"] = base_parser.get_user_agent()
            proxies = {}
            origin_html, r = tools.get_html_by_requests(url,
                                                        headers=headers,
                                                        proxies=proxies)
            if not origin_html:
                continue

            release_time = get_release_time(mblog)
            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            sexy_image_url = []

            regexs = [r'"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = ['http://wx2.sinaimg.cn/large/' + pic_id + '.jpg'
                             for pic_id in image_url.split(',')]
                sexy_image_url = image_url
                image_url = ','.join(image_url)
            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')

            # sensitive events
            sensitive_id = ''
            if monitor_type == 1 or monitor_type == 2:
                sensitive_event_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[1].split(
                        ',') if sensitive_event_info[1] else []
                    keyword2 = sensitive_event_info[2].split(
                        ',') if sensitive_event_info[2] else []
                    keyword3 = sensitive_event_info[3].split(
                        ',') if sensitive_event_info[3] else []

                    if base_parser.is_violate(content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        sensitive_id = _id
                        break

            # violation events
            violate_id = ''
            if monitor_type == 0 or monitor_type == 2:
                vioation_knowledge_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[1].split(
                        ',') if vioation_knowledge_info[1] else []
                    keyword2 = vioation_knowledge_info[2].split(
                        ',') if vioation_knowledge_info[2] else []
                    keyword3 = vioation_knowledge_info[3].split(
                        ',') if vioation_knowledge_info[3] else []

                    if base_parser.is_violate(content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        violate_id = _id
                        break

            # download the video
            is_mp4 = tools.is_file(video_url, 'mp4')
            if is_mp4:
                local_video_path = FILE_LOCAL_PATH + 'videos/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                is_download = tools.download_file(video_url, local_video_path)
                video_url = local_video_path if is_download else ''
            else:
                video_url = ''

            log.debug('''
                      原文地址:     %s
                      微博ID:       %s
                      发布时间:     %s
                      来自:         %s
                      内容:         %s
                      图片地址:     %s
                      视频地址:     %s
                      转发数:       %s
                      点赞数:       %s
                      违规id:       %s
                      敏感事件       %s
                      图像鉴别地址   %s
                     ''' %
                      (url, weibo_id, release_time, come_from, content,
                       image_url, video_url, transpond_count, praise_count,
                       violate_id, sensitive_id, sexy_image_url))

            if content:
                base_parser.add_wwa_weibo_info_info(
                    'WWA_weibo_info_info',
                    SITE_ID,
                    url,
                    weibo_id,
                    release_time,
                    come_from,
                    content,
                    image_url,
                    video_url,
                    transpond_count,
                    praise_count,
                    violate_id,
                    sensitive_id=sensitive_id,
                    sexy_image_url=sexy_image_url)
        tools.delay_time()

    base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
Example #12
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
    "Host": "weixin.sogou.com",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Upgrade-Insecure-Requests": "1"
    }

    # get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)

    html, request = tools.get_html_by_requests(root_url, headers = headers, proxies = proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    print(root_url)
    log.debug('取文章链接' + check_info)

    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one = True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one = True)
    account_url = account_url.replace('&amp;',"&")
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers a captcha, so it is disabled for now

    html, request = tools.get_html_by_requests(account_url, headers = headers, proxies = proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    log.debug('''
        取文章详细内容 %s
        url %s
        request.headers %s
        '''%(check_info, account_url, request.headers))
    # print(html)

    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one = True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title' : title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;',"&")

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one = True)

        # download the images referenced in the content and replace their urls with local paths
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # derive the image suffix from the wx_fmt url parameter, defaulting to jpg
            if 'wx_fmt=' in image:
                fmt_start = image.find('wx_fmt=') + len('wx_fmt=')
                fmt_end = image.find('&', fmt_start)
                suffix = image[fmt_start:fmt_end if fmt_end != -1 else None]
            else:
                suffix = 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.' + suffix
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

        # violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            '''%(title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

        # articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;',"&")

            content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one = True)

            # download the images referenced in the content and replace their urls with local paths
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                # derive the image suffix from the wx_fmt url parameter, defaulting to jpg
                if 'wx_fmt=' in image:
                    fmt_start = image.find('wx_fmt=') + len('wx_fmt=')
                    fmt_end = image.find('&', fmt_start)
                    suffix = image[fmt_start:fmt_end if fmt_end != -1 else None]
                else:
                    suffix = 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.' + suffix
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

            # violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            '''%(title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
Example #13
def get_datas(root_url):
    count = 0
    page = 1
    retry_times = 0
    max_retry_times = 5

    while True:
        url = root_url%page
        print(url)

        datas = tools.get_json_by_requests(url, headers = HEADERS)
        if not datas:
            if retry_times > max_retry_times:
                break
            else:
                retry_times += 1
                tools.delay_time(2)
                continue
        else:
            retry_times = 0

        if datas['message'] == '查询记录为0':
            print('每页100条  第%d页无数据 共导出 %d 条数据'%(page, count))
            break

        messages = datas['data']['data']
        for msg in messages:
            if not msg['url']:
                continue

            weight = 0  # weight
            clues_ids = msg['cluesIds']

            # get the article id
            sql = 'select SEQ_IOPM_ARTICLE.nextval from dual'
            article_id = db.find(sql)[0][0]

            def export_callback(execute_type, sql, data_json):
                if execute_type != ExportData.EXCEPTION:
                    for clues_id in clues_ids.split(','):
                        print(clues_id)
                        key_map = {
                            'id':'vint_sequence.nextval',
                            'article_id':'vint_%d'%article_id,
                            'clues_id':'vint_%s'%clues_id
                        }
                        export_data.export_to_oracle(key_map = key_map, aim_table = 'TAB_IOPM_ARTICLE_CLUES_SRC', datas = [{}], sync_to_es = True)

            is_negative_emotion = 1 if msg['emotion'] == 2 else 0
            is_vip = vip_checked.is_vip(msg['url']) or vip_checked.is_vip(msg['websiteName']) or vip_checked.is_vip(msg['author'])

            # compute the weight
            print('===============================')
            url = IOPM_SERVICE_ADDRESS + '/related_sort?article_id=%d&clues_ids=%s&may_invalid=%s&vip_count=%s&negative_emotion_count=%s' % (
                article_id, msg['cluesIds'], msg['mayInvalid'] or '0', 1 if is_vip else 0, is_negative_emotion)
            weight = tools.get_json_by_requests(url).get('weight', 0)
            print(url)
            print('----------------------------')


            key_map = {
                'id':'vint_%d'%article_id,
                'account': 'str_account',
                'author': 'str_author',
                'clues_ids': 'str_cluesIds',
                'comment_count': 'int_commtcount',
                'content': 'clob_content',
                'emotion': 'vint_%s'%(msg['emotion'] or 3),
                'host': 'str_host',
                'keywords': 'str_keywords',
                'image_url': 'str_picture',
                'release_time': 'date_pubtime',
                'review_count': 'int_reviewCount',
                'title': 'str_title',
                'info_type': 'int_type',
                'up_count': 'int_upCount',
                'url': 'str_url',
                'uuid': 'str_uuid',
                'website_name': 'str_websiteName',
                'MAY_INVALID':'int_mayInvalid',
                'KEYWORD_CLUES_ID':'str_keywordAndIds',
                'keywords_count':'vint_%d'%len(msg['keywords'].split(',')),
                'is_vip':'vint_%d' % (1 if is_vip else 0),
                'weight':'vint_%s'%weight,
                'record_time':'vdate_%s'%tools.get_current_date(),
                'transmit_count':'str_forwardcount',
                'INTERACTION_COUNT':'vint_%s'%get_interaction_count(msg['commtcount'], msg['reviewCount'], msg['forwardcount'], msg['upCount'])
            }

            export_data.export_to_oracle(key_map = key_map, aim_table = 'TAB_IOPM_ARTICLE_INFO', unique_key = 'url', datas = msg, callback = export_callback, unique_key_mapping_source_key = {'url': 'str_url'}, sync_to_es = True)
            count += 1

        page += 1

Example #14
if __name__ == '__main__':
    #     "hot_value": 52.0,
    # "article_count": 8,
    # "clues_ids": "250,925,924,389,274,924,250,273,250,430,279,916,916,925,925,274,274,250,275,102,274,916,927,953,930,927,930,930,250,928,928,109,273,928",
    # "vip_count": 3,
    # "zero_ids": "6,2,5,7",
    # "negative_emotion_count": 8,
    # "hot_id": "f443d613-bc0e-330b-9643-7798e0c5ca97"
    related_sort = RelatedSortService()
    related_sort.start()
    clue_ids = '936,936,274,936'
    a = related_sort.deal_hot('25cd565c-4c0d-30a8-b853-21913e2dc6fa',
                              hot_value=52.0,
                              clues_id=clue_ids,
                              zero_ids='6,2,5,7',
                              article_count=8,
                              vip_count=3,
                              negative_emotion_count=8)
    print(a)
    tools.delay_time(5)

    # # b = related_sort.get_article_releated_weight(1123802)
    # # print(b)

    # # related_sort.load_related_factor()
    # # print(related_sort.get_related_factor(RelatedSortService.CLUES_FACTOR))
    # # print(related_sort.get_related_factor(RelatedSortService.HOT_FACTOR))
    # print(0.23 * 0.3 + 0.25 * 0.7 + 0.2* 1 +  0.5 *0)
Example #15
def monitor_task():
    task_manager = TaskManager()
    total_time = 0

    task_count = 0
    begin_time = None
    end_time = None
    spend_hours = None

    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            total_time = 0
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # one round finished; gather some statistics
            if begin_time:
                # elapsed time
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(
                    end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # url counts
                depth_count_info = task_manager.get_ever_depth_count(5)

                # article counts
                article_count_msg = statistic_article_count.get_article_count_msg(
                    begin_time, end_time)

                log.info(
                    '''
                    ------- 已做完一轮 --------
                    \r开始时间:%s
                    \r结束时间:%s
                    \r耗时:%s
                    \r网站数量:%s
                    \rurl数量信息:%s
                    \r文章数量信息:%s
                    ''' %
                    (begin_time, end_time, spend_hours, task_count,
                     tools.dumps_json(depth_count_info), article_count_msg))

            # clear the url fingerprints
            log.info('删除url指纹...')
            task_manager.clear_task()

            log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' %
                     (total_time, MAX_NULL_TASK_TIME))
            # fetch tasks
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
            else:
                log.error('未从oracle中取到任务')
Example #16
    def run(self):
        while True:
            tools.delay_time(60 * 60)
            print('更新keywords...')
            self._clues = self.get_clues()
            print('更新keywords完毕')
Example #17
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    monitor_type = url_info['remark']

    for i in range(2, 100):
        list_url = root_url + '&page=%d' % i
        html = tools.get_json_by_requests(list_url)

        cards = tools.get_json_value(html, 'cards')
        card_group = []
        for i in cards:
            card_group = tools.get_json_value(i, 'card_group')
            if card_group:
                break
        if not card_group:
            break

        for info in card_group:
            user_info = tools.get_json_value(info, 'user')
            _id = tools.get_json_value(user_info, 'id')

            user_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230283%s_-_INFO' % _id
            user_url_html = tools.get_json_by_requests(user_url)
            user_url_cards = tools.get_json_value(user_url_html, 'cards')
            user_url_card_group = tools.get_json_value(user_url_cards[0],
                                                       'card_group')
            area = ''
            for i in user_url_card_group:
                if tools.get_json_value(i, 'item_name') == '所在地':
                    area = tools.get_json_value(i, 'item_content')
                else:
                    continue

            name = tools.get_json_value(user_info, 'screen_name')
            is_verified_reason = 101
            verified_reason = tools.get_json_value(user_info,
                                                   'verified_reason')
            if verified_reason:
                is_verified_reason = 102
            sex = tools.get_json_value(user_info, 'gender')
            if sex == 'f':
                sex = 1
            elif sex == 'm':
                sex = 0
            else:
                sex = ''
            image_url = tools.get_json_value(user_info, 'profile_image_url')
            url = tools.get_json_value(user_info, 'profile_url')
            summary = tools.get_json_value(user_info, 'description')
            user_url_2 = 'http://m.weibo.cn/api/container/getIndex?containerid=100505%s' % _id
            user_url_html_2 = tools.get_json_by_requests(user_url_2)
            fans_count = tools.get_json_value(user_url_html_2,
                                              'userInfo.followers_count')
            follow_count = tools.get_json_value(user_url_html_2,
                                                'userInfo.follow_count')

            log.debug('''
                         用户id:     %s
                         微博昵称:   %s
                         微博地址:   %s
                         头像地址:   %s
                         微博认证:   %s
                         是否认证:   %s
                         所在地:     %s
                         性别:       %s
                         简介:       %s
                         粉丝数:     %s
                         关注数:     %s
                         监测状态:   %s
                        ''' % (_id, name, url, image_url, verified_reason,
                               is_verified_reason, area, sex, summary,
                               fans_count, follow_count, monitor_type))
            base_parser.add_wwa_weibo_user_info('WWA_weibo_user_info', SITE_ID,
                                                _id, name, url, image_url,
                                                verified_reason,
                                                is_verified_reason, area, sex,
                                                summary, fans_count,
                                                follow_count, monitor_type)
        tools.delay_time()
    base_parser.update_url('WWA_weibo_user_urls', root_url, Constance.DONE)
    tools.delay_time()


# parser({'url': 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D%E9%87%8D%E5%BA%86%E7%94%B5%E8%A7%86%E5%8F%B0'})
Example #18
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie":
        "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
        "Host": "weixin.sogou.com"
    }

    # parse
    html, request = tools.get_html_by_requests(root_url, headers=headers)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('取公众号列表' + check_info)

    # official-account info blocks
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)

    if not account_blocks:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    # article-count url
    regex = '<script>var account_anti_url = "(.*?)";</script>'
    articles_count_url = tools.get_info(html, regex, fetch_one=True)
    articles_count_url = tools.get_full_url('http://weixin.sogou.com',
                                            articles_count_url)
    articles_count_json = tools.get_json_by_requests(articles_count_url).get(
        'msg', {})

    for account_block in account_blocks:
        # print(account_block)
        regex = '<a.*?account_name.*?>(.*?)</a>'
        name = tools.get_info(account_block, regex, fetch_one=True)
        name = tools.del_html_tag(name)

        is_have = mongodb.find('WWA_wechat_official_accounts', {'name': name})
        if is_have:
            log.debug(name + " 已存在")
            continue

        regex = '<div class="img-box">.*?<img src="(.*?)"'
        image_url = tools.get_info(account_block, regex, fetch_one=True)

        # download the avatar image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''

        regex = '<p class="tit">.*?(<i></i>).*?<p class="info">'
        is_verified = 102 if tools.get_info(
            account_block, regex, fetch_one=True) else 101

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<li id="sogou_vr_.*?d="(.*?)">'
        article_count_key = tools.get_info(account_block,
                                           regex,
                                           fetch_one=True)
        article_count = articles_count_json.get(article_count_key, '')
        article_count = article_count[:article_count.find(',')]

        regex = '<dt>功能介绍.*?<dd>(.*?)</dd>'
        summary = tools.get_info(account_block, regex, fetch_one=True)
        summary = tools.del_html_tag(summary)

        regex = "认证.*?<dd>(.*?)</dd>"
        certification = tools.get_info(account_block, regex, fetch_one=True)

        regex = '微信扫一扫关注.*?<img.*?src="(.*?)"'
        barcode_url = tools.get_info(account_block, regex, fetch_one=True)
        barcode_url = barcode_url.replace('&amp;', "&")

        # download the QR-code image
        local_barcode_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(barcode_url, local_barcode_url)
        local_barcode_url = local_barcode_url if is_download else ''

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', "&")

        log.debug('''
            公众号名称          %s
            公众号账号          %s
            账号url             %s
            贴图                %s
            本地贴图            %s
            文章数量            %s
            简介                %s
            微信认证            %s
            是否加V(是否认证) %s
            二维码              %s
            本地二维码          %s
            ''' % (name, account_id, account_url, image_url, local_image_url,
                   article_count, summary, certification, is_verified,
                   barcode_url, local_barcode_url))

        base_parser.add_wechat_account_info(
            'WWA_wechat_official_accounts', site_id, name, account_id,
            account_url, image_url, local_image_url, article_count, summary,
            certification, is_verified, barcode_url, local_barcode_url)

    base_parser.update_url('WWA_wechat_account_url', root_url, Constance.DONE)
    tools.delay_time()
Example #19
def main():
    # files saved with Notepad get a BOM (\ufeff) prepended to config.conf; strip it before parsing
    content = tools.read_file('config.conf')
    tools.write_file('config.conf', content.replace('\ufeff', ''))

    # read the config
    cp = configparser.ConfigParser(allow_no_value = True)
    with codecs.open('config.conf', 'r', encoding='utf-8') as f:
        cp.read_file(f)

    sections = cp.sections()
    for section in sections:
        remote_url = cp.get(section, 'remote_url')
        local_save_path = cp.get(section, 'local_save_path')
        project_path = cp.get(section, 'project_path')
        main_lnk_paths = cp.get(section, 'main_lnk_paths').split(',')
        sync_files = cp.get(section, 'sync_files').split(',')
        ignore_files = cp.get(section, 'ignore_files').split(',')

        # run the update
        update_code = UpdateCode(remote_url, local_save_path, project_path, main_lnk_paths, sync_files, ignore_files)
        if update_code.check_remote_tag():
            update_code.download_code()
            update_code.copy_file()
            update_code.close_process()
            update_code.start_process()

if __name__ == '__main__':
    while True:
        main()
        tools.delay_time(60 * 60)
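For reference, main() above expects config.conf to hold one section per managed project, with exactly the keys it reads (remote_url, local_save_path, project_path, main_lnk_paths, sync_files, ignore_files). A hypothetical layout, with placeholder values, might be:

    [example_project]
    remote_url = http://example.com/code/project.zip
    local_save_path = D:/sync/downloads
    project_path = D:/projects/example_project
    main_lnk_paths = D:/projects/example_project/start.lnk
    sync_files = main.py,config.py
    ignore_files = local_config.py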
Example #20
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    # page_max_count = 236

    for keyword in keywords:
        next_keyword = False
        for page_num in range(1, 236):
            url = 'https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%s' % keyword + \
                  '&featurecode=20000320&luicode=10000011&lfid=100103type%3D1%26q%3D' + keyword \
                  + '&title=' + keyword + '&containerid=100103type%3D2%26q%3D' + keyword + '&page=%d' % page_num
            # base_parser.add_url('WEIBO_urls', SITE_ID, url)
            print('-----------------------------------')
            print(keyword)
            print(url)
            info_json = tools.get_json_by_requests(url)
            # log.debug(info_json)
            info_list = info_json.get('data', {}).get('cards', [])
            if info_list:
                info_list = info_list[0]['card_group']
            else:
                info_list = []
                next_keyword = True

            for weibo_info in info_list:
                content = weibo_info['mblog']['text']
                _id = weibo_info['mblog']['id']
                release_time = weibo_info['mblog']['created_at']
                release_time = get_release_time(release_time)

                url = 'https://m.weibo.cn/status/' + _id
                user_name = weibo_info['mblog']['user']['screen_name']
                video_url = tools.get_info(str(weibo_info),
                                           'stream_url":"(.+?)"',
                                           fetch_one=True)
                reposts_count = weibo_info['mblog']['reposts_count']
                comments_count = weibo_info['mblog']['comments_count']
                attitudes_count = weibo_info['mblog']['attitudes_count']

                is_continue = base_parser.save_weibo_info(
                    'WEIBO_info',
                    site_id=SITE_ID,
                    content=content,
                    release_time=release_time,
                    user_name=user_name,
                    video_url=video_url,
                    _id=_id,
                    url=url,
                    reposts_count=reposts_count,
                    comments_count=comments_count,
                    attitudes_count=attitudes_count)

                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break

            tools.delay_time(10)
Example #21
def read_data():
    try:
        with codecs.open('data.txt', 'r', encoding='utf-8') as datas:
            info = datas.readlines()[-1].strip()
        infos = info.split(',')
        return infos[-1]
    except Exception as err:
        print(err)
        return ''
while True:

    with open(packets_file_path, 'rb') as file:
        streams = file.read()
        # print(streams.decode('gbk', 'ignore'))

        tools.delay_time(2)
        stream_url = tools.get_info(streams.decode('gbk', 'ignore'), 'str_stream_url[a-z](.+?)\n',
                                    allow_repeat=False)
        try:
            print(len(stream_url))
            stream=stream_url[-1]

            url=read_data()
            if stream == url:
                print('数据相同无法存入')
            else:
                with open('data.txt', 'a', encoding="utf-8") as f:
                    f.write(stream + "\n")
                print('已存入链接:  %s' % stream)
        except Exception as err:
            print(err)
Example #22
            article_info['RELEASE_TIME'] = news.get('release_time')
            article_info['RECORD_TIME'] = news.get('record_time')

            # article_info['RELEASE_TIME'] = tools.get_current_date()
            article_info['URL'] = news.get('url')
            article_info['UUID'] = news.get('uuid')
            article_info['WEBSITE_NAME'] = news.get('site_name')
            article_info['AUTHOR'] = news.get('author')
            article_info['INFO_TYPE'] = 8
            article_info['ID'] = news.get('uuid')
            article_info['SUMMARY'] = news.get('title')
            article_info['IMAGE_URL'] = news.get('image_url')

            article_infos.append(article_info)

            max_record_time = news.get('record_time')

        self.deal_article(article_infos)
        self.record_now_record_time(max_record_time)


if __name__ == '__main__':
    video_sync = VideoSync()
    while True:
        video_news_list = video_sync.get_article()
        # print(video_news_list)
        if not video_news_list:
            log.debug('同步数据到最新 sleep %ds ...' % SLEEP_TIME)
            tools.delay_time(SLEEP_TIME)
        else:
            video_sync.deal_video_article(video_news_list)
Example #23
                oracledb.close()

                # push into redis as the task pool for the wechat crawler
                data = (oralce_id, account_id, account_name,
                        last_article_release_time, biz)
                self._redisdb.sadd('wechat:account', data)


if __name__ == '__main__':
    check_new_article = CheckNewArticle()
    while True:
        accounts = check_new_article.get_wait_check_account()

        while accounts:
            threads = []
            for i in range(MAX_THREAD_COUNT):
                if accounts:
                    thread = threading.Thread(
                        target=check_new_article.check_new_article,
                        args=(accounts.pop(0), ))
                    threads.append(thread)
                    thread.start()
                else:
                    break

            for thread in threads:
                thread.join()

        print('休眠10分钟之后检查下一轮')
        tools.delay_time(600)
Example #24
    def run(self):
        while True:
            self.monitor_cookies()
            tools.delay_time(MONITOR_COOKIES_INTERVAL)
Example #25
    def deal_news(self):
        '''
        @summary: process records from tab_news_csr_result
        ---------
        ---------
        @result:
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # 查询大于该csr_res_id 的信息
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'tab_news_csr_result 表中无大于%s的csr_res_id\nsleep %s...' %
                    (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    处理 tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # look for a similar article
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                # iterate over candidate articles and compare similarity
                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')

                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    break  # hots is sorted by match score, so the first item is the most similar; no need to check further

                # if a similar article is found, append the csr_res_id and bump the hot value; otherwise treat this record as a new hot topic
                if similar_hot:  # a similar hot topic exists
                    log.debug('找到所属热点:%s' % similar_hot.get('csr_content'))

                    data = {}

                    # bump the topic's hot count and append this article's id
                    data["hot"] = similar_hot["hot"] + 1
                    data["csr_res_ids"] = similar_hot[
                        "csr_res_ids"] + ',' + csr_res_id

                    # update the hot topic
                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # no similar hot topic found; make this article a new one
                    log.debug('无所属热点')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot',
                                 hot_info,
                                 data_id=csr_res_id)

                # persist the current id
                self._current_csr_res_id = csr_res_id
                self._save_current_id()
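compare_text and MIN_SIMILARITY are defined elsewhere in the project and are not shown on this page. A plausible stand-in, purely as an assumption and using difflib rather than whatever the project actually uses, is:

import difflib

MIN_SIMILARITY = 0.6  # placeholder threshold; the real value is defined elsewhere

def compare_text(text_a, text_b):
    # return a similarity ratio in [0, 1] between the two texts
    return difflib.SequenceMatcher(None, text_a, text_b).ratio()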