def image_predict(self, image_url):
    if not image_url:
        return -1
    # For a remote image: download it first, run recognition, then delete the local copy
    if image_url.startswith('http'):
        local_image_path = TEMP_IMAGE_SAVE_PATH + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_success = tools.download_file(image_url, local_image_path)
        image_url = local_image_path if is_success else image_url
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)
        tools.del_file(local_image_path)

    # For a local image that is not a jpg, convert it to jpg first
    elif not image_url.endswith('.jpg'):
        jpg_image_url = image_url[:image_url.rfind('.')] + '.jpg'
        is_success = ffmpeg_manager.convert_file_format(
            image_url, jpg_image_url)
        image_url = jpg_image_url if is_success else image_url
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

    else:
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)
    return result
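For the remote-image branch above, a minimal sketch with try/finally so the temporary download is removed even if recognition raises. This is a hypothetical refactoring (the name image_predict_with_cleanup is illustrative) against the same helpers shown above, assuming it lives on the same ImagePornRecg class:

def image_predict_with_cleanup(self, image_url):
    if not image_url:
        return -1
    local_image_path = ''
    try:
        if image_url.startswith('http'):
            local_image_path = TEMP_IMAGE_SAVE_PATH + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            if tools.download_file(image_url, local_image_path):
                image_url = local_image_path
        return ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)
    finally:
        if local_image_path:
            tools.del_file(local_image_path)  # clean up the temp file on all paths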
Example #2
def spider_picture(p_url, end):
    for i in range(1, 11):
        url = p_url + str(i) + end
        html, r = tools.get_html_by_requests(url)
        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)
        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j += 1
            #print(img_url,'---',name,'****',j)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
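Indexing names with a manual counter raises an IndexError whenever the two regexes match a different number of items; a sketch of the same inner loop with zip(), which simply stops at the shorter list:

        for img_url, name in zip(img_urls, names):
            name = tools.del_html_tag(name)
            tools.download_file(img_url, 'd:', '/picture/' + name + '.jpg')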
Example #3
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)
    # print(html)
    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)
    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url[0] if url else ''
        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name[0] if name else ''
        name = tools.del_html_tag(name)
        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
Example #4
def spider_picture(p_url, end):
    for i in range(1, 7):
        url = p_url + str(i) + end
        #print(url)
        html, r = tools.get_html_by_requests(url)
        #print(html)
        regex = '<a class="figure.*?<img.*?src="(.*?)"/>'
        img_urls = tools.get_info(html, regex)

        regex_name = 'data-widget-searchlist-tvname="(.*?)"'
        names = tools.get_info(html, regex_name)
        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # if not re.match(".jpg", img_url):
            #     img_url = img_url+'.jpg'
            #print(img_url,'---',name,'****',j)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/ViolatePicture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
Example #5
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    column_id = remark

    headers = {
        'Host': 'is.snssdk.com',
        'Accept': ' */*',
        'X-SS-Cookie':
        '_ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'tt-request-time': '1489990271848',
        'Cookie':
        ' _ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'User-Agent': 'News/6.0.1 (iPhone; iOS 10.2.1; Scale/3.00)',
        'Accept-Language': ' zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': ' keep-alive'
    }

    json = tools.get_json_by_requests(root_url, headers=headers)

    if not json:
        base_parser.update_url('VAApp_urls', root_url, Constance.EXCEPTION)
        return

    datas = json['data']
    for data in datas:
        data = tools.get_json_value(data, 'content')

        title = tools.get_json_value(data, 'title')

        # Skip if this title already exists in the database
        if db.find('VAApp_content_info', {'title': title}):
            continue

        abstract = tools.get_json_value(data, 'abstract')
        abstract = abstract or tools.get_json_value(data, 'content')

        img_url = (tools.get_json_value(data, 'image_list.url')
                   or tools.get_json_value(data, 'middle_image.url')
                   or tools.get_json_value(data, 'large_image_list.url'))
        if img_url:
            img_url = img_url.replace('.webp', '.jpg')

        original_url = (tools.get_json_value(data, 'article_url')
                        or tools.get_json_value(data, 'share_url'))

        release_time = (tools.get_json_value(data, 'publish_time')
                        or tools.get_json_value(data, '1481012423'))
        if release_time:
            release_time = tools.timestamp_to_date(release_time)

        video_msg = tools.get_json_value(data, 'video_play_info')  # needs further processing
        video_main_url = (tools.get_json_value(video_msg, 'video_list.video_2.main_url')
                          or tools.get_json_value(video_msg, 'video_list.video_1.main_url'))
        parse_video_url = tools.compile_js(PARSE_VIDEO_URL_JSFUNC)
        video_url = parse_video_url('base64decode', video_main_url)

        html = tools.get_html_auto_deal_code(original_url)
        regexs = [
            'class="article-content">(.*?)<div class="article-actions">',
            '<div class="content">(.*?)<div class="suggestion-list-con"',
            '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
            'class="yi-content-text">(.*?)<div class="yi-normal"',
            '<p.*?>(.*?)</p>'
        ]

        if video_url:
            content = abstract
        else:
            content = ''.join(tools.get_info(html, regexs))
            content = tools.del_html_tag(content)

        if len(content) < len(abstract):
            content = abstract

        # Sensitive events
        sensitive_id = ''
        sensitive_event_infos = oracledb.find(
            'select * from tab_mvms_sensitive_event')
        for sensitive_event_info in sensitive_event_infos:
            _id = sensitive_event_info[0]
            keyword1 = sensitive_event_info[3].split(
                ' ') if sensitive_event_info[3] else []
            keyword2 = sensitive_event_info[4].split(
                ' ') if sensitive_event_info[4] else []
            keyword3 = sensitive_event_info[5].split(
                ' ') if sensitive_event_info[5] else []

            if base_parser.is_violate(title + content,
                                      key1=keyword1,
                                      key2=keyword2,
                                      key3=keyword3):
                sensitive_id = _id

        # Violation events
        violate_id = ''
        violation_knowledge_infos = oracledb.find(
            'select * from tab_mvms_violation_knowledge')
        for violation_knowledge_info in violation_knowledge_infos:
            _id = violation_knowledge_info[0]
            keyword1 = violation_knowledge_info[2].split(
                ' ') if violation_knowledge_info[2] else []
            keyword2 = violation_knowledge_info[3].split(
                ' ') if violation_knowledge_info[3] else []
            keyword3 = violation_knowledge_info[4].split(
                ' ') if violation_knowledge_info[4] else []

            if base_parser.is_violate(title + content,
                                      key1=keyword1,
                                      key2=keyword2,
                                      key3=keyword3):
                violate_id = _id

        log.debug('''
            title:          %s
            abstract :      %s
            img_url :       %s
            original_url:   %s
            release_time :  %s
            video_main_url: %s
            video_url:      %s
            content :       %s
            column_id:      %d
            sensitive_id:   %d
            violate_id:     %d

            ''' % (title, abstract, img_url, original_url, release_time,
                   video_main_url, video_url, content, column_id,
                   sensitive_id or 0, violate_id or 0))

        # If this is the video column and has no sensitive or violating content, skip the download
        if column_id == VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # Download
        base_path = FILE_LOCAL_PATH
        is_download = 0

        # Download the image
        img_name = ''
        if img_url:
            img_name = 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(img_url, base_path, img_name)
            if not is_download:
                img_name = ''

        # Download the video
        video_name = ''
        if video_url:
            video_name = 'videos/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.mp4'
            is_download = tools.download_file(video_url, base_path, video_name)
            if not is_download:
                video_name = ''

        if original_url:
            base_parser.add_va_app_content_info(
                'VAApp_content_info', SITE_ID, title, abstract, img_url,
                img_name, original_url, release_time, video_url, video_name,
                content, column_id, is_download, sensitive_id, violate_id,
                STORAGE_ID)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
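The timestamped image/video download above repeats in several of these examples; a hypothetical helper consolidating it (the name download_media and its signature are illustrative assumptions, not part of the source API):

def download_media(url, base_path, subdir, ext):
    # builds names like images/2017-03-20/20170320120000.123456.jpg
    name = '%s/%s/%s.%s' % (subdir,
                            tools.get_current_date(date_format='%Y-%m-%d'),
                            tools.get_current_date(date_format='%Y%m%d%H%M%S.%f'),
                            ext)
    is_download = tools.download_file(url, base_path, name)
    return (name if is_download else ''), is_download

# usage: img_name, is_download = download_media(img_url, base_path, 'images', 'jpg')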
Example #6
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    weibo_id = url_info['remark']['search_keyword']
    monitor_type = url_info['remark']['monitor_type']

    for i in range(1, 100):
        weibo_content_url = root_url + '&page=%d' % i

        # Proxy
        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        proxies = base_parser.get_proxies()
        headers["User-Agent"] = base_parser.get_user_agent()
        proxies = {}  # proxy disabled for now
        html = tools.get_json_by_requests(weibo_content_url,
                                          headers=headers,
                                          proxies=proxies)

        cards = tools.get_json_value(html, 'cards')
        if len(cards) < 2:
            base_parser.update_url('WWA_weibo_info_urls', root_url,
                                   Constance.DONE)
            return

        tools.delay_time(10)
        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')

            # Proxy
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            proxies = base_parser.get_proxies()
            headers["User-Agent"] = base_parser.get_user_agent()
            proxies = {}  # proxy disabled for now
            origin_html, r = tools.get_html_by_requests(url,
                                                        headers=headers,
                                                        proxies=proxies)
            if not origin_html:
                continue

            release_time = get_release_time(mblog)
            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            sexy_image_url = []

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"',
                                                              '').replace(
                                                                  '\\n', '')
            if image_url:
                image_url = ['http://wx2.sinaimg.cn/large/' + pic_id + '.jpg'
                             for pic_id in image_url.split(',')]
                sexy_image_url = image_url
                image_url = ','.join(image_url)
            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')

            # Sensitive events
            sensitive_id = ''
            if monitor_type == 1 or monitor_type == 2:
                sensitive_event_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[1].split(
                        ',') if sensitive_event_info[1] else []
                    keyword2 = sensitive_event_info[2].split(
                        ',') if sensitive_event_info[2] else []
                    keyword3 = sensitive_event_info[3].split(
                        ',') if sensitive_event_info[3] else []

                    if base_parser.is_violate(content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        sensitive_id = _id
                        break

            # Violation events
            violate_id = ''
            if monitor_type == 0 or monitor_type == 2:
                violation_knowledge_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for violation_knowledge_info in violation_knowledge_infos:
                    _id = violation_knowledge_info[0]
                    keyword1 = violation_knowledge_info[1].split(
                        ',') if violation_knowledge_info[1] else []
                    keyword2 = violation_knowledge_info[2].split(
                        ',') if violation_knowledge_info[2] else []
                    keyword3 = violation_knowledge_info[3].split(
                        ',') if violation_knowledge_info[3] else []

                    if base_parser.is_violate(content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        violate_id = _id
                        break

            # Download the video
            is_mp4 = tools.is_file(video_url, 'mp4')
            if is_mp4:
                local_video_path = FILE_LOCAL_PATH + 'videos/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                is_download = tools.download_file(video_url, local_video_path)
                video_url = local_video_path if is_download else ''
            else:
                video_url = ''

            log.debug('''
                      original url:     %s
                      weibo id:         %s
                      release time:     %s
                      source:           %s
                      content:          %s
                      image url:        %s
                      video url:        %s
                      repost count:     %s
                      like count:       %s
                      violation id:     %s
                      sensitive id:     %s
                      image recog urls: %s
                     ''' %
                      (url, weibo_id, release_time, come_from, content,
                       image_url, video_url, transpond_count, praise_count,
                       violate_id, sensitive_id, sexy_image_url))

            if content:
                base_parser.add_wwa_weibo_info_info(
                    'WWA_weibo_info_info',
                    SITE_ID,
                    url,
                    weibo_id,
                    release_time,
                    come_from,
                    content,
                    image_url,
                    video_url,
                    transpond_count,
                    praise_count,
                    violate_id,
                    sensitive_id=sensitive_id,
                    sexy_image_url=sexy_image_url)
        tools.delay_time()

    base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
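tools.get_html_by_requests is used throughout these examples and is always unpacked as (html, response); a plausible minimal equivalent built on requests, offered as an assumption about its contract rather than the actual implementation:

import requests

def get_html_by_requests(url, headers=None, proxies=None):
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        r.encoding = r.apparent_encoding  # best-effort charset detection
        return r.text, r
    except requests.RequestException:
        return '', None  # callers above treat falsy html as a failed fetch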
Example #7
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    column_id = remark

    while True:
        try:
            json = tools.get_json_by_requests(root_url,
                                              headers=HEADERS,
                                              data=data,
                                              params=PARAMS)
            newslist = tools.get_json_value(json, 'newslist')
            if not newslist:
                break

            data['cachedCount'] += len(newslist)
            data['page'] += 1
            for news in newslist:
                # print(tools.dumps_json(news))
                title = tools.get_json_value(news, 'title')
                release_time = tools.get_json_value(news, 'time')
                abstract = tools.get_json_value(news, 'abstract')
                original_url = tools.get_json_value(news, 'url')
                thumbnails = tools.get_json_value(news, 'thumbnails_qqnews')
                img_url = thumbnails[0] if thumbnails else ''
                video_frame_url = tools.get_json_value(
                    news, 'video_channel.video.playurl')
                # Fetch the article content
                html = tools.get_html_by_urllib(original_url)
                content = tools.get_tag(html,
                                        name='div',
                                        attrs={'class': "main"},
                                        find_all=False)
                content = tools.del_html_tag(str(content))

                # Resolve the real video URL
                video_url = ''
                if video_frame_url:
                    video_vid = tools.get_info(html,
                                               r'vid\s*=\s*"\s*([^"]+)"',
                                               fetch_one=True)
                    video_url = ''.join(qq.qq_download_by_vid(video_vid))

                # Check for violations
                # Sensitive events
                sensitive_id = ''
                sensitive_event_infos = oracledb.find(
                    'select * from tab_mvms_sensitive_event')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[3].split(
                        ' ') if sensitive_event_info[3] else []
                    keyword2 = sensitive_event_info[4].split(
                        ' ') if sensitive_event_info[4] else []
                    keyword3 = sensitive_event_info[5].split(
                        ' ') if sensitive_event_info[5] else []

                    if base_parser.is_violate(title + content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        sensitive_id = _id

                # Violation events
                violate_id = ''
                violation_knowledge_infos = oracledb.find(
                    'select * from tab_mvms_violation_knowledge')
                for violation_knowledge_info in violation_knowledge_infos:
                    _id = violation_knowledge_info[0]
                    keyword1 = violation_knowledge_info[2].split(
                        ' ') if violation_knowledge_info[2] else []
                    keyword2 = violation_knowledge_info[3].split(
                        ' ') if violation_knowledge_info[3] else []
                    keyword3 = violation_knowledge_info[4].split(
                        ' ') if violation_knowledge_info[4] else []

                    if base_parser.is_violate(title + content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        violate_id = _id

                log.debug(
                    '''
                title:          %s
                abstract :      %s
                img_url :       %s
                original_url:   %s
                release_time :  %s
                video_url:      %s
                content :       %s
                column_id:      %d
                sensitive_id:   %s
                violate_id:     %s

                ''' %
                    (title, abstract, img_url, original_url, release_time,
                     video_url, content, column_id, sensitive_id, violate_id))

                # Download
                base_path = FILE_LOCAL_PATH
                is_download = 0

                # Download the image
                img_name = ''
                if img_url:
                    img_name = 'images/' + tools.get_current_date(
                        date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
                    is_download = tools.download_file(img_url, base_path,
                                                      img_name)
                    if not is_download:
                        img_name = ''

                # Download the video
                video_name = ''
                if video_url:
                    video_name = 'videos/' + tools.get_current_date(
                        date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                            date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                    is_download = tools.download_file(video_url, base_path,
                                                      video_name)
                    if not is_download:
                        video_name = ''

                if original_url:
                    base_parser.add_va_app_content_info(
                        'VAApp_content_info', SITE_ID, title, abstract,
                        img_url, img_name, original_url, release_time,
                        video_url, video_name, content, column_id, is_download,
                        sensitive_id, violate_id, STORAGE_ID)

        except Exception as e:
            log.debug(e)
            break  # bail out instead of retrying forever on a persistent error

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
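The while True loop above terminates only when newslist comes back empty; a sketch of the same pagination with an explicit page cap as a safety net (MAX_PAGES is an assumed constant, not from the source):

    MAX_PAGES = 100
    while data['page'] <= MAX_PAGES:
        json = tools.get_json_by_requests(root_url, headers=HEADERS,
                                          data=data, params=PARAMS)
        newslist = tools.get_json_value(json, 'newslist')
        if not newslist:
            break
        data['cachedCount'] += len(newslist)
        data['page'] += 1
        # ... process newslist exactly as above ...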
Example #8
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie":
        "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
        "Host": "weixin.sogou.com"
    }

    # Parse
    html, request = tools.get_html_by_requests(root_url, headers=headers)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('Fetching official account list ' + check_info)

    # Official account info blocks
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)

    if not account_blocks:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    # Article-count URL
    regex = '<script>var account_anti_url = "(.*?)";</script>'
    articles_count_url = tools.get_info(html, regex, fetch_one=True)
    articles_count_url = tools.get_full_url('http://weixin.sogou.com',
                                            articles_count_url)
    articles_count_json = tools.get_json_by_requests(articles_count_url).get(
        'msg', {})

    for account_block in account_blocks:
        # print(account_block)
        regex = '<a.*?account_name.*?>(.*?)</a>'
        name = tools.get_info(account_block, regex, fetch_one=True)
        name = tools.del_html_tag(name)

        is_have = mongodb.find('WWA_wechat_official_accounts', {'name': name})
        if is_have:
            log.debug(name + " already exists")
            continue

        regex = '<div class="img-box">.*?<img src="(.*?)"'
        image_url = tools.get_info(account_block, regex, fetch_one=True)

        # Download the account image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''

        regex = '<p class="tit">.*?(<i></i>).*?<p class="info">'
        is_verified = 102 if tools.get_info(
            account_block, regex, fetch_one=True) else 101

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<li id="sogou_vr_.*?d="(.*?)">'
        article_count_key = tools.get_info(account_block,
                                           regex,
                                           fetch_one=True)
        article_count = articles_count_json.get(article_count_key, '')
        article_count = article_count.split(',')[0]

        regex = '<dt>功能介绍.*?<dd>(.*?)</dd>'
        summary = tools.get_info(account_block, regex, fetch_one=True)
        summary = tools.del_html_tag(summary)

        regex = "认证.*?<dd>(.*?)</dd>"
        certification = tools.get_info(account_block, regex, fetch_one=True)

        regex = '微信扫一扫关注.*?<img.*?src="(.*?)"'
        barcode_url = tools.get_info(account_block, regex, fetch_one=True)
        barcode_url = barcode_url.replace('&amp;', "&")

        # Download the QR code image
        local_barcode_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(barcode_url, local_barcode_url)
        local_barcode_url = local_barcode_url if is_download else ''

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', "&")

        log.debug('''
            account name         %s
            account id           %s
            account url          %s
            image                %s
            local image          %s
            article count        %s
            summary              %s
            wechat certification %s
            verified (has V)     %s
            qr code url          %s
            local qr code        %s
            ''' % (name, account_id, account_url, image_url, local_image_url,
                   article_count, summary, certification, is_verified,
                   barcode_url, local_barcode_url))

        base_parser.add_wechat_account_info(
            'WWA_wechat_official_accounts', site_id, name, account_id,
            account_url, image_url, local_image_url, article_count, summary,
            certification, is_verified, barcode_url, local_barcode_url)

    base_parser.update_url('WWA_wechat_account_url', root_url, Constance.DONE)
    tools.delay_time()
Example #9
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # Get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # Parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)

    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('Fetching article links ' + check_info)

    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # Official account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)
    # URL
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', "&")
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # Proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers a captcha, so it is disabled for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        Fetching article details %s
        url %s
        request.headers %s
        ''' % (check_info, account_url, request.headers))
    # print(html)

    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + " already exists")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # Download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', "&")

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # Download the images inside content, then replace their original URLs with local paths
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # derive the extension from the wx_fmt query parameter, defaulting to jpg
            ext = image.split('wx_fmt=')[1].split('&')[0] if 'wx_fmt=' in image else 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.' + ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # Sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []

                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # Violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            violation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for violation_knowledge_info in violation_knowledge_infos:
                _id = violation_knowledge_info[0]
                keyword1 = violation_knowledge_info[1].split(',') if violation_knowledge_info[1] else []
                keyword2 = violation_knowledge_info[2].split(',') if violation_knowledge_info[2] else []
                keyword3 = violation_knowledge_info[3].split(',') if violation_knowledge_info[3] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            title            %s
            summary          %s
            image url        %s
            article url      %s
            release time     %s
            content          %s
            local image path %s
            violation status %s
            sensitive id     %s
            image recog urls %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url='', local_image_url=local_image_url, violate_status=violate_id, sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

        # Articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # Download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', "&")

            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # Download the images inside content, then replace their original URLs with local paths
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                # derive the extension from the wx_fmt query parameter, defaulting to jpg
                ext = image.split('wx_fmt=')[1].split('&')[0] if 'wx_fmt=' in image else 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.' + ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # Sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []

                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # Violation events
            violate_id = ''
            violation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for violation_knowledge_info in violation_knowledge_infos:
                _id = violation_knowledge_info[0]
                keyword1 = violation_knowledge_info[2].split(',') if violation_knowledge_info[2] else []
                keyword2 = violation_knowledge_info[3].split(',') if violation_knowledge_info[3] else []
                keyword3 = violation_knowledge_info[4].split(',') if violation_knowledge_info[4] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
            title            %s
            summary          %s
            image url        %s
            article url      %s
            release time     %s
            content          %s
            local image path %s
            violation status %s
            sensitive id     %s
            image recog urls %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url='', local_image_url=local_image_url, violate_status=violate_id, sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
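The inline wx_fmt extension parsing above can also be done with the standard library; a short sketch (the helper name image_ext is illustrative, not from the source):

from urllib.parse import urlparse, parse_qs

def image_ext(image_url, default='jpg'):
    # wx_fmt carries the real image format in the URL's query string
    params = parse_qs(urlparse(image_url).query)
    return params.get('wx_fmt', [default])[0]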
Example #10
def download_code(self):
    tools.download_file(self._remote_zip_url, self._zip_path)
    tools.unpack_zip(self._zip_path, self._unpack_path)
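tools.download_file and tools.unpack_zip are project helpers; a plausible standard-library equivalent of this method's behavior, offered as an assumption about their semantics rather than the actual implementation:

import urllib.request
import zipfile

def download_and_unpack(remote_zip_url, zip_path, unpack_path):
    urllib.request.urlretrieve(remote_zip_url, zip_path)  # fetch the archive
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(unpack_path)                        # extract everything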
Example #11
def save_info(table,
              site_id,
              site_name='',
              url='',
              title='',
              content='',
              release_time='',
              image_url='',
              video_url='',
              is_out_link=1,
              download_image=False,
              is_debug=False,
              es_read_status='',
              info_type=''):
    # global num
    # if num<2000:
    #     num+=1
    #     image_recogs=image_recog(image_url)
    # else:
    #     image_recogs=5

    if not download_image:
        sexy_image_url = image_url
        local_image_path = ''
    else:
        file_local_path = tools.get_conf_value('config.conf', 'files',
                                               'zhejiang_app_save_path')
        if image_url:
            img_name = 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            tools.download_file(image_url, file_local_path, img_name)
            local_image_path = file_local_path + img_name
            sexy_image_url = local_image_path
        else:
            local_image_path = ''
            sexy_image_url = ''

    temporary_content = content[:400]  # slicing already handles shorter content

    # record_time = tools.get_current_date()
    # release_time = tools.format_date(release_time)
    try:
        release_time = tools.format_date(release_time)
    except Exception as e:
        log.debug(e, release_time, url)
    record_time = tools.get_current_date()
    if release_time > record_time:
        return
    content_info = {
        'site_name': site_name,
        'video_url': video_url,
        'image_url': image_url,
        'temporary_content': temporary_content,
        'title': title,
        # 'video_local_path': local_video_path,\
        'img_stor_path': local_image_path,
        'release_time': release_time,
        'is_out_link': is_out_link,
        'url': url,
        'es_read_status': 0,
        'site_id': site_id,
        'read_status': 0,
        'record_time': record_time,
        # 'sexy_image_url': sexy_image_url, 'sexy_image_status': '', 'image_pron_status': image_recogs
    }
    # drop the 400-char preview and store the full content instead
    content_info.pop('temporary_content')
    content_info['content'] = content
    if db.add(table, content_info):
        log.debug(content_info)