def inner_add_url(url, keyword):
    # Follow the search-result pagination until the API reports no next page.
    while url:
        html_json = tools.get_json_by_requests(url)
        json_value = tools.get_json_value(html_json, 'obj.pageNumberStack')
        has_next = tools.get_json_value(html_json, 'obj.hasNext')
        if has_next:
            url = 'http://sj.qq.com/myapp/searchAjax.htm?kw=%s&pns=%s&sid=0' % (keyword, json_value)
            base_parser.add_url('WWA_search_app_urls', SITE_ID, url)
        else:
            break
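
# Hypothetical usage sketch (the keyword 'wechat' is illustrative, not from the
# original code): seed page 0 of the MyApp search API and let the loop queue
# the remaining pages.
#   inner_add_url('http://sj.qq.com/myapp/searchAjax.htm?kw=wechat&pns=0&sid=0', 'wechat')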

# Example 2

def getdownload(episode_download_url_json):
    # Resolve the final video location from LeTV's playurl JSON: the download
    # URL is the domain plus the 1080p dispatch path plus a fixed player
    # query string.
    episode_json = tools.get_json_by_requests(episode_download_url_json)
    episode_download_url = tools.get_json_value(episode_json,
                                                'msgs.playurl.domain')
    episode_download_url = episode_download_url and episode_download_url[0] or ''
    episode_download_url_definition = tools.get_json_value(
        episode_json, 'msgs.playurl.dispatch.1080p')
    episode_download_url_definition = episode_download_url_definition and episode_download_url_definition[0] or ''
    episode_download_url = episode_download_url + episode_download_url_definition
    episode_download_url += "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}".format(
        random.random(), '1080p')
    # The dispatch URL answers with a redirect descriptor; 'location' holds
    # the playable address.
    episode_download_url_json = tools.get_json_by_requests(
        episode_download_url)
    episode_download_url = tools.get_json_value(episode_download_url_json,
                                                'location')
    return episode_download_url

# Example 3

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    column_id = remark

    headers = {
        'Host': 'is.snssdk.com',
        'Accept': '*/*',
        'X-SS-Cookie':
        '_ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'tt-request-time': '1489990271848',
        'Cookie':
        ' _ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'User-Agent': 'News/6.0.1 (iPhone; iOS 10.2.1; Scale/3.00)',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    # Send the captured headers (they carry the session cookies) with the request.
    json = tools.get_json_by_requests(root_url, headers=headers)

    if not json:
        base_parser.update_url('VAApp_urls', root_url, Constance.EXCEPTION)
        return

    datas = json['data']
    for data in datas:
        data = tools.get_json_value(data, 'content')

        title = tools.get_json_value(data, 'title')

        # Skip records that already exist in the database
        if db.find('VAApp_content_info', {'title': title}):
            continue

        abstract = tools.get_json_value(data, 'abstract')
        abstract = abstract or tools.get_json_value(data, 'content')

        img_url = tools.get_json_value(data, 'image_list.url')
        img_url = img_url or tools.get_json_value(data, 'middle_image.url')
        img_url = img_url or tools.get_json_value(data, 'large_image_list.url')
        img_url = img_url and img_url.replace('.webp', '.jpg') or img_url

        original_url = tools.get_json_value(data, 'article_url')
        original_url = original_url or tools.get_json_value(data, 'share_url')

        release_time = tools.get_json_value(data, 'publish_time')
        release_time = release_time or tools.get_json_value(data, '1481012423')
        release_time = release_time and tools.timestamp_to_date(release_time) or release_time

        video_msg = tools.get_json_value(data, 'video_play_info')  # needs further processing
        video_main_url = tools.get_json_value(video_msg,
                                              'video_list.video_2.main_url')
        video_main_url = video_main_url or tools.get_json_value(
            video_msg, 'video_list.video_1.main_url')
        parse_video_url = tools.compile_js(PARSE_VIDEO_URL_JSFUNC)
        video_url = parse_video_url('base64decode', video_main_url)
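        # The compiled JS helper just base64-decodes the address; a pure-Python
        # equivalent (assuming a standard base64 payload) would be:
        #   import base64
        #   video_url = base64.b64decode(video_main_url).decode('utf-8')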

        html = tools.get_html_auto_deal_code(original_url)
        regexs = [
            'class="article-content">(.*?)<div class="article-actions">',
            '<div class="content">(.*?)<div class="suggestion-list-con"',
            '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
            'class="yi-content-text">(.*?)<div class="yi-normal"',
            '<p.*?>(.*?)</p>'
        ]
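        # Each pattern targets the article body in a different page template;
        # the bare <p> pattern is a last-resort fallback.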

        if video_url:
            content = abstract
        else:
            content = ''.join(tools.get_info(html, regexs))
            content = tools.del_html_tag(content)

        if len(content) < len(abstract):
            content = abstract

        # Sensitive events
        sensitive_id = ''
        sensitive_event_infos = oracledb.find(
            'select * from tab_mvms_sensitive_event')
        for sensitive_event_info in sensitive_event_infos:
            _id = sensitive_event_info[0]
            keyword1 = sensitive_event_info[3].split(
                ' ') if sensitive_event_info[3] else []
            keyword2 = sensitive_event_info[4].split(
                ' ') if sensitive_event_info[4] else []
            keyword3 = sensitive_event_info[5].split(
                ' ') if sensitive_event_info[5] else []

            if base_parser.is_violate(title + content,
                                      key1=keyword1,
                                      key2=keyword2,
                                      key3=keyword3):
                sensitive_id = _id

        # Violation events
        violate_id = ''
        vioation_knowledge_infos = oracledb.find(
            'select * from tab_mvms_violation_knowledge')
        for vioation_knowledge_info in vioation_knowledge_infos:
            _id = vioation_knowledge_info[0]
            keyword1 = vioation_knowledge_info[2].split(
                ' ') if vioation_knowledge_info[2] else []
            keyword2 = vioation_knowledge_info[3].split(
                ' ') if vioation_knowledge_info[3] else []
            keyword3 = vioation_knowledge_info[4].split(
                ' ') if vioation_knowledge_info[4] else []

            if base_parser.is_violate(title + content,
                                      key1=keyword1,
                                      key2=keyword2,
                                      key3=keyword3):
                violate_id = _id

        log.debug('''
            title:          %s
            abstract :      %s
            img_url :       %s
            original_url:   %s
            release_time :  %s
            video_main_url: %s
            video_url:      %s
            content :       %s
            column_id:      %d
            sensitive_id:   %d
            violate_id:     %d

            ''' % (title, abstract, img_url, original_url, release_time,
                   video_main_url, video_url, content, column_id, sensitive_id
                   and sensitive_id or 0, violate_id and violate_id or 0))

        # If this is the video column and nothing sensitive or violating was
        # matched, skip the download entirely.
        if column_id == VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # Downloads
        base_path = FILE_LOCAL_PATH
        is_download = 0

        # Download the cover image
        img_name = ''
        if img_url:
            img_name = 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(img_url, base_path, img_name)
            if not is_download:
                img_name = ''

        # Download the video
        video_name = ''
        if video_url:
            video_name = 'videos/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.mp4'
            is_download = tools.download_file(video_url, base_path, video_name)
            if not is_download:
                video_name = ''

        if original_url:
            base_parser.add_va_app_content_info(
                'VAApp_content_info', SITE_ID, title, abstract, img_url,
                img_name, original_url, release_time, video_url, video_name,
                content, column_id, is_download, sensitive_id, violate_id,
                STORAGE_ID)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)

# Example 4

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    json = tools.get_json_by_requests(root_url)

    # Anchor (streamer) info
    lives = tools.get_json_value(json, 'lives')
    # print(tools.dumps_json(lives))

    for live in lives:
        name = tools.get_json_value(live, 'creator.nick')
        image_url = tools.get_json_value(live, 'creator.portrait')
        image_url = tools.get_full_url('http://img2.inke.cn', image_url)

        room_id = tools.get_json_value(live, 'creator.id')
        room_url = tools.get_json_value(live, 'share_addr')
        video_path = tools.get_json_value(live, 'stream_addr')
        watched_count = tools.get_json_value(live, 'online_users')
        address = tools.get_json_value(live, 'city')
        # Fetch the follower count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': room_id,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
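        # These query parameters replay a captured request from the Inke iOS
        # client; s_sg appears to be a request signature and s_st a timestamp
        # (naming assumption, not confirmed by the original code).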
        fans_json = tools.get_json_by_requests(
            'http://120.55.238.158/api/user/relation/numrelations', params)
        fans_count = tools.get_json_value(fans_json, 'num_followers')

        # Request URL for the anchor's viewer count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': tools.get_json_value(live, 'id'),
            'multiaddr': 1,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }

        watched_count_url = 'http://120.55.238.158/api/live/infos'  #?lc=0000000000000048&cc=TG0001&cv=IK3.8.60_Iphone&proto=7&idfa=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&idfv=5779214D-BC8F-446E-A547-913048F7F935&devi=0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7&osversion=ios_10.200000&ua=iPhone9_2&imei=&imsi=&uid=207821358&sid=20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1&conn=wifi&mtid=987c70ecbcd643998ea6bcd3b8868934&mtxid=b0958e29253f&logid=133&id=1487572239333810%2C1487572432485069%2C1487572763094071%2C1487573160678176%2C1487571635332280&multiaddr=1&s_sg=c3493ab9d9b2e19cfc20f98bb75ff72f&s_sc=100&s_st=1487573119'
        watched_count_url = tools.joint_url(watched_count_url, params)

        live_info = tools.get_json_by_requests(watched_count_url)
        sex = live_info['lives'][0]['creator']['sex']
        sex = 0 if sex == '1' else 1  # DB convention: 0 = male, 1 = female; on Inke 0 and 3 are female, 1 is male
        age = ''

        log.debug(
            '''
            name:              %s
            portrait:          %s
            anchor id:         %s
            room url:          %s
            stream address:    %s
            watched count:     %s
            address:           %s
            fans count:        %s
            sex:               %s
            age:               %s
            watched-count url: %s
            ''' %
            (name, image_url, room_id, room_url, video_path, watched_count,
             address, fans_count, sex, age, watched_count_url))

        base_parser.add_anchor_info('LiveApp_anchor_info',
                                    SITE_ID,
                                    name=name,
                                    image_url=image_url,
                                    room_id=room_id,
                                    room_url=room_url,
                                    video_path=video_path,
                                    watched_count=watched_count,
                                    address=address,
                                    fans_count=fans_count,
                                    sex=sex,
                                    age=age,
                                    watched_count_url=watched_count_url)

    base_parser.update_url('LiveApp_urls', root_url, Constance.DONE)
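
# Example 5
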
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, r = tools.get_html_by_requests(source_url)
    regexs = '<ul class="st-list cfix">(.*?)<div class="ssPages area">'
    lis = tools.get_info(html, regexs)
    regexs = '<li>(.*?)</li>'
    html_lis = tools.get_info(lis, regexs)
    for html_li in html_lis:
        url_regex = '<a href="(.*?)"'
        url = tools.get_info(html_li, url_regex)
        url = url and url[0] or ''
        # Absolute URL of each program
        url = "http:" + url

        everyone_html, r = tools.get_html_by_requests(url)

        # Program info
        regexs_program_name = '<h3 class="lh-tit">.*?<a.*?>(.*?)</a>'
        program_name = tools.get_info(html_li, regexs_program_name)
        program_name = program_name and program_name[0] or ''


        # Program URL
        program_url = url

        # Program release time ('发布时间' is the label in the page markup)
        release_time_regex = '发布时间:(.*?)</p>'
        release_time = tools.get_info(html_li, release_time_regex)
        release_time = release_time and release_time[0] or ''

        # Program summary
        regexs_summary = '<span class="full_intro" style="display: none">(.*?)</span>'
        summary = tools.get_info(everyone_html, regexs_summary)
        summary = summary and summary[0] or ''
        summary = tools.del_html_tag(summary)

        # Program image
        img_url_regex = '<img.*?src="(.*?)".*?>'
        image_url = tools.get_info(html_li, img_url_regex)
        image_url = image_url and image_url[0] or ''
        image_url = "http:"+image_url

        # log.debug('''
        #          depth                       = %s
        #          program_name                = %s
        #          program_url                 = %s
        #          image_url                   = %s
        #          summary                     = %s
        #          release_time                = %s
        #       ''' % (depth, program_name, program_url, image_url, summary, release_time))

        program_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url,
                                                  image_url=image_url,
                                                  episode='', directors='', actors='', summary=summary,
                                                  release_time=release_time)

        # Fetch the per-episode JSON URL parameters: playlistId and variety_year

        everyone_html, r = tools.get_html_by_requests(program_url)
        playlistId_regex = r'var playlistId="(\d*?)";'
        playlistId = tools.get_info(everyone_html, playlistId_regex)
        playlistId = ''.join(playlistId)


        # Fetch the variety_year parameter for the per-episode JSON URL

        variety_years_regex = '<li class="v-year">(.*?)</li>'
        variety_years_html = tools.get_info(everyone_html, variety_years_regex)
        variety_years_regex = r'<em>(\d*?)</em>'
        variety_years = tools.get_info(variety_years_html, variety_years_regex)


        if playlistId and variety_years:
            for variety_year in variety_years:

                episode_json_url = 'http://tv.sohu.com/item/VideoServlet?callback=&source=sohu&id=' + \
                                   playlistId + '&year=' + variety_year + '&month=0&page=1'
                episode_json = tools.get_json_by_requests(episode_json_url)
                # print(tools.dumps_json(episode_json))
                # Get the episode list (its length is the episode count)
                episode_json_infos = tools.get_json_value(episode_json, 'videos')
                # episode = len(episode_json_infos)

                for episode_json_info in episode_json_infos:
                    # Episode summary
                    episode_summary = tools.get_json_value(episode_json_info, 'videoDesc')
                    # Episode name
                    episode_name = tools.get_json_value(episode_json_info, 'title')
                    # Episode URL
                    episode_url = tools.get_json_value(episode_json_info, 'url')
                    # Download address
                    episode_download_url = you_get.get_video_url(episode_url)
                    if episode_download_url:
                        episode_download_url = '^_^'.join(episode_download_url)
                    # Episode image URL
                    episode_image_url = tools.get_json_value(episode_json_info, 'pic10')
                    # Current episode number
                    episode_num = tools.get_json_value(episode_json_info, 'showDate')

                    download_status = 102

                    time_length = ''

                    if episode_download_url:
                        # log.debug('''
                        #                         depth                       = %s
                        #                         episode_num                 = %s
                        #                         time_length                 = %s
                        #                         episode_name                = %s
                        #                         episode_url                 = %s
                        #                         episode_download_url        = %s
                        #                         episode_summary             = %s
                        #                         episode_image_url           = %s
                        #
                        #                      ''' % (
                        # depth, episode_num, time_length, episode_name, episode_url, episode_download_url, episode_summary,
                        # episode_image_url))

                        base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode_num, time_length, episode_name, download_status,
                                                     episode_download_url, episode_url, episode_summary, episode_image_url, sto_path='')
        if playlistId and not variety_years:
            regexs = '<!-- start : juqing title -->(.*?)<!-- end : plot content -->'
            episode_infos = tools.get_info(everyone_html, regexs)
            for episode_info in episode_infos:
                # Episode name
                regex = '<h4><.*?>(.*?)<span></span></a></h4>'
                episode_name = tools.get_info(episode_info, regex)
                episode_name = episode_name and episode_name[0] or ''

                # Summary
                regex = '<p class="intro synopsis text">(.*?)</p>'
                episode_summary = tools.get_info(episode_info, regex)
                episode_summary = episode_summary and episode_summary[0] or ''
                episode_summary = tools.del_html_tag(episode_summary)

                # Image URL
                regex = '<img src="(.*?)" width=".*?" height=".*?"'
                episode_image_url = tools.get_info(episode_info, regex)
                episode_image_url = episode_image_url and episode_image_url[0] or ''
                episode_image_url = "http:" + episode_image_url

                # Episode number (the pattern hard-codes the show title '画心师 第一季')
                regex = '<h4><a href=.*?>画心师 第一季(.*?)<span></span></a></h4>'
                episode_num = tools.get_info(episode_info, regex)
                episode_num = episode_num and episode_num[0] or ''

                # Episode URL
                regex = '<h4><a href="(.*?)" title=".*?" target="_blank">'
                episode_url = tools.get_info(episode_info, regex)
                episode_url = episode_url and episode_url[0] or ''
                episode_url = "http:" + episode_url

                # Download address
                episode_download_url = you_get.get_video_url(episode_url)
                if episode_download_url:
                    episode_download_url = '^_^'.join(episode_download_url)

                download_status = 102

                time_length = ''
                if episode_download_url:
                    log.debug('''
                                           depth                       = %s
                                           episode_num                 = %s
                                           time_length                 = %s
                                           episode_name                = %s
                                           episode_url                 = %s
                                           episode_download_url        = %s
                                           episode_summary             = %s
                                           episode_image_url           = %s

                                        ''' % (
                    depth+1, episode_num, time_length, episode_name, episode_url, episode_download_url, episode_summary,
                    episode_image_url))

                base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                                                     time_length, episode_name, download_status,
                                                     episode_download_url, episode_url, episode_summary,
                                                     episode_image_url, sto_path='')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
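
# Example 6
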
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    weibo_id = url_info['remark']['search_keyword']
    monitor_type = url_info['remark']['monitor_type']

    for i in range(1, 100):
        weibo_content_url = root_url + '&page=%d' % i

        # Proxies
        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        proxies = base_parser.get_proxies()
        headers["User-Agent"] = base_parser.get_user_agent()
        proxies = {}  # proxies disabled: this overrides the value fetched above
        html = tools.get_json_by_requests(weibo_content_url,
                                          headers=headers,
                                          proxies=proxies)

        cards = tools.get_json_value(html, 'cards')
        if len(cards) < 2:
            base_parser.update_url('WWA_weibo_info_urls', root_url,
                                   Constance.DONE)
            return

        tools.delay_time(10)
        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')

            # Proxies
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            proxies = base_parser.get_proxies()
            headers["User-Agent"] = base_parser.get_user_agent()
            proxies = {}  # proxies disabled: this overrides the value fetched above
            origin_html, r = tools.get_html_by_requests(url,
                                                        headers=headers,
                                                        proxies=proxies)
            if not origin_html:
                continue

            release_time = get_release_time(mblog)
            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            sexy_image_url = []

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"',
                                                              '').replace(
                                                                  '\\n', '')
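            # pic_ids carries bare image ids; prefixing the CDN path below
            # yields full-size JPG URLs.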
            if image_url:
                image_url = image_url.split(',')
                for i in range(len(image_url)):
                    image_url[i] = 'http://wx2.sinaimg.cn/large/' + image_url[
                        i] + '.jpg'

                sexy_image_url = image_url
                image_url = ','.join(image_url)
            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')

            # Sensitive events
            sensitive_id = ''
            if monitor_type == 1 or monitor_type == 2:
                sensitive_event_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[1].split(
                        ',') if sensitive_event_info[1] else []
                    keyword2 = sensitive_event_info[2].split(
                        ',') if sensitive_event_info[2] else []
                    keyword3 = sensitive_event_info[3].split(
                        ',') if sensitive_event_info[3] else []

                    if base_parser.is_violate(content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        sensitive_id = _id
                        break

            # Violation events
            violate_id = ''
            if monitor_type == 0 or monitor_type == 2:
                vioation_knowledge_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[1].split(
                        ',') if vioation_knowledge_info[1] else []
                    keyword2 = vioation_knowledge_info[2].split(
                        ',') if vioation_knowledge_info[2] else []
                    keyword3 = vioation_knowledge_info[3].split(
                        ',') if vioation_knowledge_info[3] else []

                    if base_parser.is_violate(content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        violate_id = _id
                        break

            # Download the video
            is_mp4 = tools.is_file(video_url, 'mp4')
            if is_mp4:
                local_video_path = FILE_LOCAL_PATH + 'videos/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                is_download = tools.download_file(video_url, local_video_path)
                video_url = local_video_path if is_download else ''
            else:
                video_url = ''

            log.debug('''
                      original url:       %s
                      weibo id:           %s
                      release time:       %s
                      source:             %s
                      content:            %s
                      image urls:         %s
                      video url:          %s
                      repost count:       %s
                      praise count:       %s
                      violation id:       %s
                      sensitive event id: %s
                      image-check urls:   %s
                     ''' %
                      (url, weibo_id, release_time, come_from, content,
                       image_url, video_url, transpond_count, praise_count,
                       violate_id, sensitive_id, sexy_image_url))

            if content:
                base_parser.add_wwa_weibo_info_info(
                    'WWA_weibo_info_info',
                    SITE_ID,
                    url,
                    weibo_id,
                    release_time,
                    come_from,
                    content,
                    image_url,
                    video_url,
                    transpond_count,
                    praise_count,
                    violate_id,
                    sensitive_id=sensitive_id,
                    sexy_image_url=sexy_image_url)
        tools.delay_time()

    base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
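
# Example 7
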
def parser(url_info):
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    # root_url = 'http://list.youku.com/show/id_ze7cc3b8ed96711e68ce4.html'
    # depth = 0
    # headers = {'Host': 'cmstool.youku.com',
    #            'Referer': 'http://v.youku.com/v_show/id_XMjY2NzY3MTE4NA.html',
    #            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    #            'Cookie': '__ysuid=1491380613750xxD; __yscnt=1; juid=01bg7f56tqm9e; __aryft=1495434329; yseid=1495503610725JmZw8d; yseidcount=11; seid=01bgpfc8rb2vm6; ykss=fe922359521ce2d462cbda53; cna=Y5NrEThaR2MCAdOcjEogCug8; __ayvstp=6; __aysvstp=110; l=AmdnSHROpJU3344cDsaqhZhFd5Ex5jvO; isg=AlZW_barEwKJtiefqvOnVZcapwzSXpoxTdXpV8C_SDnUg_YdKIfqQbwzbaiV; __ayft=1495503611023; __aysid=1495416942598jZ1; __arpvid=1495504158930FOANHy-1495504158944; __arycid=; __ayscnt=1; __arcms=; __aypstp=5; __ayspstp=140; ypvid=1495504161820uZFGHk; ysestep=5; yseidtimeout=1495511361821; ycid=0; ystep=237; referhost=; seidtimeout=1495505961826'}

    if depth == 0:
        html = tools.get_html_by_urllib(root_url)

        header_info = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        program_name = header_info.a['title']

        recent_video_url = header_info.a['href']
        recent_video_url = 'http:'+recent_video_url

        recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)\.h'], fetch_one=True)

        actors = tools.get_tag(html, 'li', {'class': 'p-row'})[2].get_text()
        actors = ''.join(tools.re.compile('主持人:(.+)').findall(actors))  # '主持人' = host

        summary = tools.get_tag(html, 'span', {'class': 'text'}, find_all=False).get_text()
        summary = ''.join(tools.re.compile('简介:(.+)').findall(summary))  # '简介' = synopsis

        image_url = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        image_url = image_url.img['src']

        list_url = 'https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip=&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=' % recent_video_id
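        # ups/get.json returns the album's play metadata (video list, episode
        # total, stream info); ccode and utid look like values replayed from a
        # captured browser session (assumption).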
        list_json = tools.get_json_by_requests(list_url)
        video_list = tools.get_json_value(list_json, 'data.videos.list')
        # print(video_list)

        episode = tools.get_json_value(list_json, 'data.show.episode_total')

        log.debug('''
                      recent_video_url:  %s
                      recent_video_id:   %s
                      episode count:     %s
                      host:              %s
                      cover url:         %s
                      album url:         %s
                      summary:           %s
                      program name:      %s
                      video list:        %s
                      list_url:          %s
                      ''' % (recent_video_url, recent_video_id, episode, actors, image_url, root_url, summary, program_name, video_list, list_url))

        program_id = base_parser.add_program_info('PROGRAM_info', SITE_ID, actors=actors, image_url=image_url, program_url=root_url,
                                     summary=summary, program_name=program_name, episode=episode)

        for vl in video_list:
            vl_id = tools.get_json_value(vl, 'encodevid')
            vl_url = 'http://v.youku.com/v_show/id_%s.html' % vl_id
            base_parser.add_url('PROGRAM_urls', SITE_ID, vl_url, depth=1, remark=program_id)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
    elif depth == 1:
        program_id = remark
        html, res = tools.get_html_by_requests(root_url)
        episode_name = tools.get_tag(html, 'h1', find_all=False)
        episode_name = episode_name.get_text()

        videoId = tools.get_info(html, ['videoId:"(.+?)"'], fetch_one=True)
        play_count, res = tools.get_html_by_requests(
            'http://v.youku.com/action/getVideoPlayInfo?vid=%s&callback=tuijsonp5' % videoId)
        play_count = tools.get_info(play_count, ['"vv":"(.+?)"'], fetch_one=True)
        play_count = play_count and play_count.replace(',', '') or ''

        # info_html, info_res = tools.get_html_by_requests('http://cmstool.youku.com/cms/playlog/get?callback=tuijsonp7', headers)
        # # print(info_html)
        # image_url = tools.get_info(info_html, ['"thumburl":"(.+?)",'], fetch_one=True)
        # image_url = image_url.replace('\\', '')
        # print(image_url)
        # episode_num = tools.get_info(info_html, ['"watchStage":"(.+?)",'], fetch_one=True)
        # episode_num = tools.to_chinese(episode_num)
        # print(episode_num)

        recent_video_id = tools.get_info(root_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(root_url, ['id_(.+?)\.h'], fetch_one=True)
        list_url = 'https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip=&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=' % recent_video_id
        list_info = tools.get_json_by_requests(list_url)
        stream = tools.get_json_value(list_info, "data.stream")
        layer = 0  # assumed stream-quality index; 'layer' is not defined elsewhere in this snippet
        download_url = stream[layer]['m3u8_url']

        time_length = tools.get_json_value(list_info, "data.video.seconds")

        episode_num = tools.get_json_value(list_info, "data.show.stage")

        image_url = tools.get_json_value(list_info, "data.video.logo")

        segs = stream[layer]['segs']
        cdn_url = []
        for video_url in segs:
            cdn_url.append(video_url['cdn_url'])
        # print(cdn_url)

        log.debug('''
                                     program id:        %s
                                     current episode:   %s
                                     episode length:    %s
                                     play count:        %s
                                     program name:      %s
                                     download url:      %s
                                     program link:      %s
                                     image url:         %s
                                     ''' % (
                 program_id, episode_num, time_length, play_count, episode_name, download_url, root_url, image_url))

        base_parser.add_program_episode_info('PROGRAM_EPISODE_info', SITE_ID, program_id=program_id, episode_num=episode_num,
                                            time_length=time_length, episode_name=episode_name, download_url=download_url,
                                            episode_url=root_url, image_url=image_url, play_count=play_count)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
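
# Example 8
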
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html_json = tools.get_json_by_requests(root_url)
    # Re-serialized to a JSON string; tools.get_json_value apparently accepts
    # both parsed objects and JSON strings.
    html_json = tools.dumps_json(html_json)
    json_values = tools.get_json_value(html_json, 'obj.items')
    for json_value in json_values:
        try:
            url = tools.get_json_value(json_value, 'pkgName')
            url = 'http://sj.qq.com/myapp/detail.htm?apkName=' + url

            title = tools.get_json_value(json_value, 'appDetail.appName')

            author = tools.get_json_value(json_value, 'appDetail.authorName')

            icon_url = tools.get_json_value(json_value, 'appDetail.iconUrl')
            icon_url = icon_url.split()  # wrap the single URL in a list for the join below
            image_url = tools.get_json_value(json_value, 'appDetail.images')
            image_url = ','.join(icon_url + image_url)

            update_info = tools.get_json_value(json_value,
                                               'appDetail.newFeature')

            tag = tools.get_json_value(json_value, 'appDetail.versionName')

            summary = tools.get_json_value(json_value, 'appDetail.description')

            app_url = tools.get_json_value(json_value, 'appDetail.apkUrl')

            release_time = tools.get_json_value(json_value,
                                                'appDetail.apkPublishTime')
            release_time = int(release_time)
            release_time = tools.timestamp_to_date(release_time)

            score = tools.get_json_value(json_value, 'appDetail.averageRating')
            score = round(float(score), 1)

            software_size = tools.get_json_value(json_value,
                                                 'appDetail.fileSize')
            software_size = str(round(float(software_size) / 1024 / 1024,
                                      1)) + 'MB'

            download_count = tools.get_json_value(json_value,
                                                  'appDetail.appDownCount')

            platform = 'android'

            language = '中文'  # i.e. Chinese

            log.debug('''
                       title:            %s
                       source url:       %s
                       summary:          %s
                       update info:      %s
                       score:            %.1f
                       author:           %s
                       apk download url: %s
                       image urls:       %s
                       size:             %s
                       version:          %s
                       platform:         %s
                       download count:   %s
                       release time:     %s
                       language:         %s
                       ''' % (title, url, summary, update_info, score, author,
                              app_url, image_url, software_size, tag, platform,
                              download_count, release_time, language))

            base_parser.add_WWA_search_app_info('WWA_search_app_content_info',
                                                site_id,
                                                url,
                                                title=title,
                                                summary=summary,
                                                update_info=update_info,
                                                score=score,
                                                author=author,
                                                app_url=app_url,
                                                image_url=image_url,
                                                software_size=software_size,
                                                tag=tag,
                                                platform=platform,
                                                download_count=download_count,
                                                release_time=release_time,
                                                language=language,
                                                sensitive_id='')

        except Exception as e:
            log.error(e)

    base_parser.update_url('WWA_search_app_urls', root_url, Constance.DONE)

# Example 9

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue: break

        weibo_content_url = root_url + '&page=%d' % i

        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)

        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For hour/minute/second precision one would need to fetch article_url
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"',
                                                              '').replace(
                                                                  '\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for i in range(len(image_url)):
                    image_url[i] = 'http://wx2.sinaimg.cn/large/' + image_url[
                        i] + '.jpg'

                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                original url:   %s
                blogger id:     %s
                article id:     %s
                release time:   %s
                source:         %s
                content:        %s
                image urls:     %s
                video url:      %s
                comment count:  %s
                repost count:   %s
                praise count:   %s
                ''' % (article_url, user_id, article_id, release_time,
                       come_from, content, image_url, video_url,
                       comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id,
                                            head_url,
                                            user_name,
                                            release_time,
                                            None,
                                            content,
                                            image_url,
                                            None,
                                            praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender,
                                            url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0,
                                            source='新浪微博'):  # 'Sina Weibo'

                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)

# Example 10

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    stid = remark['stid']
    title = remark['title']
    room_url = root_url
    image_url = remark['cover']

    infosv2_headers = {
        'Cookie':
        'MMID=8a15a5674fa198503dcac35dff04bee0; __v3_c_review_10052=1; __v3_c_last_10052=1487233333348; __v3_c_visitor=1486539373375345; Hm_lvt_96a25bfd79bc4377847ba1e9d5dfbe8a=1486539374,1487233333; cId=23443128402874; L_V_T=db9ad5b0-800d-4f43-b6fa-33aa5e32afbc; L_V_T.sig=jOJ6vGu87WNyc-iYOuqGG0O75do; s_id=a68f007aa8644cc112f2b026a915e5c4; webmomo.sig=k0F5PIijTCK14gJvogLq-fqt978; web-imi-bew=s%3A434904554.1rDSKSZKt%2B0YDpAAi%2B2B3XBLPWR8s4QItn0tZZlA4aA; web-imi-bew.sig=J4cE69g51WFdUOxLDC--7QO8_mE; io=XTckP8nk8qE9G3U3AW1E; Hm_lvt_c391e69b0f7798b6e990aecbd611a3d4=1487664000,1487664487,1487667595,1487668109; Hm_lpvt_c391e69b0f7798b6e990aecbd611a3d4=1487668223',
        'Host': 'web.immomo.com',
        'Origin': 'https://web.immomo.com',
        'Referer': 'https://web.immomo.com/live/%s' % stid,
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    infosv2_params = {'stid': stid, 'src': 'url'}

    infosv2 = tools.requests.post(
        'https://web.immomo.com/webmomo/api/scene/profile/infosv2',
        data=infosv2_params,
        headers=infosv2_headers)
    infosv2 = infosv2.json()
    infosv2_data = infosv2['data']
    # print(infosv2_data)
    watched_count = infosv2_data['on']
    rid = infosv2_data['rid']
    name = infosv2_data['name']
    token = infosv2_data['token']
    video_path = infosv2_data['url']
    watched_count_url = {
        'url': 'https://web.immomo.com/webmomo/api/scene/profile/userinfo',
        'header': infosv2_headers,
        'data': infosv2_params
    }

    userinfo_params = {
        'dmid': stid,
        'rd': rid,
        'token': token,
        'source': 'profile'
    }

    userinfo = tools.requests.post(
        'https://web.immomo.com/webmomo/api/scene/profile/userinfo',
        data=userinfo_params,
        headers=infosv2_headers)
    userinfo = userinfo.json()
    userinfo_data = userinfo['data']
    sex = userinfo_data['sex']
    sex = 1 if sex == 'F' else 0  # 0 = male, 1 = female
    age = userinfo_data['age']

    fanscount_headers = {
        'Host': 'live-api.immomo.com',
        'Accept': '*/*',
        'X-PTOKEN': '',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'X-LV': '1',
        'Content-Length': '176',
        'User-Agent':
        'MomoChat/7.5.6 ios/664 (iPhone 7 Plus; iOS 10.2.1; zh_CN; iPhone9,2; S1)',
        'Connection': 'keep-alive',
        'X-KV': '88e95f44',
        'Cookie': 'SESSIONID=9C3DF7F1-C39D-06F7-EC1F-9DBE56DDBF15',
    }
    fanscount_date = {
        '_idfa_': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        '_net_': 'wifi',
        '_uid_': 'e3bb287c00673c9a701c60bf79ca24b7',
        'lat': 39.90266719310277,
        'lng': 116.348690083085,
        'remoteid': stid,
        'roomid': rid,
    }

    fans_count_data = tools.requests.post(
        'https://live-api.immomo.com/v3/user/card/lite',
        data=fanscount_date,
        headers=fanscount_headers)
    fans_count_data = fans_count_data.json()
    # print(fans_count_data)
    fans_count = tools.get_json_value(fans_count_data, 'data.fansCount')
    city = tools.get_json_value(fans_count_data, 'data.city')
    log.debug('''
                room title:     %s
                anchor name:    %s
                portrait:       %s
                anchor id:      %s
                city:           %s
                room url:       %s
                stream address: %s
                watched count:  %s
                fans count:     %s
                sex:            %s
                age:            %s
                ''' % (title, name, image_url, stid, city, room_url,
                       video_path, watched_count, fans_count, sex, age))

    base_parser.add_anchor_info('LiveApp_anchor_info',
                                SITE_ID,
                                title=title,
                                name=name,
                                image_url=image_url,
                                room_id=stid,
                                room_url=room_url,
                                video_path=video_path,
                                watched_count=watched_count,
                                fans_count=fans_count,
                                sex=sex,
                                age=age,
                                address=city,
                                watched_count_url=watched_count_url)
    base_parser.update_url('LiveApp_urls', root_url, Constance.DONE)
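
# Example 11
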
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)

    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]

    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # URL
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Content
            content = tools.get_tag(news,
                                    name='p',
                                    attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # View count
            watched_count = ''

            # Source
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Date
            release_time = tools.get_tag(news,
                                         name='div',
                                         attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Detect whether the item has a video, judging by the play-icon markup
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)

        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])

        log.debug(
            '''
            Title:         %s
            Content:       %s
            Source:        %s
            Article url:   %s
            Image url:     %s
            View count:    %s
            Date:          %s
            Has video:     %d
            Keywords:      %s
            Keyword count: %s
                  ''' %
            (title, content, origin, url, image, watched_count, release_time,
             bool(play_icon), contained_key, contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     content,
                                     image_url=image,
                                     release_time=release_time,
                                     origin=origin,
                                     watched_count=watched_count,
                                     search_type=SEARCH_TYPE,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
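
The date handling above assumes tools.timestamp_to_date accepts a Unix timestamp in seconds and returns a formatted string; a minimal stand-in under that assumption:

import time

def timestamp_to_date(timestamp, time_format='%Y-%m-%d %H:%M:%S'):
    # Convert a Unix timestamp (seconds) to a local-time string
    return time.strftime(time_format, time.localtime(int(timestamp)))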
Exemplo n.º 12
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def get_tkey(t):
        def ror(val, key):
            i = 0
            while (i < key):
                val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
                i += 1

            return val

        key = 185025305
        val = ror(t, key % 17)
        val = val ^ key
        return val

    def getdownload(episode_download_url_json):
        # Resolve the real media location from the playJson api: take the
        # first CDN domain, append the 1080p dispatch path and the fixed
        # query string, then follow the returned 'location' field.
        episode_json = tools.get_json_by_requests(episode_download_url_json)
        episode_download_url = tools.get_json_value(episode_json,
                                                    'msgs.playurl.domain')
        episode_download_url = episode_download_url and episode_download_url[0] or ''
        episode_download_url_definition = tools.get_json_value(
            episode_json, 'msgs.playurl.dispatch.1080p')
        episode_download_url_definition = episode_download_url_definition and episode_download_url_definition[0] or ''
        log.debug(episode_download_url_definition)
        episode_download_url = episode_download_url + episode_download_url_definition
        episode_download_url += "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}".format(
            random.random(), '1080p')
        episode_download_url_json = tools.get_json_by_requests(
            episode_download_url)
        episode_download_url = tools.get_json_value(episode_download_url_json,
                                                    'location')
        return episode_download_url

    if depth == 0:
        cs_regex = 'cs(.*?)_'
        o_regex = 'cs.*?_o(.*?)_p'
        cs = tools.get_info(source_url, cs_regex)
        cs_value = cs and cs[0] or ''
        o = tools.get_info(source_url, o_regex)
        o_value = o and o[0] or ''
        #print('1'+o_value+'2','***', cs_value)
        url = 'http://list.le.com/apin/chandata.json?cs=' + cs_value + '&_o=' + o_value + '&_p='
        base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)
    if depth == 1:
        page = '1'
        # TV series
        if 'cs=2' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                #print(source_url)
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/tv/' + program_url + '.html'
                    episode = tools.get_json_value(info, 'nowEpisodes')

                    directors = tools.get_json_value(info, 'directory')
                    #print(type(directors))
                    directors = ','.join(tools.get_json(directors).values())

                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())

                    summary = tools.get_json_value(info, 'description')

                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug(
                        '''
                                    depth                       = %s
                                    program_name                = %s
                                    program_url                 = %s
                                    image_url                   = %s
                                    episode                     = %s
                                    directors                   = %s
                                    actors                      = %s
                                    summary                     = %s
                                    release_time                = %s
                                 ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(\d*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:

                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                        episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time())))

                        episode_json = tools.get_json_by_requests(
                            episode_download_url_json)

                        episode_image_url = tools.get_json_value(
                            episode_json, 'msgs.playurl.pic')

                        episode_name = tools.get_json_value(
                            episode_json, 'msgs.playurl.title')

                        episode_num_regex = "(\d*?)"
                        episode_num = tools.get_info(episode_name,
                                                     episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''

                        episode_download_url = getdownload(
                            episode_download_url_json)

                        time_length = ''

                        episode_summary = ''

                        download_status = ''

                        log.debug(
                            '''
                                                   depth                       = %s
                                                   episode_num                 = %s
                                                   time_length                 = %s
                                                   episode_name                = %s
                                                   episode_url                 = %s
                                                   download_url                = %s
                                                   episode_summary             = %s
                                                   episode_image_url           = %s

                                                ''' %
                            (depth, episode_num, time_length, episode_name,
                             episode_url, episode_download_url,
                             episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id,
                            episode_num, time_length, episode_name,
                            download_status, episode_download_url, episode_url,
                            episode_summary, episode_image_url, '')

                page = str(int(page) + 1)

                if not json_list:
                    return False

        # Sports
        if 'cs=4' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num,
                                                 regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')

                    episode_image_url = tools.get_json_value(
                        info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://sports.le.com/video/' + episode_url_num + '.html'

                    # The vid is the num used in the playJson api
                    episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                        episode_url_num, get_tkey(int(time.time())))

                    episode_download_url = getdownload(
                        episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')

                    summary = ''
                    program_url = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    download_status = 102
                    time_length = ''

                    log.debug(
                        '''
                                depth                       = %s
                                program_name                = %s
                                program_url                 = %s
                                image_url                   = %s
                                episode                     = %s
                                directors                   = %s
                                actors                      = %s
                                summary                     = %s
                                release_time                = %s
                                aid                         = %s
                    ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time,
                        _id=aid)
                    log.debug('''
                               depth                       = %s
                               episode_num                 = %s
                               time_length                 = %s
                               episode_name                = %s
                               episode_url                 = %s
                               download_url                = %s
                               episode_summary             = %s
                               episode_image_url           = %s
                            ''' %
                              (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))
                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id,
                        episode_num, time_length, episode_name,
                        download_status, episode_download_url, episode_url,
                        episode_summary, episode_image_url, '')
                page = str(int(page) + 1)

                if not video_list:
                    return False

        # Variety shows
        if 'cs=11' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num,
                                                 regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')

                    episode_image_url = tools.get_json_value(
                        info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                    # The vid is the num used in the playJson api
                    episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                        episode_url_num, get_tkey(int(time.time())))

                    episode_download_url = getdownload(
                        episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')

                    summary = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug(
                        '''
                                depth                       = %s
                                program_name                = %s
                                program_url                 = %s
                                image_url                   = %s
                                episode                     = %s
                                directors                   = %s
                                actors                      = %s
                                summary                     = %s
                                release_time                = %s
                                aid                         = %s
                    ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time,
                        _id=aid)
                    log.debug('''
                               depth                       = %s
                               episode_num                 = %s
                               time_length                 = %s
                               episode_name                = %s
                               episode_url                 = %s
                               download_url                = %s
                               episode_summary             = %s
                               episode_image_url           = %s
                    ''' % (depth, episode_num, time_length, episode_name,
                           episode_url, episode_download_url, episode_summary,
                           episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id,
                        episode_num, time_length, episode_name,
                        download_status, episode_download_url, episode_url,
                        episode_summary, episode_image_url, '')
                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Music
        if 'cs=9' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '(\d*?):'
                    episode_num = tools.get_info(episode_num,
                                                 regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')

                    episode_image_url = tools.get_json_value(
                        info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                    # The vid is the num used in the playJson api
                    episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                        episode_url_num, get_tkey(int(time.time())))

                    episode_download_url = getdownload(
                        episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')

                    summary = ''
                    actors = tools.get_json_value(info, 'actor').values()
                    actors = ''.join(actors)
                    #print('**********', actors)

                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug(
                        '''
                                depth                       = %s
                                program_name                = %s
                                program_url                 = %s
                                image_url                   = %s
                                episode                     = %s
                                directors                   = %s
                                actors                      = %s
                                summary                     = %s
                                release_time                = %s
                                aid                         = %s
                              ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time,
                        _id=aid)
                    log.debug('''
                               depth                       = %s
                               episode_num                 = %s
                               time_length                 = %s
                               episode_name                = %s
                               episode_url                 = %s
                               download_url                = %s
                               episode_summary             = %s
                               episode_image_url           = %s
                            ''' %
                              (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))
                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id,
                        episode_num, time_length, episode_name,
                        download_status, episode_download_url, episode_url,
                        episode_summary, episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False
    # #     # Article info on the current page
    # #     # Title
    # #
    # #
    # #     everyone_html = tools.get_html_by_requests(program_url)
    # #
    # #     regexs_directors = '<span class="editor" style="color:#333;">(.*?)</span>'
    # #     directors = tools.get_info(everyone_html, regexs_directors)
    # #     directors = directors and directors[0] or ''
    # #
    # #     # Summary
    # #     regexs_summary = '<p class="summaryList_long">(.*?)</p>'
    # #     summary = tools.get_info(everyone_html, regexs_summary)
    # #     summary = summary and summary[0] or ''
    # #
    # #     # Update time
    # #     regexs_release_time = ' <dt>发布时间:</dt>.*?<dd>(.*?)</dd>'
    # #     release_time = tools.get_info(everyone_html, regexs_release_time)
    # #     release_time = release_time and release_time[0] or ''
    # #
    # #     # Download url
    # #     regexs_download_url = 'videoUrl=(.*?)"'
    # #     download_url = tools.get_info(everyone_html, regexs_download_url)
    # #     download_url = download_url and download_url[0] or ''
    # #
    # #     download_status = 102
    # #     time_length = ''
    # #
    # #
    # #     if download_url:
    # #         program_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url, image_url = image_url,
    # #                              episode = episode, directors = directors, actors = '', summary = summary,
    # #                              release_time = release_time)
    # #
    # #         sto_path = '/video/' + program_name + '.mp4'
    # #         is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
    # #         download_status = 101 if is_download else 102
    # #
    # #         base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode, time_length, program_name, download_status,
    # #                          download_url, program_url, summary, image_url, sto_path)
    # #
    # #
    # # # Mark source_url as done
    # # base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)

        # Movies
        if 'cs=1&' in source_url:  # match exactly; plain 'cs=1' also hits 'cs=11'
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                #print(source_url)
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/movie/' + program_url + '.html'
                    episode = ' '

                    directors = tools.get_json_value(info, 'directory')
                    directors = ','.join(tools.get_json(directors).values())

                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())

                    summary = tools.get_json_value(info, 'description')

                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug(
                        '''
                                    depth                       = %s
                                    program_name                = %s
                                    program_url                 = %s
                                    image_url                   = %s
                                    episode                     = %s
                                    directors                   = %s
                                    actors                      = %s
                                    summary                     = %s
                                    release_time                = %s
                                 ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(.*?),'
                    episode_urls = tools.get_info(episode_url, regex)

                    for episode_url_num in episode_urls:

                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                        episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time())))
                        log.debug(episode_download_url_json)
                        episode_json = tools.get_json_by_requests(
                            episode_download_url_json)

                        episode_image_url = tools.get_json_value(
                            episode_json, 'msgs.playurl.pic')

                        episode_name = tools.get_json_value(
                            episode_json, 'msgs.playurl.title')

                        episode_num_regex = "第(.*?)期"
                        episode_num = tools.get_info(episode_name,
                                                     episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''

                        episode_download_url = getdownload(
                            episode_download_url_json)

                        time_length = ''

                        episode_summary = ''

                        download_status = ''

                        log.debug(
                            '''
                                                   depth                       = %s
                                                   episode_num                 = %s
                                                   time_length                 = %s
                                                   episode_name                = %s
                                                   episode_url                 = %s
                                                   download_url                = %s
                                                   episode_summary             = %s
                                                   episode_image_url           = %s

                                                ''' %
                            (depth, episode_num, time_length, episode_name,
                             episode_url, episode_download_url,
                             episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id,
                            episode_num, time_length, episode_name,
                            download_status, episode_download_url, episode_url,
                            episode_summary, episode_image_url, '')

                page = str(int(page) + 1)

                if not json_list:
                    return False
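
get_tkey above lightly obfuscates the current timestamp for the playJson api: a 32-bit rotate-right by key % 17 bits (8 for this key), then an XOR with the key. The same transform as a standalone sketch, convenient for testing outside the parser:

import time

def ror32(val, n):
    # Rotate a 32-bit value right by n bits, one bit at a time
    for _ in range(n):
        val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
    return val

def make_tkey(timestamp, key=185025305):
    return ror32(timestamp, key % 17) ^ key  # key % 17 == 8

print(make_tkey(int(time.time())))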
Exemplo n.º 13
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # Get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # Parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)

    html, request = tools.get_html_by_requests(root_url, headers = headers, proxies = proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    log.debug(root_url)
    log.debug('Fetching article links %s' % check_info)

    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # Official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one = True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one = True)
    account_url = account_url.replace('&amp;',"&")
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # Proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # Using a proxy triggers captchas, so it is disabled for now

    html, request = tools.get_html_by_requests(account_url, headers = headers, proxies = proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    log.debug('''
        Fetching article details %s
        url %s
        request.headers %s
        ''' % (check_info, account_url, request.headers))
    # print(html)

    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one = True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', [])
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title' : title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # Download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;',"&")

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one = True)

        # Download the images inside content, then replace the original urls with local paths
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # Derive the image extension from the wx_fmt query parameter (default jpg)
            if 'wx_fmt=' in image:
                ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                ext_end = image.find('&', ext_start)
                ext = image[ext_start:ext_end if ext_end != -1 else None]
            else:
                ext = 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.' + ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # Sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

        # Violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            Title               %s
            Summary             %s
            Image url           %s
            Article url         %s
            Release time        %s
            Content             %s
            Local image path    %s
            Violation status    %s
            Sensitive event     %s
            Screened image urls %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

        # Articles pushed on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for sub_article in oneday_article_list:  # separate name; avoids shadowing the outer loop variable
            title = tools.get_json_value(sub_article, 'title')
            summary = tools.get_json_value(sub_article, 'digest')
            image_url = tools.get_json_value(sub_article, 'cover')

            sexy_image_url = []

            # Download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(sub_article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;',"&")

            content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one = True)

            # Download the images inside content, then replace the original urls with local paths
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                # Derive the image extension from the wx_fmt query parameter (default jpg)
                if 'wx_fmt=' in image:
                    ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                    ext_end = image.find('&', ext_start)
                    ext = image[ext_start:ext_end if ext_end != -1 else None]
                else:
                    ext = 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.' + ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # Sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

            # Violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
            Title               %s
            Summary             %s
            Image url           %s
            Article url         %s
            Release time        %s
            Content             %s
            Local image path    %s
            Violation status    %s
            Sensitive event     %s
            Screened image urls %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
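
The msgList step above relies on the official-account history page embedding its article list as a JavaScript object literal; a minimal illustration of that extraction, using the standard library in place of tools.get_info and tools.get_json:

import json
import re

html = 'var msgList = {"list": [{"app_msg_ext_info": {"title": "demo"}}]};'
match = re.search(r'var msgList = (.*?});', html)
article_json = json.loads(match.group(1)) if match else {}
for article in article_json.get('list', []):
    print(article['app_msg_ext_info']['title'])  # -> demo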
def parser(url_info):
    url = url_info['url']
    list_datas = tools.get_json_by_requests(url)
    list_datas = list_datas['list']

    for list_data in list_datas:
        title = list_data['title']
        watched_count = list_data['playsCounts']
        image_url = list_data['coverLarge']
        comment_count = list_data['commentsCount']
        charge_type = list_data['priceTypeId']
        is_finished = list_data['isFinished']
        article_type = list_data['tags']
        origin = list_data['provider']
        episodes = list_data['tracks']
        # uid = list_data['uid']
        author = list_data['nickname']
        album_id = list_data['albumId']
        abstract = list_data['intro']
        score = tools.get_json_value(list_data, 'score')
        # id = list_data['id']
        new_url_2 = 'http://mobile.ximalaya.com/mobile/v1/album/rich?albumId=%s' % album_id
        list_datas_2 = tools.get_json_by_requests(new_url_2)
        content = tools.get_json_value(list_datas_2, 'data.album.intro')
        release_time = tools.get_json_value(list_datas_2,
                                            'data.album.createdAt')
        release_time = tools.timestamp_to_date(release_time / 1000)
        update_time = tools.get_json_value(list_datas_2,
                                           'data.album.lastUptrackAt')
        update_time = tools.timestamp_to_date(update_time / 1000)
        subscribe_count = tools.get_json_value(list_datas_2,
                                               'data.album.subscribeCount')

        new_url_3 = 'http://mobile.ximalaya.com/mobile/v1/album/track?albumId=%s&device=android&isAsc=true&pageId=1&' \
                    'pageSize=5000&pre_page=1' % album_id

        list_datas_3 = tools.get_json_by_requests(new_url_3)
        lists = tools.get_json_value(list_datas_3, 'data.list')

        log.debug('''
                    Title:            %s
                    Genre:            %s
                    Episodes:         %s
                    Score:            %s   (free works have no score)
                    Subscribers:      %s
                    Author:           %s
                    Created at:       %s
                    Last updated:     %s
                    Cover image:      %s
                    Play count:       %s
                    Comment count:    %s   (free works have no comments)
                    Charge type:      %s   (0: free, 1: per-episode, 2: whole album)
                    Is finished:      %s   (0, 1: unfinished, 2: finished)
                    Provider:         %s
                    Abstract:         %s
                    Full intro:       %s
                    ''' %
                  (title, article_type, episodes, score, subscribe_count,
                   author, release_time, update_time, image_url, watched_count,
                   comment_count, charge_type, is_finished, origin, abstract,
                   content))
        content_id = base_parser.add_wp_content_info(
            'WP_content_info',
            SITE_ID,
            title=title,
            article_type=article_type,
            episodes=episodes,
            score=score,
            subscribe_count=subscribe_count,
            author=author,
            release_time=release_time,
            update_time=update_time,
            image_url=image_url,
            watched_count=watched_count,
            comment_count=comment_count,
            charge_type=charge_type,
            is_finished=is_finished,
            origin=origin,
            abstract=abstract,
            content=content,
            data_type=DATA_TYPE)

        for track in lists:  # avoids shadowing the built-in list
            title = track['title']
            download_url = track['playPathAacv164']
            watched_count = track['playtimes']
            play_length = track['duration']
            comments_count = track['comments']
            create_time = track['createdAt']
            create_time = tools.timestamp_to_date(create_time / 1000)
            # log.debug('''
            #                    Title:            %s
            #                    Download url:     %s
            #                    Play count:       %s
            #                    Duration:         %s
            #                    Comment count:    %s
            #                    Created at:       %s
            #                    ''' % (
            # title, download_url, watched_count, play_length, comments_count, create_time))
            base_parser.add_wp_content_episode_info(
                'WP_content_episode_info',
                content_id=content_id,
                title=title,
                video_url=download_url,
                watched_count=watched_count,
                play_length=play_length,
                comments_count=comments_count,
                release_time=create_time,
                data_type=DATA_TYPE)

    base_parser.update_url('WP_urls', url, Constance.DONE)
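
tools.get_json_by_requests is used throughout these examples as a fetch-and-decode helper; a minimal stand-in under that assumption (the real helper may also handle retries and encodings):

import requests

def get_json_by_requests(url, headers=None, proxies=None, timeout=30):
    # Fetch a url and decode the body as JSON; return {} on any failure
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        return response.json()
    except Exception:
        return {}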
def juji_parser(url, remark):
    program_id = remark

    html, res = tools.get_html_by_requests(url)

    tvid = tools.get_info(
        html, ['player-tvid="(\d{4,11})"', 'list-tvid="(\d{4,11})"'],
        fetch_one=True)
    pcInfo_url = "http://mixer.video.iqiyi.com/jp/mixin/videos/" + str(tvid)
    # print(pcInfo_url)
    html2, res = tools.get_html_by_requests(pcInfo_url)

    album_id = tools.get_info(html, [
        'player-albumid="(\d{4,11})', 'list-albumid="(\d{4,11})"',
        'albumId: ?(\d{4,11}),', 'param\[\'albumId\'\] ?= ?"(\d{4,11})"'
    ],
                              fetch_one=True)

    episode_name = tools.get_info(html, ['meta.+?"irTitle" content="(.+?)"'],
                                  fetch_one=True)

    image_url = tools.get_info(html,
                               ['<meta property="og:image" content="(.+?)"/>'],
                               fetch_one=True)
    image_url = image_url.replace('.jpg', '_160_90.jpg')

    play_count = tools.get_info(html2, ['"playCount":(.+?),'], fetch_one=True)
    time_length = tools.get_info(html2, ['"duration":\s*(.+?),'],
                                 fetch_one=True)
    episode_num = tools.get_info(html2, ['"order":\s*(.+?),'], fetch_one=True)

    current_time = tools.get_current_timestamp() * 1000
    current_time = str(current_time)

    download_json_url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time
    json_ = tools.get_json_by_requests(download_json_url,
                                       headers=download_header)
    download_url = tools.get_json_value(json_, 'video.mp4_res.1.url')
    download_url, res = tools.get_html_by_requests(download_url)
    download_url = tools.get_info(download_url, ['"l":"(.+?)"'],
                                  fetch_one=True)

    log.debug('''
                                        Program id:       %s
                                        Episode number:   %s
                                        Episode length:   %s
                                        Play count:       %s
                                        Episode name:     %s
                                        Download url:     %s
                                        Episode url:      %s
                                        Image url:        %s
                                        ''' %
              (program_id, episode_num, time_length, play_count, episode_name,
               download_url, url, image_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info',
                                         SITE_ID,
                                         program_id=program_id,
                                         episode_num=episode_num,
                                         time_length=time_length,
                                         episode_name=episode_name,
                                         download_url=download_url,
                                         episode_url=url,
                                         image_url=image_url,
                                         play_count=play_count)

    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)
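
juji_parser above passes tools.get_info a list of candidate patterns and expects the first capture that matches; a minimal equivalent of that fallback behaviour, assuming fetch_one=True returns a single string:

import re

def get_info(text, patterns, fetch_one=False):
    # Try each candidate regex in order; return the first capture found
    if isinstance(patterns, str):
        patterns = [patterns]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1) if fetch_one else list(match.groups())
    return '' if fetch_one else []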
Exemplo n.º 16
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def r1(pattern, text):
        m = re.search(pattern, text)
        if m:
            return m.group(1)

    program_name = '风行星风范'
    actors = '姜武,秦海璐,黄海波,柳岩'
    release_time = '2011年07月23日'
    directors = ''
    program_url = 'http://www.fun.tv/vplay/g-98097/'
    summary = ''
    image_url = 'http://img3.funshion.com/sdw?oid=cc09e4ab792d4008d86efcbbbf4c55dc&w=200&h=280'
    id = '98097'
    json_episode_info = tools.get_json_by_requests(
        'http://pm.funshion.com/v5/media/episode?id=' + id + '&cl=aphone&uc=5')
    episode = len(tools.get_json_value(json_episode_info, 'episodes') or [])  # total episode count

    log.debug('''
                    depth                       = %s
                    program_name                = %s
                    program_url                 = %s
                    episode                     = %s
                    summary                     = %s
                    image_url                   = %s

                 ''' %
              (depth, program_name, program_url, episode, summary, image_url))

    program_id = base_parser.add_program_info('PROGRAM_info',
                                              site_id,
                                              program_name,
                                              program_url,
                                              image_url=image_url,
                                              episode=episode,
                                              directors='',
                                              actors=actors,
                                              summary='',
                                              release_time=release_time)

    if re.match(r'http://www.fun.tv/vplay/.*g-(\w+)', source_url):
        id = r1(r'http://www.fun.tv/vplay/.*g-(\d+)', source_url)
        json_info = tools.get_json_by_requests(
            'http://pm.funshion.com/v5/media/episode?id=' + id +
            '&cl=aphone&uc=5')
        json_episodes_info = tools.get_json_value(json_info, 'episodes')
        for json_episode_info in json_episodes_info:
            vid = tools.get_json_value(json_episode_info, 'id')

            episode_name = tools.get_json_value(json_episode_info, 'name')

            image_episode_info = tools.get_json_value(json_episode_info,
                                                      'still')

            episode_url = tools.get_json_value(json_episode_info, 'num')
            episode_url = 'http://pm.funshion.com/v5/media/share?id=98097&num=' + episode_url

            episode_num = tools.get_json_value(json_episode_info, 'num')

            # Total episode count
            episode = len(json_episodes_info)

            time_length = ''
            episode_summary = ''
            download_status = ''

            download_url = ''

            log.debug(
                '''
                                    depth                       = %s
                                    episode_num                 = %s
                                    time_length                 = %s
                                    episode_name                = %s
                                    episode_url                 = %s
                                    download_url                = %s
                                    episode_summary             = %s
                                    image_episode_info          = %s

                                 ''' %
                (depth, episode_num, time_length, episode_name, episode_url,
                 download_url, episode_summary, image_episode_info))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, image_episode_info, '')

        # Article info on the current page
        # Title

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
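
For reference, the pm.funshion.com episode payload is treated as if it had roughly this shape; the layout is inferred only from the keys the parser reads ('episodes', 'id', 'name', 'still', 'num'), not from an official schema:

episode_payload = {
    'episodes': [
        {'id': '12345', 'name': '第1集', 'still': 'http://img.example/1.jpg', 'num': '1'},
    ],
}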
Exemplo n.º 17
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    program_id = url_info['remark']['program_id']
    program_name = url_info['remark']['program_name']
    chan_name = url_info['remark']['chan_name']

    is_continue = True

    for page in range(1, 2):  # only the first page is fetched
        if not is_continue:
            break

        list_url = root_url + '&page=%d' % page
        html = tools.get_json_by_requests(list_url)

        cards = (html or {}).get('data', {}).get('cards') or []
        card_group = []
        for card in cards:
            card_group = tools.get_json_value(card, 'card_group')
            if card_group:
                break
        if not card_group:
            break

        for info in card_group:
            user_info = tools.get_json_value(info, 'user')
            user_id = tools.get_json_value(user_info, 'id')

            user_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230283%s_-_INFO' % user_id
            user_url_html = tools.get_json_by_requests(user_url)
            user_url_cards = tools.get_json_value(user_url_html, 'data.cards')
            user_url_card_group = tools.get_json_value(user_url_cards[0],
                                                       'card_group')
            area = ''
            for item in user_url_card_group:
                if tools.get_json_value(item, 'item_name') == '所在地':  # '所在地' = "Location"
                    area = tools.get_json_value(item, 'item_content')

            name = tools.get_json_value(user_info, 'screen_name')
            verified_reason = tools.get_json_value(user_info,
                                                   'verified_reason')

            is_verified = 0
            if verified_reason: is_verified = 1

            sex = tools.get_json_value(user_info, 'gender')
            if sex == 'f':
                sex = 1
            elif sex == 'm':
                sex = 0
            else:
                sex = ''

            image_url = tools.get_json_value(user_info, 'profile_image_url')
            url = tools.get_json_value(user_info, 'profile_url')
            summary = tools.get_json_value(user_info, 'description')
            user_url_2 = 'http://m.weibo.cn/api/container/getIndex?containerid=100505%s' % user_id
            user_url_html_2 = tools.get_json_by_requests(user_url_2)
            fans_count = tools.get_json_value(user_url_html_2,
                                              'userInfo.followers_count')
            follow_count = tools.get_json_value(user_url_html_2,
                                                'userInfo.follow_count')

            log.debug(
                '''
                program id:      %s
                user id:         %s
                weibo nickname:  %s
                weibo url:       %s
                avatar url:      %s
                verification:    %s
                is verified:     %s
                location:        %s
                gender:          %s
                bio:             %s
                followers:       %s
                following:       %s
                ''' %
                (program_id, user_id, name, url, image_url, verified_reason,
                 is_verified, area, sex, summary, fans_count, follow_count))

            if program_name in name or program_name in verified_reason:  # found the target user; no need to continue
                self_base_parser.add_weibo_user(program_id, user_id, name, url,
                                                image_url, verified_reason,
                                                is_verified, area, sex,
                                                summary, fans_count,
                                                follow_count)

                # Update the "has official blog" flag
                sql = 'update TAB_MMS_PROGRAM set official_blog = 1 where program_id = %d' % program_id
                db.update(sql)

                is_continue = False
                break

    if is_continue:
        # Update the "has official blog" flag
        sql = 'update TAB_MMS_PROGRAM set official_blog = 0 where program_id = %d' % program_id
        db.update(sql)

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
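
The two m.weibo.cn lookups above follow a containerid convention: 230283<uid>_-_INFO returns the profile-detail cards (location and the like), while 100505<uid> returns the userInfo block with follower counts. A standalone sketch of the second lookup, assuming the requests library and the response shape relied on above:

import requests

def fetch_weibo_counts(user_id):
    # Sketch only: the endpoint and response shape are taken from the parser
    # above, not from any documented weibo API.
    url = ('http://m.weibo.cn/api/container/getIndex?containerid=100505%s'
           % user_id)
    info = requests.get(url, timeout=10).json().get('userInfo', {})
    return info.get('followers_count'), info.get('follow_count')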
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    column_id = remark

    while True:
        try:
            json = tools.get_json_by_requests(root_url,
                                              headers=HEADERS,
                                              data=data,
                                              params=PARAMS)
            newslist = tools.get_json_value(json, 'newslist')
            if not newslist:
                break

            data['cachedCount'] += len(newslist)
            data['page'] += 1
            for news in newslist:
                # print(tools.dumps_json(news))
                title = tools.get_json_value(news, 'title')
                release_time = tools.get_json_value(news, 'time')
                abstract = tools.get_json_value(news, 'abstract')
                original_url = tools.get_json_value(news, 'url')
                thumbnails = tools.get_json_value(news, 'thumbnails_qqnews')
                img_url = thumbnails[0] if thumbnails else ''
                video_frame_url = tools.get_json_value(
                    news, 'video_channel.video.playurl')
                # Fetch the article content
                html = tools.get_html_by_urllib(original_url)
                content = tools.get_tag(html,
                                        name='div',
                                        attrs={'class': "main"},
                                        find_all=False)
                content = tools.del_html_tag(str(content))

                # Resolve the real video URL
                video_url = ''
                if video_frame_url:
                    video_vid = tools.get_info(html,
                                               r'vid\s*=\s*"\s*([^"]+)"',
                                               fetch_one=True)
                    video_url = ''.join(qq.qq_download_by_vid(video_vid))

                # Check for rule violations
                # Sensitive events
                sensitive_id = ''
                sensitive_event_infos = oracledb.find(
                    'select * from tab_mvms_sensitive_event')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[3].split(
                        ' ') if sensitive_event_info[3] else []
                    keyword2 = sensitive_event_info[4].split(
                        ' ') if sensitive_event_info[4] else []
                    keyword3 = sensitive_event_info[5].split(
                        ' ') if sensitive_event_info[5] else []

                    if base_parser.is_violate(title + content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        sensitive_id = _id

                # Violation events
                violate_id = ''
                violation_knowledge_infos = oracledb.find(
                    'select * from tab_mvms_violation_knowledge')
                for violation_knowledge_info in violation_knowledge_infos:
                    _id = violation_knowledge_info[0]
                    keyword1 = violation_knowledge_info[2].split(
                        ' ') if violation_knowledge_info[2] else []
                    keyword2 = violation_knowledge_info[3].split(
                        ' ') if violation_knowledge_info[3] else []
                    keyword3 = violation_knowledge_info[4].split(
                        ' ') if violation_knowledge_info[4] else []

                    if base_parser.is_violate(title + content,
                                              key1=keyword1,
                                              key2=keyword2,
                                              key3=keyword3):
                        violate_id = _id

                log.debug(
                    '''
                title:          %s
                abstract :      %s
                img_url :       %s
                original_url:   %s
                release_time :  %s
                video_url:      %s
                content :       %s
                column_id:      %d
                sensitive_id:   %s
                violate_id:     %s

                ''' %
                    (title, abstract, img_url, original_url, release_time,
                     video_url, content, column_id, sensitive_id, violate_id))

                # Download assets
                base_path = FILE_LOCAL_PATH
                is_download = 0

                # Download the image
                img_name = ''
                if img_url:
                    img_name = 'images/' + tools.get_current_date(
                        date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
                    is_download = tools.download_file(img_url, base_path,
                                                      img_name)
                    if not is_download:
                        img_name = ''

                # Download the video
                video_name = ''
                if video_url:
                    video_name = 'videos/' + tools.get_current_date(
                        date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                            date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                    is_download = tools.download_file(video_url, base_path,
                                                      video_name)
                    if not is_download:
                        video_name = ''

                if original_url:
                    base_parser.add_va_app_content_info(
                        'VAApp_content_info', SITE_ID, title, abstract,
                        img_url, img_name, original_url, release_time,
                        video_url, video_name, content, column_id, is_download,
                        sensitive_id, violate_id, STORAGE_ID)

        except Exception as e:
            log.debug(e)
            break  # bail out instead of retrying the same failure forever

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
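
base_parser.is_violate is not shown in this snippet. Reading the call sites above, a plausible semantics is that the text is flagged when every non-empty keyword tier matches at least once; the sketch below implements that reading, which is an assumption rather than the confirmed behaviour:

def is_violate_sketch(text, key1=(), key2=(), key3=()):
    # Assumed semantics: each non-empty tier must contribute at least one
    # hit, and at least one tier must be non-empty for a positive result.
    for tier in (key1, key2, key3):
        if tier and not any(kw in text for kw in tier if kw):
            return False
    return bool(key1 or key2 or key3)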
Example No. 19
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='GBK')
    episode_list_regex = 'var url = "(.*?)"'
    episode_list_json = tools.get_info(html, episode_list_regex)
    episode_list_json = episode_list_json and episode_list_json[0] or ''

    episode_list_json_url = episode_list_json + '&cb=jsonp' + str(
        int(time.time()))
    episode_list_json_url = episode_list_json_url.replace("\\", "")
    #print(episode_list_json_url)
    # base_parser.add_url('PROGRAM_urls', site_id, url, depth+1)

    # Get the program type

    # Title
    regexs_program_name = '<meta name="keywords" content="(.*?)" />'
    program_name = tools.get_info(html, regexs_program_name)
    program_name = program_name and program_name[0] or ''

    program_url = source_url

    episode_list_json_html, r = tools.get_html_by_requests(
        episode_list_json_url)

    regexs = r'jsonp\d*?\((.*)\)'
    episode_list_json = tools.get_info(episode_list_json_html, regexs)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json = tools.dumps_json(episode_list_json)

    episode_list_json_value_list = tools.get_json_value(
        episode_list_json, 'data.list')

    episode = len(episode_list_json_value_list)

    summary = ''

    log.debug('''
                    depth                       = %s
                    program_name                = %s
                    program_url                 = %s
                    episode                     = %s
                    summary                     = %s

                 ''' % (depth, program_name, program_url, episode, summary))

    program_id = base_parser.add_program_info('PROGRAM_info',
                                              site_id,
                                              program_name,
                                              program_url,
                                              image_url='',
                                              episode=episode,
                                              directors='',
                                              actors='',
                                              summary=summary,
                                              release_time='')

    for episode_info in episode_list_json_value_list:
        episode_name = tools.get_json_value(episode_info, 'title')

        episode_image_url = tools.get_json_value(episode_info, 'picurl')

        episode_url = tools.get_json_value(episode_info, 'podurl')

        episode_summary = tools.get_json_value(episode_info, 'desc')

        episode_num = tools.get_json_value(episode_info, 'title')

        episode_num_regex = r'第(\d*?)期'  # e.g. "第3期" = "episode 3"
        episode_num = tools.get_info(episode_num, episode_num_regex)
        episode_num = episode_num and episode_num[0] or ''
        if episode_num:
            episode_num = '第' + episode_num + '期'

        download_url_json_str = tools.get_json_value(episode_info, 'vid')

        download_url_json_url = 'http://v.ku6.com/fetchVideo4Player/' + download_url_json_str + '.html'
        download_url_json = tools.get_json_by_requests(download_url_json_url)
        download_url = tools.get_json_value(download_url_json, 'data.f')

        download_status = 102
        time_length = ''

        if download_url:
            #     sto_path = '/video/' + program_name + '.mp4'
            #     is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
            #     download_status = 101 if is_download else 102
            log.debug('''
                                depth                       = %s
                                episode_num                 = %s
                                time_length                 = %s
                                episode_name                = %s
                                episode_url                 = %s
                                download_url                = %s
                                episode_summary             = %s
                                episode_image_url           = %s

                             ''' % (depth + 1, episode_num, time_length,
                                    episode_name, episode_url, download_url,
                                    episode_summary, episode_image_url))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, episode_image_url, '')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
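
The ku6 episode list arrives wrapped in a jsonp callback; the regex above strips the jsonp<digits>(...) shell before the payload can be treated as JSON. A standalone sketch of that unwrap step using the standard json module (the real code routes the string through tools, whose parsing behaviour is not shown here):

import json
import re

def unwrap_jsonp(body):
    # Strip a jsonp1489...(...) wrapper and parse the JSON payload inside.
    match = re.search(r'jsonp\d*?\((.*)\)', body, re.S)
    return json.loads(match.group(1)) if match else None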
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    monitor_type = url_info['remark']

    for page in range(2, 100):
        list_url = root_url + '&page=%d' % page
        html = tools.get_json_by_requests(list_url)

        cards = tools.get_json_value(html, 'cards')
        card_group = []
        for card in cards:
            card_group = tools.get_json_value(card, 'card_group')
            if card_group:
                break
        if not card_group:
            break

        for info in card_group:
            user_info = tools.get_json_value(info, 'user')
            _id = tools.get_json_value(user_info, 'id')

            user_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230283%s_-_INFO' % _id
            user_url_html = tools.get_json_by_requests(user_url)
            user_url_cards = tools.get_json_value(user_url_html, 'cards')
            user_url_card_group = tools.get_json_value(user_url_cards[0],
                                                       'card_group')
            area = ''
            for item in user_url_card_group:
                if tools.get_json_value(item, 'item_name') == '所在地':  # '所在地' = "Location"
                    area = tools.get_json_value(item, 'item_content')

            name = tools.get_json_value(user_info, 'screen_name')
            is_verified_reason = 101
            verified_reason = tools.get_json_value(user_info,
                                                   'verified_reason')
            if verified_reason:
                is_verified_reason = 102
            sex = tools.get_json_value(user_info, 'gender')
            if sex == 'f':
                sex = 1
            elif sex == 'm':
                sex = 0
            else:
                sex = ''
            image_url = tools.get_json_value(user_info, 'profile_image_url')
            url = tools.get_json_value(user_info, 'profile_url')
            summary = tools.get_json_value(user_info, 'description')
            user_url_2 = 'http://m.weibo.cn/api/container/getIndex?containerid=100505%s' % _id
            user_url_html_2 = tools.get_json_by_requests(user_url_2)
            fans_count = tools.get_json_value(user_url_html_2,
                                              'userInfo.followers_count')
            follow_count = tools.get_json_value(user_url_html_2,
                                                'userInfo.follow_count')

            log.debug('''
                         user id:         %s
                         weibo nickname:  %s
                         weibo url:       %s
                         avatar url:      %s
                         verification:    %s
                         is verified:     %s
                         location:        %s
                         gender:          %s
                         bio:             %s
                         followers:       %s
                         following:       %s
                         monitor status:  %s
                        ''' % (_id, name, url, image_url, verified_reason,
                               is_verified_reason, area, sex, summary,
                               fans_count, follow_count, monitor_type))
            base_parser.add_wwa_weibo_user_info('WWA_weibo_user_info', SITE_ID,
                                                _id, name, url, image_url,
                                                verified_reason,
                                                is_verified_reason, area, sex,
                                                summary, fans_count,
                                                follow_count, monitor_type)
        tools.delay_time()
    base_parser.update_url('WWA_weibo_user_urls', root_url, Constance.DONE)
    tools.delay_time()


# parser({'url': 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D%E9%87%8D%E5%BA%86%E7%94%B5%E8%A7%86%E5%8F%B0'})
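
For reference, a fuller version of the commented-out call above: the parser also reads '_id' and 'remark' from url_info, so a test invocation needs those keys as well (the values below are placeholders):

# parser({
#     '_id': '000000000000000000000000',  # placeholder; the parser stringifies it
#     'url': 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D%E9%87%8D%E5%BA%86%E7%94%B5%E8%A7%86%E5%8F%B0',
#     'remark': 101,  # monitor_type placeholder
# })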