示例#1
0
 def __init__(self, platform='haokan'):
     """Set up the crawler: build the shared video-data template and the
     baijiahao helper crawler used for releaser-level lookups."""
     self.platform = platform
     # Start from the standard field set and tag it with this platform.
     self.video_data_template = Std_fields_video().video_data
     self.video_data_template['platform'] = platform
     # Counter of consecutive failed fetches; reset elsewhere on success.
     self.count_false = 0
     # Drop standard fields this crawler never fills in.
     for key in ('channel', 'describe', 'isOriginal', 'repost_count'):
         self.video_data_template.pop(key)
     self.baijiahao = Crawler_baijiahao()
示例#2
0
 def __init__(self, timeout=None, platform='new_tudou'):
     """Initialise the new_tudou crawler.

     Args:
         timeout: request timeout in seconds; defaults to 10 when None.
         platform: platform tag written into every video record.
     """
     # "is None" rather than "== None" (PEP 8 identity comparison).
     self.timeout = 10 if timeout is None else timeout
     self.platform = platform
     self.TotalVideo_num = None
     self.midstepurl = None
     std_fields = Std_fields_video()
     self.video_data = std_fields.video_data
     self.video_data['platform'] = self.platform
     # Standard fields this crawler never populates.
     unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
     for key in unused_key_list:
         self.video_data.pop(key)
     # Category feed endpoints (secCateId selects the channel).
     self.list_page_url_lst = [
         "http://www.tudou.com/api/getfeeds?secCateId=10016&utdid=T8v9EQPOimUCAXL%2FAz0YrDOB&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=10195&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=622736331&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=622769673&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=10116&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=622621940&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=10198&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=622336449&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24",
         "http://www.tudou.com/api/getfeeds?secCateId=10051&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24"
     ]
示例#3
0
 def __init__(self, timeout=None, platform='kwai'):
     """Initialise the kwai crawler: timeout, the standard video-data
     fields, browser-like headers for the landing page, and the login
     helper object."""
     self.timeout = 10 if timeout is None else timeout
     self.platform = platform
     self.TotalVideo_num = None
     self.midstepurl = None
     std_fields = Std_fields_video()
     self.video_data = std_fields.video_data
     self.video_data['platform'] = self.platform
     # Standard fields this crawler never populates.
     for unused_key in ('channel', 'describe', 'repost_count', 'isOriginal'):
         self.video_data.pop(unused_key)
     # Headers used when requesting the first (HTML) page.
     self.first_page_headers = {
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
         "Accept-Encoding": "gzip, deflate, br",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "max-age=0",
         "Connection": "keep-alive",
         "Host": "live.kuaishou.com",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
     }
     self.loginObj = Login()
     self.get_cookies_and_front = self.loginObj.get_cookies_and_front
示例#4
0
    def __init__(self, timeout=None, platform='bilibili'):
        """Initialise the bilibili crawler.

        Args:
            timeout: request timeout in seconds; defaults to 10 when None.
            platform: platform tag written into every video record.
        """
        # "is None" rather than "== None" (PEP 8 identity comparison).
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = [
            'repost_count',
            'isOriginal',
        ]
        for popk in pop_key_Lst:
            self.video_data.pop(popk)

        # Category name -> rid (list id); nested dicts hold sub-categories.
        self.lst_name_rid_dict = {
            '国产动画': '153',
            '搞笑': '138',
            '影视杂谈': '182',
            '纪录片': {
                '人文历史': '37',
                '科学探索自然': '178',
                '军事': '179',
                '社会美食旅行': '180'
            },
            '游戏': {
                '单机游戏': '17'
            }
        }

        # NOTE(review): hard-coded session Cookie below is likely stale;
        # consider loading credentials from configuration instead.
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Cookie':
            'fts=1502427559; buvid3=E8FDA203-70E1-48A6-BE29-E2B833F92DB314456infoc; biliMzIsnew=1; biliMzTs=null; sid=534m3oqx; CNZZDATA2724999=cnzz_eid%3D1621908673-1502776053-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1521001760; pgv_pvi=2734144512; rpdid=olilkosokkdoswsqmikqw; LIVE_BUVID=c552bf4415d1fba581d231647ba7b1bf; LIVE_BUVID__ckMd5=d8118a88b8f0fa8b; UM_distinctid=161e545d99a9-07c0b73fa83f93-17357940-1fa400-161e545d99b224; DedeUserID=114627314; DedeUserID__ckMd5=073268b15392f951; SESSDATA=4b30a63b%2C1524982595%2Cd78acc24; bili_jct=06e47d618fff20d978b968f15b3271c5; finger=c650951b; BANGUMI_SS_24014_REC=202051; _dfcaptcha=ca1c709bb04bda0240e4771eb8d90871',
            'Host':
            'www.bilibili.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
        }
示例#5
0
 def __init__(self, timeout=None, platform='抖音'):
     """Initialise the Douyin (抖音) crawler.

     Args:
         timeout: request timeout in seconds; defaults to 10 when None.
         platform: platform tag written into every video record.
     """
     # "is None" rather than "== None" (PEP 8 identity comparison).
     self.timeout = 10 if timeout is None else timeout
     self.platform = platform
     self.TotalVideo_num = None
     self.midstepurl = None
     std_fields = Std_fields_video()
     self.video_data = std_fields.video_data
     self.video_data['platform'] = self.platform
     # Standard fields this crawler never populates.
     unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
     for key in unused_key_list:
         self.video_data.pop(key)
     # Douyin Lite (抖音极速版) API headers.  The ticket/timestamp headers
     # are computed once from the clock at instantiation time.
     self.headers = {
         "Connection": "keep-alive",
         "User-Agent":
         "Linux; U; Android 8.1.0; zh-CN; EML-AL00 Build/HUAWEIEML-AL00",
         "Accept-Encoding": "gzip",
         "X-SS-REQ-TICKET":
         str(int(datetime.datetime.now().timestamp() * 1e3)),
         "X-Khronos": str(int(datetime.datetime.now().timestamp())),
         "sdk-version": "1",
     }
     # Candidate API hosts to rotate through.
     # NOTE(review): "api3-normal-c-lf" appears twice; the parallel Douyin
     # config elsewhere in this file uses "-lq" for the third entry —
     # possible typo, confirm before changing.
     self.api_list = [
         "api3-normal-c-hl.amemv.com",
         "api3-normal-c-lf.amemv.com",
         "api3-normal-c-lf.amemv.com",
         "aweme.snssdk.com",
     ]
示例#6
0
 def __init__(self, timeout=None, platform='喜马拉雅'):
     """Initialise the Ximalaya (喜马拉雅) crawler.

     Args:
         timeout: request timeout in seconds; defaults to 10 when None.
         platform: platform tag written into every video record.
     """
     # "is None" rather than "== None" (PEP 8 identity comparison).
     self.timeout = 10 if timeout is None else timeout
     self.platform = platform
     self.TotalVideo_num = None
     self.midstepurl = None
     std_fields = Std_fields_video()
     self.video_data = std_fields.video_data
     self.video_data['platform'] = self.platform
     # Standard fields this crawler never populates.
     unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
     for key in unused_key_list:
         self.video_data.pop(key)
示例#7
0
 def __init__(self, timeout=None, platform='抖音'):
     """Initialise the Douyin (抖音) crawler.

     Args:
         timeout: request timeout in seconds; defaults to 10 when None.
         platform: platform tag written into every video record.
     """
     # "is None" rather than "== None" (PEP 8 identity comparison).
     self.timeout = 10 if timeout is None else timeout
     self.platform = platform
     self.TotalVideo_num = None
     self.midstepurl = None
     std_fields = Std_fields_video()
     self.video_data = std_fields.video_data
     self.video_data['platform'] = self.platform
     # Standard fields this crawler never populates.
     unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
     for key in unused_key_list:
         self.video_data.pop(key)
     # Bug fix: self.headers was indented inside the loop above, so it was
     # rebuilt on every iteration.  Built once here; final value unchanged.
     # Ticket/timestamp headers come from the clock at instantiation time.
     self.headers = {
         "Accept-Encoding": "gzip",
         "X-SS-REQ-TICKET": str(int(datetime.datetime.now().timestamp() * 1e3)),
         "sdk-version": "1",
         "User-Agent": "ttnet okhttp/3.10.0.2",
         "X-Gorgon": "0401e0ce4001b09c16b91c4741bd4eb2ca69dfd4d031374a8e72",
         "X-Khronos": str(int(datetime.datetime.now().timestamp())),
         "Host": "aweme.snssdk.com",
         "Connection": "Keep-Alive",
     }
     # Candidate API hosts to rotate through.
     self.api_list = [
         "api3-normal-c-hl.amemv.com",
         "api3-normal-c-lf.amemv.com",
         "api3-normal-c-lq.amemv.com",
         "aweme.snssdk.com",
     ]
示例#8
0
 def __init__(self, timeout=None, platform='网易新闻'):
     """Initialise the NetEase News (网易新闻) crawler.

     Args:
         timeout: request timeout in seconds; defaults to 10 when None.
         platform: platform tag written into every video record.
     """
     # "is None" rather than "== None" (PEP 8 identity comparison).
     self.timeout = 10 if timeout is None else timeout
     self.platform = platform
     self.TotalVideo_num = None
     self.midstepurl = None
     std_fields = Std_fields_video()
     self.video_data = std_fields.video_data
     self.video_data['platform'] = self.platform
     # Standard fields this crawler never populates.
     unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
     for key in unused_key_list:
         self.video_data.pop(key)
     # App-API headers for c.m.163.com.
     self.headers = {
         "Accept-Encoding": "gzip",
         "Connection": "keep-alive",
         "Host": "c.m.163.com",
         "User-Agent": "NewsApp/34.1.1 Android/6.0.1 (HUAWEI/BLA-AL00)"
     }
示例#9
0
class Crawler_haokan():
    def __init__(self, platform='haokan'):
        """Set up the haokan crawler: shared video-data template plus the
        baijiahao helper used for releaser-level lookups."""
        self.platform = platform
        # Standard field set, tagged with this platform.
        self.video_data_template = Std_fields_video().video_data
        self.video_data_template['platform'] = platform
        # Counter of consecutive failed fetches; reset elsewhere on success.
        self.count_false = 0
        # Drop standard fields this crawler never fills in.
        for key in ('channel', 'describe', 'isOriginal', 'repost_count'):
            self.video_data_template.pop(key)
        self.baijiahao = Crawler_baijiahao()

    def releaser_page_web(self,
                          releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=30,
                          output_to_es_raw=False,
                          output_to_es_register=False,
                          push_to_redis=False,
                          es_index=None,
                          doc_type=None,
                          fetchFavoriteCommnt=True):
        """Crawl a releaser's video list through the haokan web API.

        Generator: yields one populated video_data dict per video, paging
        through the releaser's list (cursor ``ctime`` returned by the API)
        until ``releaser_page_num_max`` pages are fetched, the API reports
        no more data, or five consecutive pages come back empty.

        NOTE(review): the output_to_file / filepath / output_to_es_* /
        push_to_redis / es_index / doc_type / fetchFavoriteCommnt keyword
        arguments are accepted for interface parity with the sibling
        releaser_page_* methods but are not used in this body.
        """
        pid = os.getpid()
        releaser_id = self.get_releaser_id(releaserUrl)
        print('releaser_id is %s' % releaser_id)
        result_lst = []  # NOTE(review): never appended to — dead variable
        # video_info = self.video_data
        page_num = 0
        has_more = True
        ctime = ""  # pagination cursor; filled from the API after page 0
        count_false = 0  # consecutive empty-result pages; bail out at 5
        # proxies = None
        proxies = get_proxy_dic()
        while page_num <= releaser_page_num_max and has_more:

            post_url = 'https://haokan.baidu.com/haokan/wiseauthor?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10&video_type=media&sort_type=sort_by_time'.format(
                releaser_id, page_num, ctime)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                "referer":
                "https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh,zh-CN;q=0.9",
                "content-type": "application/x-www-form-urlencoded"
            }
            try:
                # Page 0 is the HTML page itself (state embedded in a script
                # tag, parsed by web_first_pag, retried up to 5 times); later
                # pages hit the JSON API directly.
                if page_num == 0:
                    for loop in range(5):
                        get_page = requests.get(releaserUrl,
                                                headers=headers,
                                                timeout=3,
                                                proxies=proxies)
                        # print(get_page.text)
                        page_dic, fans_num = self.web_first_pag(get_page.text)
                        if page_dic['apiData']['video']['results']:
                            page_num += 1
                            break
                else:
                    get_page = requests.get(post_url,
                                            headers=headers,
                                            timeout=3)
                    page_dic = get_page.json()
                    page_num += 1
                    # print(page_dic)
            except:
                # NOTE(review): bare except retries the same page forever on
                # persistent network/parse errors — consider a retry cap.
                continue
            try:
                info_lst = page_dic['apiData']['video']['results']
            except:
                info_lst = []
            try:
                ctime = page_dic['apiData']['video']['ctime']
                has_more = page_dic['apiData']['video']['has_more']
                if not has_more:
                    # NOTE(review): no-op — has_more is already falsy here.
                    has_more = False
            except:
                has_more = False
            if info_lst != []:
                count_false = 0
                print("Process %s is processing %s at page %s" %
                      (pid, releaser_id, page_num))
                time.sleep(int(random.uniform(1, 2)))
                for line in info_lst:
                    # Fresh copy of the field template for every video.
                    video_data = copy.deepcopy(self.video_data_template)
                    video_data['title'] = line['content']['title']
                    video_id = line['content']['vid']
                    video_data['video_id'] = video_id
                    # partial_url = '{"nid":"sv_%s"}' % video_id
                    # partial_url_encode = urllib.parse.quote_plus(partial_url)
                    video_data['url'] = line['content']["video_short_url"]
                    video_data['play_count'] = line['content']['playcnt']
                    video_data['favorite_count'] = int(
                        line['content']['praiseNum'])
                    try:
                        video_data['comment_count'] = int(
                            line['content']['commentNum'])
                    except:
                        video_data['comment_count'] = 0
                    # NOTE(review): fans_num is only bound while fetching
                    # page 0; if that fetch never succeeds this raises
                    # NameError — confirm intended.
                    video_data['releaser_followers_count'] = int(fans_num)
                    # print('like num is %s' % video_data['favorite_count'])
                    try:
                        video_data['duration'] = trans_duration(
                            line['content']['duration'])
                    except:
                        video_data['duration'] = 0
                    video_data['releaser'] = line['content']['author']
                    video_data['releaser_id_str'] = "haokan_%s" % (
                        line['content']['authorid'])
                    video_data[
                        'releaserUrl'] = 'https://haokan.baidu.com/haokan/wiseauthor?app_id=' + line[
                            'content']['authorid']
                    fetch_time = int(time.time() * 1e3)
                    video_data['fetch_time'] = fetch_time
                    releaser_time_str = line['content']['publish_time']
                    video_data['release_time'] = trans_strtime_to_timestamp(
                        input_time=releaser_time_str)
                    print(
                        video_id, releaser_time_str,
                        datetime.datetime.fromtimestamp(
                            video_data['release_time'] / 1000), page_num)
                    yield video_data
            else:
                count_false += 1
                if count_false < 5:
                    continue
                else:
                    break

    def video_page(self, url, vid=None, proxies=None):
        """Fetch detail and comment counts for one video via the haokan app API.

        For Haokan App, video_page method ONLY accept pass in vid, rather than
        video url.  The ``url`` parameter is accepted for interface parity
        with other crawlers and is not used here.

        Returns a populated video_data dict, or None when ``vid`` is missing
        or when the request / JSON parsing / field extraction fails.
        """
        if vid is None:
            return None

        # post_url = ('https://sv.baidu.com/haokan/api?'
        #             'cmd=comment/getreply&log=vhk&tn=1001128v&ctn=1001128v'
        #             '&imei=279014587228348'
        #             '&cuid=C577C0F8F6AA9FFE3E41CB0B3E507A14|843822785410972'
        #             '&os=android&osbranch=a0&ua=810_1440_270&ut=ALP-AL00_6.0.1_23_HUAWEI'
        #             '&apiv=4.6.0.0&appv=409011&version=4.9.1.10'
        #             '&life=1546591253&clife=1546591253&hid=B3697DD2F02F9A031714A93CCDF0A4C7'
        #             '&imsi=0&network=1'
        #             '&sids=1373_1-1436_4-1629_2-1647_1-1646_2-1708_1-1715_2'
        #             '-1736_1-1738_3-1739_1-1748_3-1754_2-1757_1-1767_1'
        #             '-1772_2-1776_1-1778_1-1780_3-1782_1-1786_2-1803_1'
        #             '-1805_2-1806_3-1814_2 HTTP/1.1')
        # The cuid fragment is randomised per call, presumably to avoid
        # per-device rate limiting — TODO confirm.
        post_url = 'https://sv.baidu.com/haokan/api?tn=1008350o&ctn=1008350o&os=ios&cuid=E{0}FD33EC4EBA7B853AF10A50A02D705F02DECEFMBGNNIETE&osbranch=i0&ua=640_1136_326&ut=iPhone5%2C4_10.3.3&net_type=-1&apiv=5.1.0.10&appv=1&version=5.1.0.10&life=1563498146&clife=1563498146&sids=&idfa=E3FC9054-384B-485F-9B4C-936F33D7D090&hid=9F5E84EAEEE51F4C190ACE7AABEB915F&young_mode=0&log=vhk&location=&cmd=video/detail'.format(
            random.randint(1000, 9999))
        # raw header str:
        # header_str = ('Charset: UTF-8'
        #             'User-Agent: Mozilla/5.0 (Linux; Android 6.0.1; ALP-AL00 Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36 haokan/4.9.1.10 (Baidu; P1 6.0.1)/IEWAUH_32_1.0.6_00LA-PLA/1001128v/C577C0F8F6AA9FFE3E41CB0B3E507A14%7C843822785410972/1/4.9.1.10/409011/1'
        #             'XRAY-TRACEID: 9624a81f-15e0-486e-b79f-d97b30c5b7d0'
        #             'XRAY-REQ-FUNC-ST-DNS: okHttp;1546598993248;0'
        #             'Content-Type: application/x-www-form-urlencoded'
        #             'Content-Length: 267'
        #             'Host: sv.baidu.com'
        #             'Connection: Keep-Alive'
        #             'Accept-Encoding: gzip'
        #             'Cookie: BAIDUID=1EA157CF3563181B98E5ABC1DED982D6:FG=1; BAIDUZID=805xaZQOUQRP3LqnkFs1bl2Bv-TD-CMHnotPgI4vkWabaQgbAx_tx4yMxTHzMBqpC0hwc6ZRa4xUFEkFwB3jxCO_Lg8d5s9gk9OSOeIowQ2k; BAIDUCUID=luvyi0aLHf0RuSajY8S2ug8fvi0u82uugi2IigiS2i80Pv8hYavG8jafv8gqO28EA'
        #             )
        headers = {
            "Accept":
            "*/*",
            "Accept-Encoding":
            "gzip, deflate",
            'Charset':
            'UTF-8',
            "Accept-Language":
            "zh-Hans-CN;q=1",
            "User-Agent":
            "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 haokan/5.1.0.10 (Baidu; P2 10.3.3)/3.3.01_4,5enohP/381d/E7919FD33EC4EBA7B853AF10A50A02D705F02DECEFMBGNNIETE/1 HTTP/1.1",
            # "XRAY-REQ-FUNC-ST-DNS": "okHttp;1562813246444;0",
            # "XRAY-TRACEID": "5bd68916-4696-4bb3-b3a3-57a0c6a15949",
            'Content-Type':
            'application/x-www-form-urlencoded',
            # 'Content-Length': '267',
            'Host':
            'sv.baidu.com',
            'Connection':
            'Keep-Alive',
            "X-Bfe-Quic":
            "enable=1",
            "Cookie":
            "BAIDUCUID=luBHiY8JSig3iHiZ0iSLi0O_v80Gi2iqlav6u_aCS8g1aH8H_iS9a0ivWu0dtQODzbXmA; BAIDUID=F2385E8E821854CA8BE4E30920EED52F:FG=1"
        }

        # raw post data is like:
        # 'comment%2Fgetreply=method%3Dget%26url_key%3D13089959609189000356%26pn%3D1%26rn%3D10%26child_rn%3D2%26need_ainfo%3D0%26type%3D0%26vid%3D13089959609189000356&video%2Fdetail=method%3Dget%26url_key%3D13089959609189000356%26log_param_source%3D%26vid%3D13089959609189000356'
        # which can be decoded as urlencode rule as
        # post_str_decoded = ('comment/getreply=method=get'
        #                     '&url_key=13089959609189000356&pn=1'
        #                     '&rn=10'
        #                     '&child_rn=2'
        #                     '&need_ainfo=0'
        #                     '&type=0'
        #                     '&vid=13089959609189000356'
        #                     '&video/detail=method=get'
        #                     '&url_key=13089959609189000356'
        #                     '&log_param_source='
        #                     '&vid=13089959609189000356')
        # We cannot directly nest dict within dict for post data, or
        # the '{' and '}' will be treated as straight character rather than
        # dictionary boundary, which will lead to un-expected results.
        # The correct way to do this is two-set urlencode.
        comment_getreplyDict = {
            'method': 'get',
            # 'url_key': '13089959609189000356&pn=1',
            'url_key': '%s&pn=1' % vid,
            'rn': '10',
            'child_rn': '2',
            'need_ainfo': '0',
            'type': '0',
            # 'vid': '13089959609189000356',
            'vid': vid,
        }
        comment_getreplyEncodedStr = urlencode(comment_getreplyDict)
        video_detailDict = {
            'method': 'get',
            # 'url_key': '13089959609189000356',
            'url_key': vid,
            'log_param_source': '',
            # 'vid': '13089959609189000356'
            'vid': vid,
        }
        video_detailEncodedStr = urlencode(video_detailDict)
        post_data = {
            'comment/getreply': comment_getreplyEncodedStr,
            'video/detail': video_detailEncodedStr
        }
        # NOTE(review): very aggressive timeouts (0.5s direct / 1s via
        # proxy) — any slow response is silently treated as failure.
        try:
            if not proxies:
                get_page = requests.post(post_url,
                                         data=post_data,
                                         headers=headers,
                                         timeout=0.5)
                # print(get_page.text)
                page_dict = get_page.json()
            else:
                get_page = requests.post(post_url,
                                         data=post_data,
                                         headers=headers,
                                         timeout=1,
                                         proxies=proxies)
                # print(get_page.text)
                page_dict = get_page.json()
        except:
            return None
        self.count_false = 0
        video_dict = copy.deepcopy(self.video_data_template)

        try:
            videoD = page_dict['video/detail']['data']
            commntD = page_dict['comment/getreply']['data']
        except:
            return None
        try:
            video_dict['comment_count'] = int(commntD['comment_count'])
            video_dict['favorite_count'] = videoD['like_num']
        except Exception:
            return None
        else:
            video_dict['duration'] = videoD['duration']
            fetch_time = int(time.time() * 1e3)
            video_dict['fetch_time'] = fetch_time
            video_dict['play_count'] = videoD['playcnt']
            video_dict['release_time'] = videoD['publishTime'] * 1e3
            video_dict['releaser'] = videoD['author']
            video_dict['title'] = videoD['title']
            video_dict['video_id'] = vid
            partial_url = '{"nid":"sv_%s"}' % vid
            partial_url_encode = urllib.parse.quote_plus(partial_url)
            video_dict['url'] = (
                'https://sv.baidu.com/videoui/page/videoland?context=%s' %
                partial_url_encode)
            releaser_id = videoD['appid']
            video_dict[
                'releaserUrl'] = 'https://haokan.baidu.com/haokan/wiseauthor?app_id=' + releaser_id
        return video_dict

    def get_releaser_id(self, releaserUrl):
        """Resolve the releaser id embedded in *releaserUrl* for this
        crawler's platform (delegates to the module-level helper)."""
        current_platform = self.platform
        return get_releaser_id(platform=current_platform,
                               releaserUrl=releaserUrl)

    def get_releaser_follower_num(self, releaserUrl):
        """Fetch the releaser's follower count via the baijiahao crawler."""
        delegate = self.baijiahao
        return delegate.get_releaser_follower_num(releaserUrl)

    def get_releaser_follower_num_web(self, releaserUrl):
        """Fetch a releaser's follower count via the haokan app API.

        Returns the follower count (fansCnt) on success, or None when the
        response lacks the expected structure.
        """
        releaser_id = self.get_releaser_id(releaserUrl)
        url = "https://sv.baidu.com/haokan/api?cmd=baijia/authorInfo&log=vhk&tn=1008621v&ctn=1008621v&bdboxcuid=&os=android&osbranch=a0&ua=810_1440_270&ut=MI%20NOTE%203_6.0.1_23_Xiaomi&apiv=4.6.0.0&appv=414011&version=4.14.1.10&life=1555296294&clife=1558350548&hid=02112F128209DD6BAF39CA37DE9C05E6&imsi=0&network=1&location={%22prov%22:%22%22,%22city%22:%22%22,%22county%22:%22%22,%22street%22:%22%22,%22latitude%22:39.911017,%22longitude%22:116.413562}&sids=1957_2-2193_3-2230_4-2320_1-2326_2-2353_1-2359_3-2376_1-2391_1-2433_4-2436_5-2438_1-2442_1-2443_2-2452_1-2457_2-2470_1-2480_2-2511_1-2525_4-2529_1-2537_1-2538_1-2540_1-2555_2-2563_1-2565_2-2568_1-2574_1-2575_1-2577_1-2582_1"
        headers = {
            "Host": "sv.baidu.com",
            "Connection": "keep-alive",
            "Content-Length": "60",
            "Charset": "UTF-8",
            "User-Agent":
            'Mozilla/5.0 (Linux; Android 6.0.1; MI NOTE 3 Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36 haokan/4.14.1.10 (Baidu; P1 6.0.1)/imoaiX_32_1.0.6_3+ETON+IM/1008621v/51BF00514520A03B32E6CA9D7443D8F8%7C504550857697800/1/4.14.1.10/414011/1',
            "X-Bfe-Quic": "enable=1",
            "XRAY-REQ-FUNC-ST-DNS": "okHttp;1558350575755;0",
            "XRAY-TRACEID": "be54291d-c13a-4a88-8337-9e70ad75d7d8",
            "Cookie":
            "BAIDUID=A6DC59055E4FC518778A19436C23B49A:FG=1; BDUSS=ERoRGxXUGc4em1id21XSlM0TXQ0Q3hXMkEwYUVqamRLV05kLVBLNTZqNk55RUpkRUFBQUFBJCQAAAAAAAAAAAEAAACRoQmabGVtbzAwMDAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAI07G12NOxtdV; BAIDUZID=0dekJxLpZKeY1N0xwEj1dNj2RgYQ8Xy88CJFeivgViMYGUyBFD6dbcwsi4KXbfeoBkvmSUHWhe4-j42mUUFJXf5OQX9FG8tN1pm2M3RMArNE; BAIDUCUID=gaHRu_u_v8gga2830u2uu_uCHilEi-uk_av9i0PDHtifa28fga26fgayvf_NP2ijA",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept-Encoding": "gzip, deflate"
        }
        post_dic = {"baijia/authorInfo": "method=get&app_id=%s" % releaser_id}
        get_page = requests.post(url, data=post_dic, headers=headers)
        res = get_page.json()
        try:
            follower_num = res.get("baijia/authorInfo").get("data").get(
                "fansCnt")
            print('%s follower number is %s' % (releaserUrl, follower_num))
            return follower_num
        except Exception:
            # Narrowed from a bare except; also fixed the log-message typo
            # ("can't can followers") and made the failure return explicit.
            print("can't get followers")
            return None

    def web_first_pag(self, page_text):
        """Extract the embedded PRELOADED_STATE JSON from a releaser web page.

        Returns an (apiData, fansCnt) pair where apiData wraps the "video"
        section of the page state, or (None, None) when the page cannot be
        parsed.
        """
        matches = re.findall("window.__PRELOADED_STATE__ = {(.*)};",
                             page_text,
                             flags=re.DOTALL)
        try:
            state = json.loads("{%s}" % matches[0])
            follower_count = state["author"]["fansCnt"]
            wrapped = {"apiData": {"video": state["video"]}}
        except:
            return None, None
        return wrapped, follower_count

    def releaser_page(self,
                      releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=10000,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      es_index=None,
                      doc_type=None,
                      fetchFavoriteCommnt=True,
                      proxies_num=None):
        """Crawl a releaser's page by delegating to the baijiahao
        time-ordered web crawler and re-yielding its results.

        fetchFavoriteCommnt is accepted for interface parity and is not
        forwarded to the delegate.
        """
        yield from self.baijiahao.releaser_page_web_by_time(
            releaserUrl,
            output_to_file=output_to_file,
            filepath=filepath,
            releaser_page_num_max=releaser_page_num_max,
            output_to_es_raw=output_to_es_raw,
            output_to_es_register=output_to_es_register,
            push_to_redis=push_to_redis,
            es_index=es_index,
            doc_type=doc_type,
            proxies_num=proxies_num)

    # @logged
    def releaser_page_app(self,
                          releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=10000,
                          output_to_es_raw=False,
                          output_to_es_register=False,
                          push_to_redis=False,
                          es_index=None,
                          doc_type=None,
                          fetchFavoriteCommnt=True,
                          proxies_num=None):
        """Crawl one releaser's video list via haokan's mobile-app API.

        Pages through the ``baijia/listall`` endpoint for the releaser id
        extracted from *releaserUrl* and yields one video_data dict per
        video.  post_url never changes; what matters is the post_dic
        (``_skip`` carries the page number, 20 items per page, newest
        first).

        NOTE(review): the output_* / es_* / fetchFavoriteCommnt keyword
        arguments are accepted but never used in this method --
        presumably kept for signature parity with the sibling
        releaser_page methods; confirm before removing.
        """
        pid = os.getpid()
        releaser_id = self.get_releaser_id(releaserUrl)
        print('releaser_id is %s' % releaser_id)
        # Accumulates yielded items but is never flushed anywhere in this
        # method (results reach the caller only through `yield`).
        result_lst = []
        # video_info = self.video_data
        page_num = 0
        has_more = True
        # NOTE(review): proxies_num == 0 is falsy and also takes the
        # default get_proxy_dic() branch -- confirm that is intended.
        if proxies_num:
            proxies = get_proxy_dic(max_proxies=proxies_num)
        else:
            proxies = get_proxy_dic()
        #proxies = {'http': 'http://*****:*****@58.252.195.58:19223/', 'https': 'http://*****:*****@58.252.195.58:19223/'}
        # The cuid field embeds a fresh 4-digit random number so each run
        # presents a slightly different device id to the endpoint.
        post_url = 'https://sv.baidu.com/haokan/api?tn=1008350o&ctn=1008350o&imei=&cuid=E7142FD33EC4EBA7B853AF10A50A02D{0}02DECEFMBGNNICXT&os=ios&osbranch=i0&ua=640_1136_326&ut=iPhone5%2C4_10.3.3&net_type=-1&apiv=5.1.0.10&appv=1&version=5.1.0.10&life=1563337077&clife=1563337077&sids=&idfa=E3FC9054-384B-485F-9B4C-936F33D7D099&hid=9F5E84EAEEE51F4C190ACE7AABEB915F&young_mode=0&log=vhk&location=&cmd=baijia/listall'.format(
            random.randint(1000, 9999))
        count_false = 0  # consecutive pages that came back empty
        while page_num <= releaser_page_num_max and has_more:
            page_num += 1
            post_str = ('method=get&app_id=' + releaser_id + '&_skip=' +
                        str(page_num) + '&_limit=20&_timg_cover=100,150,1000'
                        '&video_type=media&sort_type=sort_by_time')
            post_dic = {'baijia/listall': post_str}

            # Headers mimic the haokan Android app; XRAY-* values are
            # replayed verbatim from a captured session.
            headers = {
                'Charset': 'UTF-8',
                'User-Agent':
                'Mozilla/5.0 (Linux; Android 6.0.1; ALP-AL00 Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36 haokan/4.9.1.10 (Baidu; P1 6.0.1)/IEWAUH_32_1.0.6_00LA-PLA/1001128v/C577C0F8F6AA9FFE3E41CB0B3E507A14%7C843822785410972/1/4.9.1.10/409011/1',
                "XRAY-REQ-FUNC-ST-DNS": "okHttp;1562565506087;0",
                "XRAY-TRACEID": "bbb62604-87cb-4796-a14b-fece64f239af",
                'Content-Type': 'application/x-www-form-urlencoded',
                # 'Content-Length': '267',
                'Host': 'sv.baidu.com',
                'Connection': 'Keep-Alive',
                "Accept-Encoding": "gzip, deflate",
                "X-Bfe-Quic": "enable=1"
                # 'Cookie': 'BAIDUID=1EA157CF3563181B98E5ABC1DED982D6:FG=1; BAIDUZID=805xaZQOUQRP3LqnkFs1bl2Bv-TD-CMHnotPgI4vkWabaQgbAx_tx4yMxTHzMBqpC0hwc6ZRa4xUFEkFwB3jxCO_Lg8d5s9gk9OSOeIowQ2k; BAIDUCUID=luvyi0aLHf0RuSajY8S2ug8fvi0u82uugi2IigiS2i80Pv8hYavG8jafv8gqO28EA'
            }

            try:
                if not proxies:
                    get_page = requests.post(post_url,
                                             data=post_dic,
                                             headers=headers,
                                             timeout=3)
                    page_dic = get_page.json()
                    print(page_dic)
                else:
                    get_page = requests.post(post_url,
                                             data=post_dic,
                                             headers=headers,
                                             timeout=3,
                                             proxies=proxies)
                    page_dic = get_page.json()
                    print(page_dic)
            except:
                # Request or JSON failure: rotate to a fresh proxy and
                # loop again.  page_num was already advanced, so this page
                # is effectively skipped -- TODO confirm that is intended.
                proxies = get_proxy_dic()
                continue
            try:
                info_lst = page_dic['baijia/listall']['data']['results']
            except:
                info_lst = []
            if info_lst != []:
                count_false = 0
                print("Process %s is processing %s at page %s" %
                      (pid, releaser_id, page_num))
                # int(uniform(1, 2)) is almost always 1 -- a ~1s pause
                # between pages.
                time.sleep(int(random.uniform(1, 2)))
                for line in info_lst:
                    video_data = copy.deepcopy(self.video_data_template)
                    video_data['title'] = line['content']['title']
                    video_id = line['content']['vid']
                    video_data['video_id'] = video_id
                    # partial_url = '{"nid":"sv_%s"}' % video_id
                    # partial_url_encode = urllib.parse.quote_plus(partial_url)
                    video_data['url'] = line['content']["video_short_url"]
                    video_data['play_count'] = line['content']['playcnt']
                    video_data['favorite_count'] = line['content']['like_num']
                    # print('like num is %s' % video_data['favorite_count'])
                    try:
                        video_data['duration'] = line['content']['duration']
                    except:
                        video_data['duration'] = 0
                    video_data['releaser'] = line['content']['author']
                    video_data['releaser_id_str'] = "haokan_%s" % (releaser_id)
                    video_data[
                        'releaserUrl'] = 'https://haokan.baidu.com/haokan/wiseauthor?app_id=' + releaser_id
                    fetch_time = int(time.time() * 1e3)
                    video_data['fetch_time'] = fetch_time
                    releaser_time_str = line['content']['publish_time']
                    # video_data['release_time'] = trans_strtime_to_timestamp(input_time=releaser_time_str)
                    # video_data['release_time'] = line['content']['dtime']
                    # Fetch per-video counts from the video page, retrying
                    # up to 3 times.  If all retries fail the video is
                    # silently dropped (not yielded).
                    newVideoDict = None
                    for retry in range(3):
                        newVideoDict = self.video_page('',
                                                       vid=video_id,
                                                       proxies=proxies)
                        if newVideoDict:
                            break
                    if newVideoDict:
                        video_data['favorite_count'] = newVideoDict[
                            'favorite_count']
                        video_data['comment_count'] = newVideoDict[
                            'comment_count']
                        video_data['release_time'] = int(
                            newVideoDict['release_time'])
                        # print('like num after video_page fetching is %s' % video_data['favorite_count'])
                        print(
                            video_id, releaser_time_str,
                            datetime.datetime.fromtimestamp(
                                video_data['release_time'] / 1000), page_num)
                        result_lst.append(video_data)
                        yield video_data
            else:
                # Empty page: try a fresh proxy up to 5 consecutive times
                # before concluding there is no more data.
                count_false += 1
                if count_false <= 5:
                    proxies = get_proxy_dic(max_proxies=1)
                    continue
                else:
                    has_more = False

    def releaser_id_to_uk(self, releaser_id):
        """Resolve a baijiahao app_id (*releaser_id*) to the author's uk token.

        Fetches the author profile page (a JSONP payload), scans its
        embedded scripts for the first ``"uk":"..."`` occurrence and
        returns that value.

        Raises:
            IndexError: if no uk token is present in the profile scripts.
            json.JSONDecodeError: if the JSONP envelope is malformed.
        """
        headers = {
            'Charset':
            'UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
            'Content-Type':
            'application/x-javascript; charset=utf-8',
            'Host':
            'author.baidu.com',
            'Connection':
            'Keep-Alive',
            'Accept-Encoding':
            'gzip',
            'Cookie':
            'BAIDUID=5B4BD931D455EA625D8B5E20BD348270:FG=1; BIDUPSID=5B4BD931D455EA625D8B5E20BD348270; PSTM=1540776027; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; H_PS_PSSID=1423_27211_21123_28131_27750_28139_20718; BDSFRCVID=Y2PsJeCCxG37oNO9K0MmeTd-epk7qPMdDVTa3J; H_BDCLCKID_SF=tR333R7oKRu_HRjYbb__-P4DHUjHfRO2X5REVMTHBPOkeqOJ2Mt5jP4NXNriJnOCfgjtXxcc5q_MoCDzbpnp05tpeGLsaPoy2K6XsJoq2RbhKROvhjntK6uQ-nnjhjnWLbneaJ5n0-nnhI3vXxPByTODyfQwXpoO0KcG_UFhHR3rsftRy6CaePk_hURK2D6aKC5bL6rJabCQe4_ZK-brKbTM0tvrbMT-027OKK85ahrcbqkxXtvI5lRBKtOh3j3zt4jMMh5xthF0hDvd-tnO-t6H-xQ0KnLXKKOLVMI-LPOkeqOJ2Mt5jP4NXNriJUrL5GnbsR5M2K3aVh6gQhjx-jtpexbH55utfnID3J; delPer=0; PSINO=2'
        }
        # Build the `context` query parameter as real JSON.  The previous
        # str(dict).replace("'", '"') trick produced broken JSON whenever
        # releaser_id contained a quote character.  ensure_ascii=False
        # keeps non-ASCII ids verbatim, matching the old str() behaviour.
        p = {
            'context': json.dumps({
                "from": 0,
                "app_id": releaser_id
            }, ensure_ascii=False)
        }
        rq_get = requests.get('https://author.baidu.com/profile?pagelets=root',
                              headers=headers,
                              params=p)
        # print(rq_get.url)
        # print(rq_get.text[24:-2])
        # Response is JSONP; strip the callback wrapper before parsing.
        info = json.loads(rq_get.text[24:-2])
        spts = ' '.join(info['scripts'])
        uk_list = re.findall(r"\"uk\":\"(.+?)\"", spts)
        return uk_list[0]

    def releaser_page_via_m(self,
                            releaserUrl,
                            output_to_file=False,
                            filepath=None,
                            releaser_page_num_max=30,
                            output_to_es_raw=False,
                            output_to_es_register=False,
                            push_to_redis=False,
                            es_index=None,
                            doc_type=None):
        """Crawl a releaser's videos via baidu's author.baidu.com web API.

        Resolves the releaser's ``uk`` token, then pages through
        ``https://author.baidu.com/list`` using the ``ctime`` cursor.
        Results are flushed through output_result in batches of 100 plus
        a final flush; nothing is returned or yielded -- output happens
        only via output_result's side effects.
        """
        releaser_id = self.get_releaser_id(releaserUrl)
        uk = self.releaser_id_to_uk(releaser_id)
        print("platform: %s releaser_id: %s uk: %s" %
              (self.platform, releaser_id, uk))
        result_lst = []
        # Shared template; handle_one_video deep-copies it, so the
        # template itself is never mutated.
        video_info = self.video_data_template
        page_count = 1
        headers = {
            'Charset':
            'UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
            'Content-Type':
            'application/x-javascript; charset=utf-8',
            'Host':
            'author.baidu.com',
            'Connection':
            'Keep-Alive',
            'Accept-Encoding':
            'gzip',
            'Cookie':
            'BAIDUID=5B4BD931D455EA625D8B5E20BD348270:FG=1; BIDUPSID=5B4BD931D455EA625D8B5E20BD348270; PSTM=1540776027; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; H_PS_PSSID=1423_27211_21123_28131_27750_28139_20718; BDSFRCVID=Y2PsJeCCxG37oNO9K0MmeTd-epk7qPMdDVTa3J; H_BDCLCKID_SF=tR333R7oKRu_HRjYbb__-P4DHUjHfRO2X5REVMTHBPOkeqOJ2Mt5jP4NXNriJnOCfgjtXxcc5q_MoCDzbpnp05tpeGLsaPoy2K6XsJoq2RbhKROvhjntK6uQ-nnjhjnWLbneaJ5n0-nnhI3vXxPByTODyfQwXpoO0KcG_UFhHR3rsftRy6CaePk_hURK2D6aKC5bL6rJabCQe4_ZK-brKbTM0tvrbMT-027OKK85ahrcbqkxXtvI5lRBKtOh3j3zt4jMMh5xthF0hDvd-tnO-t6H-xQ0KnLXKKOLVMI-LPOkeqOJ2Mt5jP4NXNriJUrL5GnbsR5M2K3aVh6gQhjx-jtpexbH55utfnID3J; delPer=0; PSINO=2'
        }

        params1 = {
            'type': 'video',
            'tab': '9',
            'uk': uk,
            # 'ctime': '15448673604154',
            # '_': '1545633915094',
            'callback': 'jsonp5'
        }
        rq_get1 = requests.get('https://author.baidu.com/list',
                               params=params1,
                               headers=headers)
        # Response is JSONP ("jsonp5(...)"); strip the wrapper before
        # parsing.
        page_info1 = json.loads(rq_get1.text[7:-2])
        releaser = page_info1['user']['display_name']

        def handle_one_video(one, video_info, releaser, releaserUrl, platform):
            # Build one standard video_data dict from a list entry; the
            # interaction counts come from a second request to
            # mbd.baidu.com and default to 0 when missing.
            video_data = copy.deepcopy(video_info)

            video_itemid = one['attr']['itemId']
            find_asyncData = one['asyncData']

            video_data['platform'] = platform
            video_data['releaser'] = releaser
            video_data['releaserUrl'] = releaserUrl
            video_data['title'] = one['title']
            # one['id'] presumably looks like "sv_<nid>"; the first three
            # characters are dropped before re-embedding -- TODO confirm.
            video_data['url'] = r'https://sv.baidu.com/videoui/page/videoland?context=' + parse.quote(
                    '{"nid":"sv_%s"}' % \
                    one['id'][3:])
            video_data['duration'] = trans_duration(one['timeLong'])
            video_data['video_id'] = one['article_id']
            # publish_at is epoch seconds; convert to milliseconds.
            video_data['release_time'] = int(one['publish_at']) * 1000
            fetch_time = int(time.time() * 1e3)
            video_data['fetch_time'] = fetch_time

            params2 = {
                'params': json.dumps([find_asyncData]),
                'uk': uk,
                '_': str(int(time.time()) * 1000)
            }
            rq_get2 = requests.get(
                'https://mbd.baidu.com/webpage?type=homepage&action=interact&format=jsonp&callback=jsonp2',
                params=params2)
            page_info2 = json.loads(rq_get2.text[7:-1])
            try:
                video_data['play_count'] = int(
                    page_info2['data']['user_list'][video_itemid]['read_num'])
            except:
                video_data['play_count'] = 0
            try:
                video_data['favorite_count'] = int(
                    page_info2['data']['user_list'][video_itemid]
                    ['praise_num'])
            except:
                video_data['favorite_count'] = 0
            try:
                video_data['comment_count'] = int(
                    page_info2['data']['user_list'][video_itemid]
                    ['comment_num'])
            except:
                video_data['comment_count'] = 0
            return video_data

        # Cursor-based pagination: each response carries a ctime value
        # that is fed back as the next request's cursor.
        while page_info1['data'][
                'has_more'] == 1 and page_count < releaser_page_num_max:
            time.sleep(random.randint(4, 6))
            print("get data at page: %s" % str(page_count))
            ctime = page_info1['data']['ctime']
            for one in page_info1['data']['list']:
                one_result = handle_one_video(one, video_info, releaser,
                                              releaserUrl, self.platform)
                result_lst.append(one_result)
                if len(result_lst) >= 100:
                    output_result(result_Lst=result_lst,
                                  platform=self.platform,
                                  output_to_file=output_to_file,
                                  filepath=filepath,
                                  push_to_redis=push_to_redis,
                                  output_to_es_register=output_to_es_register,
                                  output_to_es_raw=output_to_es_raw,
                                  es_index=es_index,
                                  doc_type=doc_type)
                    result_lst.clear()
            params1['ctime'] = ctime
            rq_next_page = requests.get('https://author.baidu.com/list',
                                        params=params1,
                                        headers=headers)
            page_info1 = json.loads(rq_next_page.text[7:-2])
            page_count += 1
        # Process the final page (the one that ended the loop) and flush
        # whatever is left in the batch.
        for one in page_info1['data']['list']:
            one_result = handle_one_video(one, video_info, releaser,
                                          releaserUrl, self.platform)
            result_lst.append(one_result)
        output_result(result_Lst=result_lst,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_register=output_to_es_register,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type)

    def releaser_page_by_time(self, start_time, end_time, url, allow,
                              **kwargs):
        """Yield videos whose release_time falls inside (start_time, end_time).

        Walks the releaser's normal web feed and then its dynamic feed,
        applying the same time filter to each.  Per feed, once more than
        *allow* items older than start_time have been seen, that feed is
        abandoned; the first *allow* too-old items are still yielded
        (original behaviour, preserved for compatibility).  Items with a
        falsy release_time are skipped.

        Timestamps are compared as plain numbers; units must match what
        the baijiahao crawler emits (epoch milliseconds, presumably --
        confirm against releaser_page_web_by_time).
        """
        def _within_window(stream):
            # Shared filter for both feeds.  Each feed gets its own
            # too-old counter, exactly as the previously duplicated
            # loops did.
            count_false = 0
            for res in stream:
                video_time = res["release_time"]
                if not video_time:
                    continue
                if start_time < video_time:
                    if video_time < end_time:
                        yield res
                else:
                    count_false += 1
                    if count_false > allow:
                        break
                    yield res

        yield from _within_window(
            self.baijiahao.releaser_page_web_by_time(url))
        yield from _within_window(
            self.baijiahao.releaser_dynamic_page_web_by_time(url))
# ----- Example #10 (scrape-artifact sample separator; vote count 0) -----
    def __init__(self, platform='iqiyi'):
        """Set up the iqiyi crawler: standard video fields plus the
        catalogue of channel list pages and the lookup structures for
        the ordinary ones."""
        self.platform = platform
        self.video_data = Std_fields_video().video_data
        self.video_data['platform'] = self.platform
        # Drop standard fields the crawled data does not have.
        for unused_field in ('describe', 'isOriginal', 'repost_count',
                             'video_id', 'channel', 'play_count'):
            self.video_data.pop(unused_field)

        # (channel, list-page URL, page type) for every section we know.
        catalogue = [
            ('公益', 'http://gongyi.iqiyi.com/', 'gongyi'),
            ('电影', 'http://list.iqiyi.com/www/1/----------------iqiyi--.html', 'ordinary_list_page'),
            ('片花', 'http://list.iqiyi.com/www/10/1007----------------iqiyi--.html', 'ordinary_list_page'),
            ('教育', 'http://list.iqiyi.com/www/12/----------------iqiyi--.html', 'ordinary_list_page'),
            ('时尚', 'http://list.iqiyi.com/www/13/----------------iqiyi--.html', 'ordinary_list_page'),
            ('儿童', 'http://list.iqiyi.com/www/15/----------------iqiyi--.html', 'ordinary_list_page'),
            ('网络电影', 'http://list.iqiyi.com/www/16/----------------iqiyi--.html', 'ordinary_list_page'),
            ('体育', 'http://list.iqiyi.com/www/17/----------------iqiyi--.html', 'ordinary_list_page'),
            ('电视剧', 'http://list.iqiyi.com/www/2/----------------iqiyi--.html', 'ordinary_list_page'),
            ('生活', 'http://list.iqiyi.com/www/21/----------------iqiyi--.html', 'ordinary_list_page'),
            ('搞笑', 'http://list.iqiyi.com/www/22/----------------iqiyi--.html', 'ordinary_list_page'),
            ('财经', 'http://list.iqiyi.com/www/24/----------------iqiyi--.html', 'ordinary_list_page'),
            ('资讯', 'http://list.iqiyi.com/www/25/----------------iqiyi--.html', 'ordinary_list_page'),
            ('汽车', 'http://list.iqiyi.com/www/26/----------------iqiyi--.html', 'ordinary_list_page'),
            ('原创', 'http://list.iqiyi.com/www/27/----------------iqiyi--.html', 'ordinary_list_page'),
            ('军事', 'http://list.iqiyi.com/www/28/----------------iqiyi--.html', 'ordinary_list_page'),
            ('母婴', 'http://list.iqiyi.com/www/29/----------------iqiyi--.html', 'ordinary_list_page'),
            ('纪录片', 'http://list.iqiyi.com/www/3/----------------iqiyi--.html', 'ordinary_list_page'),
            ('科技', 'http://list.iqiyi.com/www/30/----------------iqiyi--.html', 'ordinary_list_page'),
            ('脱口秀', 'http://list.iqiyi.com/www/31/----------------iqiyi--.html', 'ordinary_list_page'),
            ('健康', 'http://list.iqiyi.com/www/32/----------------iqiyi--.html', 'ordinary_list_page'),
            ('动漫', 'http://list.iqiyi.com/www/4/----------------iqiyi--.html', 'ordinary_list_page'),
            ('音乐', 'http://list.iqiyi.com/www/5/----------------iqiyi--.html', 'ordinary_list_page'),
            ('综艺', 'http://list.iqiyi.com/www/6/----------------iqiyi--.html', 'ordinary_list_page'),
            ('娱乐', 'http://list.iqiyi.com/www/7/----------------iqiyi--.html', 'ordinary_list_page'),
            ('旅游', 'http://list.iqiyi.com/www/9/----------------iqiyi--.html', 'ordinary_list_page'),
            ('广告', 'http://list.iqiyi.com/www/20/----------------iqiyi--.html', 'ordinary_list_page'),
            ('风云榜', 'http://top.iqiyi.com', 'top'),
            ('直播中心', 'http://www.iqiyi.com/live/all', 'live'),
            ('拍客', 'http://www.iqiyi.com/paike/list/mrtj.html', 'paike'),
            ('奇秀直播', 'http://x.pps.tv/', 'qixiu'),
        ]
        self.list_page_Lst = [{
            'channel': channel,
            'list_page_url': url,
            'page_type': page_type
        } for channel, url, page_type in catalogue]

        # Only ordinary list pages take part in generic list-page
        # crawling; the special page types are handled by dedicated code.
        ordinary = [entry for entry in self.list_page_Lst
                    if entry['page_type'] == 'ordinary_list_page']
        self.list_page_url_dict = {
            entry['channel']: entry['list_page_url'] for entry in ordinary
        }
        self.legal_list_page_urls = [entry['list_page_url']
                                     for entry in ordinary]
        self.legal_channels = [entry['channel'] for entry in ordinary]