Example #1
    def process_one_video(self, line):
        video_info = copy.deepcopy(self.video_data)
        # Each field gets its own try/except because any node may be missing
        # from the search-result row.
        try:
            video_info['title'] = line.find('a', {'target': 'video'})['title']
        except (TypeError, KeyError):
            video_info['title'] = None
        try:
            url = line.find('a', {'target': 'video'})['href']
            video_info['url'] = 'https:' + url
        except (TypeError, KeyError):
            video_info['url'] = None
        try:
            play_count_str = line.find('span', {'class': 'v-num'}).text
            video_info['play_count'] = trans_play_count(play_count_str)
        except Exception:
            video_info['play_count'] = 0
            # logging.warning("can't get play_count at page %s" % video_info['url'])
        try:
            release_time_str = line.find('span', {
                'class': 'v-publishtime'
            }).text
            video_info['release_time'] = trans_strtime_to_timestamp(
                input_time=release_time_str, missing_year=True)
        except Exception:
            video_info['release_time'] = 0
            # logging.warning("can't get release_time at page %s" % video_info['url'])
        try:
            dura_str = line.find('span', {'class': 'v-time'}).text
            video_info['duration'] = trans_duration(dura_str)
        except Exception:
            video_info['duration'] = 0
            # logging.warning("can't get duration at page %s" % video_info['url'])
        fetch_time = int(time.time() * 1e3)
        video_info['fetch_time'] = fetch_time
        return video_info
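
The repeated try/except pattern above can be collapsed into a single helper. A minimal sketch of that pattern, assuming only BeautifulSoup; safe_extract and the sample markup are illustrative, not part of the original code:

from bs4 import BeautifulSoup

def safe_extract(tag, name, attrs, key=None, default=None):
    """Return an attribute (key) or the text of tag.find(name, attrs), or default if the node is absent."""
    node = tag.find(name, attrs)
    if node is None:
        return default
    if key is not None:
        return node.get(key, default)
    return node.text

html = '<li><a target="video" title="demo" href="//v.example.com/1">demo</a></li>'
row = BeautifulSoup(html, 'html.parser')
print(safe_extract(row, 'a', {'target': 'video'}, key='title'))  # -> 'demo'
print(safe_extract(row, 'span', {'class': 'v-num'}, default=0))  # -> 0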
Example #2
def sogou_info_page(keyword):
    result_lst = []
    for page_num in range(1, 11):
        search_url = 'http://news.sogou.com/news?query=%s&page=%s' % (
            keyword, page_num)
        get_page = requests.get(search_url, headers=headers)
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        news_lst = soup.find_all('div', {'class': 'vrwrap'})
        for line in news_lst:
            try:
                title = line.div.h3.a.text
                url = line.div.h3.a['href']
                source_and_release_time = line.find('p', {
                    'class': 'news-from'
                }).text
                source_and_release_time_lst = source_and_release_time.split(
                    '\xa0')
                source = source_and_release_time_lst[0]
                release_time_str = source_and_release_time_lst[-1]
                release_time = trans_strtime_to_timestamp(release_time_str)
                try:
                    content = line.find('span').text
                except AttributeError:
                    print('no content at %s' % title)
                    content = 'missing'
                fetch_time = int(time.time() * 1000)
                try:
                    similar_news = line.find('a', {'id': 'news_similar'}).text
                except AttributeError:
                    print('no similar news at %s' % title)
                    similar_news = 'missing'
                news_info = {
                    'title': title,
                    'url': url,
                    'source': source,
                    'release_time': release_time,
                    'fetch_time': fetch_time,
                    'content': content,
                    'similar_news': similar_news,
                    'keyword': keyword
                }
                result_lst.append(news_info)
                print('get data at page %s' % page_num)
            except Exception:
                print('the error occurred at position %s' % news_lst.index(line))
    return result_lst
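
A minimal call sketch, assuming the module-level headers dict and the trans_strtime_to_timestamp helper that the function references are defined elsewhere in the source; the keyword is illustrative:

if __name__ == '__main__':
    # Crawl the first ten Sogou News result pages for one keyword.
    news = sogou_info_page('python')
    print('collected %d news items' % len(news))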
Example #3
    def video_page(self,
                   url,
                   output_to_file=False,
                   filepath=None,
                   releaser_page_num_max=30,
                   output_to_es_raw=False,
                   es_index=None,
                   doc_type=None,
                   output_to_es_register=False,
                   push_to_redis=False,
                   *args,
                   **kwargs):
        """
        get video info from api instead of web page html
        the most scroll page is 1000
        """
        releaser = ""
        count = 1
        result_list = []
        page_count = 0
        size_num = 0
        # releaser_id = self.get_releaser_id(url)
        while count < releaser_page_num_max:
            if size_num > 1000:
                size_num = 0

            size_num += 20
            count += 1
            url_dic = {
                'channel': 'T1457068979049',
                'subtab': 'Video_Recom',
                'size': "10",
                'offset': size_num,
                'fn': '3',
                'devId':
                'sklfRdL61S9GUQ4M7DSzdvA6U6LFEZr0pAEonUVTJrYHNFmgkLuyUgNU6zUV7MVx',
                'version': '33.2.1',
                'net': 'wifi',
                'ts': '1557126556',
                'sign':
                'YTk73p++NeCfCJRpZkThWxGYX0gVcFWjUVLCRIRwftV48ErR02zJ6/KXOnxX046I',
                'encryption': '1',
                'canal': 'lite_wifi_cpa10',
                'mac': 'racUMC0A9havm+He6jH3YAvVdjgSXYDtwEDZ03eH1l8='
            }

            releaserUrl = 'https://c.m.163.com/recommend/getChanListNews?%s' % urllib.parse.urlencode(
                url_dic)
            print(releaserUrl)
            page_count += 20
            get_page = requests.get(releaserUrl, headers=self.headers)
            page_dic = get_page.json()
            data_list = page_dic.get("视频")
            # print(data_list)
            # print(releaserUrl)
            if data_list == []:
                print("no more data at releaser: %s page: %s " %
                      (releaser, count))
                pcursor = "no_more"
                continue
            else:
                print("get data at  page: %s" % (count))

                for info_dic in data_list:
                    skipID = info_dic.get("vid")
                    video_dic = copy.deepcopy(self.video_data)
                    video_dic['title'] = info_dic.get('title')
                    video_dic['url'] = "https://c.m.163.com/news/v/%s.html" % skipID
                    video_dic['releaser'] = info_dic.get('topicName')
                    video_dic['releaserUrl'] = "https://c.m.163.com/news/sub/%s.html" % info_dic.get(
                        "videoTopic").get("tid")
                    video_dic['releaser_id_str'] = "网易新闻_%s" % self.get_releaser_id(
                        video_dic['releaserUrl'])
                    try:
                        video_dic['release_time'] = int(info_dic.get('ptime'))
                    except (TypeError, ValueError):
                        video_dic['release_time'] = trans_strtime_to_timestamp(
                            info_dic.get('ptime'))
                    video_dic['play_count'] = info_dic.get("playCount")
                    video_dic['comment_count'] = info_dic.get('replyCount')
                    video_dic['favorite_count'] = info_dic.get('voteCount')
                    if not video_dic['play_count']:
                        video_dic['play_count'] = 0
                    if not video_dic['favorite_count']:
                        video_dic['favorite_count'] = 0
                    video_dic['video_id'] = skipID
                    video_dic['fetch_time'] = int(time.time() * 1e3)
                    video_dic['duration'] = info_dic.get("length")
                    video_dic['video_img'] = self.get_video_image(info_dic)

                    result_list.append(video_dic)
                    if len(result_list) >= 100:
                        output_result(
                            result_Lst=result_list,
                            platform=self.platform,
                            output_to_file=output_to_file,
                            filepath=filepath,
                            output_to_es_raw=output_to_es_raw,
                            es_index=es_index,
                            doc_type=doc_type,
                            output_to_es_register=output_to_es_register)
                        result_list.clear()
        if result_list:
            output_result(result_Lst=result_list,
                          platform=self.platform,
                          output_to_file=output_to_file,
                          filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index,
                          doc_type=doc_type,
                          output_to_es_register=output_to_es_register)
            result_list.clear()
        return result_list
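
The loop above grows offset by 20 per request and wraps it once it passes 1000, matching the scroll cap noted in the docstring. A standalone sketch of that pagination pattern; paged_urls is a hypothetical helper, not part of the original class:

import urllib.parse

def paged_urls(base_url, pages, step=20, offset_cap=1000):
    """Yield feed URLs whose offset grows by step and wraps at the scroll cap."""
    offset = 0
    for _ in range(pages):
        if offset > offset_cap:
            offset = 0  # the API rejects deeper offsets, so start over
        offset += step
        yield '%s?%s' % (base_url,
                         urllib.parse.urlencode({'offset': offset, 'size': 10}))

for u in paged_urls('https://c.m.163.com/recommend/getChanListNews', pages=3):
    print(u)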
Example #4
    def releaser_page_app(self,
                          releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=4000,
                          output_to_es_raw=False,
                          es_index=None,
                          doc_type=None,
                          output_to_es_register=False,
                          push_to_redis=False,
                          proxies_num=None):
        """
        get video info from api instead of web page html
        the most scroll page is 1000
        """

        headers = {
            'Host':
            'apis.tudou.com',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding':
            'gzip, deflate',
            'Connection':
            'keep-alive',
            'Cookie':
            ('isg=BIeH6gcJlwZw_xQESm9jlG-vFTuRJGXxikf0g1l0mJY9yKeKYVuAvzKJbkgzOzPm;'
             'cna=XA2EFIGslWoCAWp4y3KXcZh7; ykss=cdbd115c102a68710215ad93;'
             '__ysuid=1543316262167mjE; P_ck_ctl=62DE1D55DFE1C0F4F27A8662E6575F08;'
             '__ayvstp=32'),
            'Upgrade-Insecure-Requests':
            '1',
            'Cache-Control':
            'max-age=0'
        }

        count = 1
        retry_time = 0
        result_list = []
        releaser_id = self.get_releaser_id(releaserUrl)
        releaser = self.get_releaser_name(releaserUrl)
        releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
        self.video_data['releaser'] = releaser
        self.video_data['releaserUrl'] = releaserUrl
        url_dic = {"uid": releaser_id, "pL": "20"}
        print("working on releaser: %s releaser_id: %s" %
              (releaser, releaser_id))
        while count <= releaser_page_num_max and retry_time < 5:
            proxies = get_proxy(proxies_num)
            url_dic['pg'] = str(count)
            url_dic['pn'] = str(count)
            api_url = 'http://apis.tudou.com/subscribe/v1/video?%s' % urllib.parse.urlencode(
                url_dic)
            # print(api_url)
            if proxies:
                get_page = requests.get(api_url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=5)
            else:
                get_page = requests.get(api_url, headers=headers, timeout=5)
            page_dic = get_page.json()
            try:
                data_list = page_dic['entity']
            except KeyError:
                retry_time += 1
                time.sleep(0.25)
                print("no more data at releaser: %s page: %s try_time: %s" %
                      (releaser, count, retry_time))
                continue
            if not data_list:
                retry_time += 1
                time.sleep(0.25)
                print("no more data at releaser: %s page: %s try_time: %s" %
                      (releaser, count, retry_time))
                continue
            else:
                retry_time = 0
                print("get data at releaser: %s page: %s" % (releaser, count))
                count += 1
                for info_dic in data_list:
                    video_info = copy.deepcopy(self.video_data)
                    one_video = info_dic.get('detail')
                    if one_video is not None:
                        get_title = one_video.get('base_detail')
                        if get_title is not None:
                            video_info['title'] = get_title.get('title')
                        detail_info = one_video.get('video_detail')
                        if detail_info is not None:
                            video_id = detail_info.get('video_id')
                            if video_id is not None:
                                video_info['video_id'] = video_id
                                video_info['url'] = 'https://video.tudou.com/v/%s.html' % video_id
                            video_info['duration'] = detail_info.get(
                                'duration')
                            video_info['releaser_id_str'] = "new_tudou_%s" % (
                                releaser_id)
                            video_info['comment_count'] = int(
                                detail_info.get('comment_count') or 0)
                            video_info['favorite_count'] = int(
                                detail_info.get('praiseNumber') or 0)
                            # In the database, favorite_count is the like count
                            # (点赞数), which the web page calls praiseNumber;
                            # on the web page, favorite_count is the bookmark
                            # count (收藏数).
                            video_info['shoucang_count'] = detail_info.get(
                                'favorite_count')
                            video_info['play_count'] = detail_info.get(
                                'vv_count')
                            video_info['video_img'] = self.get_video_image(
                                detail_info)
                            release_time_str = detail_info.get('publish_time')
                            print(release_time_str)
                            # '天前' means "days ago": relative dates need the
                            # absolute timestamp from the video page itself.
                            if '天前' in release_time_str:
                                video_info['release_time'] = self.video_page(
                                    video_info['url'])['release_time']
                            else:
                                video_info['release_time'] = trans_strtime_to_timestamp(
                                    input_time=release_time_str,
                                    missing_year=True)
                            video_info['fetch_time'] = int(time.time() * 1e3)
                            yield video_info
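
releaser_page_app is a generator: it yields one video_info dict per video, so the caller decides how to store or batch the results. A minimal consumption sketch; the class name NewTudouCrawler and the releaser URL are placeholders, not from the original source:

# NewTudouCrawler is a hypothetical name for the class this method belongs to.
crawler = NewTudouCrawler()
releaser_url = 'https://id.tudou.com/i/EXAMPLE_ID/videos'  # placeholder URL
for video_info in crawler.releaser_page_app(releaser_url,
                                            releaser_page_num_max=3):
    print(video_info['title'], video_info['play_count'])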
Example #5
    def releaser_page_web(self,
                          releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=30,
                          output_to_es_raw=False,
                          output_to_es_register=False,
                          push_to_redis=False,
                          es_index=None,
                          doc_type=None,
                          fetchFavoriteComment=True):
        pid = os.getpid()
        releaser_id = self.get_releaser_id(releaserUrl)
        print('releaser_id is %s' % releaser_id)
        result_lst = []
        # video_info = self.video_data
        page_num = 0
        has_more = True
        ctime = ""
        count_false = 0
        # proxies = None
        proxies = get_proxy_dic()
        while page_num <= releaser_page_num_max and has_more:

            post_url = 'https://haokan.baidu.com/haokan/wiseauthor?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10&video_type=media&sort_type=sort_by_time'.format(
                releaser_id, page_num, ctime)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                "referer":
                "https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh,zh-CN;q=0.9",
                "content-type": "application/x-www-form-urlencoded"
            }
            try:
                if page_num == 0:
                    for loop in range(5):
                        get_page = requests.get(releaserUrl,
                                                headers=headers,
                                                timeout=3,
                                                proxies=proxies)
                        # print(get_page.text)
                        page_dic, fans_num = self.web_first_pag(get_page.text)
                        if page_dic['apiData']['video']['results']:
                            page_num += 1
                            break
                else:
                    get_page = requests.get(post_url,
                                            headers=headers,
                                            timeout=3)
                    page_dic = get_page.json()
                    page_num += 1
                    # print(page_dic)
            except Exception:
                continue
            try:
                info_lst = page_dic['apiData']['video']['results']
            except KeyError:
                info_lst = []
            try:
                ctime = page_dic['apiData']['video']['ctime']
                has_more = page_dic['apiData']['video']['has_more']
            except KeyError:
                has_more = False
            if info_lst:
                count_false = 0
                print("Process %s is processing %s at page %s" %
                      (pid, releaser_id, page_num))
                time.sleep(random.uniform(1, 2))  # pause 1-2 s between pages
                for line in info_lst:
                    video_data = copy.deepcopy(self.video_data_template)
                    video_data['title'] = line['content']['title']
                    video_id = line['content']['vid']
                    video_data['video_id'] = video_id
                    # partial_url = '{"nid":"sv_%s"}' % video_id
                    # partial_url_encode = urllib.parse.quote_plus(partial_url)
                    video_data['url'] = line['content']["video_short_url"]
                    video_data['play_count'] = line['content']['playcnt']
                    video_data['favorite_count'] = int(
                        line['content']['praiseNum'])
                    try:
                        video_data['comment_count'] = int(
                            line['content']['commentNum'])
                    except (KeyError, TypeError, ValueError):
                        video_data['comment_count'] = 0
                    video_data['releaser_followers_count'] = int(fans_num)
                    try:
                        video_data['duration'] = trans_duration(
                            line['content']['duration'])
                    except Exception:
                        video_data['duration'] = 0
                    video_data['releaser'] = line['content']['author']
                    video_data['releaser_id_str'] = "haokan_%s" % (
                        line['content']['authorid'])
                    video_data['releaserUrl'] = (
                        'https://haokan.baidu.com/haokan/wiseauthor?app_id=' +
                        line['content']['authorid'])
                    fetch_time = int(time.time() * 1e3)
                    video_data['fetch_time'] = fetch_time
                    releaser_time_str = line['content']['publish_time']
                    video_data['release_time'] = trans_strtime_to_timestamp(
                        input_time=releaser_time_str)
                    print(
                        video_id, releaser_time_str,
                        datetime.datetime.fromtimestamp(
                            video_data['release_time'] / 1000), page_num)
                    yield video_data
            else:
                count_false += 1
                if count_false < 5:
                    continue
                else:
                    break
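
This method also yields one video_data dict at a time, which makes it easy to stream results straight to disk. A sketch under the assumption that the surrounding class (not shown in this excerpt) is instantiated as crawler; the app_id is the one from the referer header above:

import json

# `crawler` is a hypothetical instance of the class that defines releaser_page_web.
with open('haokan_videos.jsonl', 'w', encoding='utf-8') as f:
    for video_data in crawler.releaser_page_web(
            'https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358',
            releaser_page_num_max=2):
        f.write(json.dumps(video_data, ensure_ascii=False) + '\n')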