示例#1
0
    def put_download_info(self):
        """
        Consume batches from ``self.download_queue`` and download their videos.

        Each queue item is an iterable of dict-like video records. For every
        record the download URL is decoded with ``js2py_main``; when the
        record has an id, a decoded URL, and no ``<id>.mp4`` entry in
        ``self.file_list``, ``self.download_video(url, id)`` is started in
        its own thread. All threads of a batch are joined before the next
        batch is taken from the queue.

        The loop polls: while the queue is empty and ``self.flag`` is truthy
        it sleeps 5-10 seconds and retries; once the queue is empty and
        ``self.flag`` is falsy the method returns.
        """
        while True:
            if not self.download_queue.empty():
                contents = self.download_queue.get()
                download_l = []

                for content in contents:
                    video_id = content.get('id', 'Null')
                    # Extract and decode the video download URL; a return
                    # value of 0 from js2py_main is treated as "no URL".
                    video_down = js2py_main(content.get('video', 'Null'))
                    video_down_url = video_down if video_down != 0 else 'Null'
                    file_name = '{}.mp4'.format(video_id)

                    if (video_id != 'Null' and video_down_url != 'Null'
                            and file_name not in self.file_list):
                        download_thread = Thread(target=self.download_video,
                                                 args=(video_down_url,
                                                       video_id))
                        download_thread.start()
                        download_l.append(download_thread)
                    else:
                        # Report the skipped video by its id. The previous
                        # code formatted an undefined name (`url`) here,
                        # which raised NameError instead of printing.
                        print('{}下载失败!'.format(video_id))

                # Wait for the whole batch before fetching the next one.
                for worker in download_l:
                    worker.join()

                time.sleep(random.uniform(0.5, 1))
            elif self.flag:
                # Queue is empty but the flag is still set: wait and poll
                # again (presumably the producer is still running - confirm
                # against whoever clears self.flag).
                time.sleep(random.uniform(5, 10))
            else:
                break
示例#2
0
文件: meipai.py 项目: xx0746/Spiders
    def get_info(self):
        """
        Fetch the media JSON from ``self.url`` and process every new video.

        The request is repeated ``self.num`` times with a random pause of
        0.5-2 seconds after each one. For each entry under the ``medias``
        key of a 200 response, a video is skipped when its id is already in
        ``self.save_video_id`` or ``<id>.mp4`` already exists in the download
        directory. Otherwise its metadata is collected into a dict, the user
        is handed to ``self.get_user``, the metadata is stored through
        ``self.save_video_info``, comments are fetched through
        ``self.get_comment_content`` and the file is fetched through
        ``self.download_video``.
        """
        # Snapshot of the files already present in the download directory.
        # Kept as a set so the per-video membership test is O(1). The
        # listing is taken once; videos handled during this run are tracked
        # through self.save_video_id instead.
        file_list = set(os.listdir(r'E:\Study\项目\005爬虫\Spiders\美拍\下载'))

        for _ in range(self.num):
            # A fresh random User-Agent is generated for every request.
            headers = {
                'User-Agent': UserAgent().random,
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie':
                "MUSID=7tbos07jmslt847697h7tbsml7; sid=7tbos07jmslt847697h7tbsml7; UM_distinctid=171e49370c4f5-0a8941f624bb11-c373667-144000-171e49370c54cf; MP_WEB_GID=365077837234755; virtual_device_id=f0223988f9b30ab2f30f2d9d81247731; pvid=Ef8Wjj2%2FatZsiVFvt%2FIOVi1bbz2voMqm; CNZZDATA1256786412=910314024-1588676805-%7C1588995742",
                'Host': 'www.meipai.com',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
                'Sec-Fetch-User': '******',
                'Upgrade-Insecure-Requests': '1',
            }
            # NOTE(review): no timeout is passed, so a stalled connection
            # blocks this loop indefinitely - consider adding one together
            # with handling for the resulting exception.
            json_page = requests.get(self.url, headers=headers)
            # Only a 200 status code is treated as a successful request.
            if json_page.status_code == 200:
                # Decode the JSON payload.
                all_json = json_page.json()
                # A missing or null 'medias' entry yields None; fall back to
                # an empty list so the loop below does not raise TypeError.
                contents = all_json.get('medias') or []
                for content in contents:
                    item = dict()
                    # Video id
                    item['video_id'] = content.get('id', 'Null')
                    # Skip videos already recorded in self.save_video_id or
                    # already saved to the local download directory.
                    if item['video_id'] in self.save_video_id or '{}.mp4'.format(
                            item['video_id']) in file_list:
                        print("{}.mp4已存在!".format(item['video_id']))
                    else:
                        # Remember the id so it is not processed twice.
                        self.save_video_id.append(item['video_id'])

                        # User id (left unset when the record has no user)
                        user = content.get('user', 'Null')
                        if user != 'Null':
                            item['user_id'] = user.get('id', 0)

                        item['client_id'] = content.get('client_id', 'Null')
                        # Caption text
                        item['caption'] = content.get('caption', 'Null')
                        # Video page link
                        item['url'] = content.get('url', 'Null')
                        # Category
                        item['category'] = content.get('category', 'Null')
                        # Video duration
                        item['time'] = content.get('time', 0)
                        # Whether this is a long video
                        item['is_long'] = content.get('is_long', 0)
                        # Publication time
                        item['created_at'] = content.get('created_at', 'Null')
                        # Total number of comments
                        item['comments_count'] = content.get(
                            'comments_count', 0)
                        # Likes: the value may be wrapped in an <em> tag,
                        # which is stripped after converting to str.
                        item['likes_count'] = str(content.get(
                            'likes_count',
                            0)).replace('<em class="my-count-em">',
                                        '').replace('</em>', '')
                        # Reposts
                        item['reposts_count'] = content.get('reposts_count', 0)
                        # Extract and decode the video download URL.
                        video_down = js2py_main(content.get('video', 'Null'))

                        print('{}视频信息获取完成!'.format(item['video_id']))
                        # Fetch the user's information. Note that `user` is
                        # the string 'Null' when the record had no user.
                        self.get_user(user)

                        # js2py_main returning 0 is treated as "no URL".
                        if video_down != 0:
                            print('{}视频下载url解析完成!'.format(item['video_id']))
                            item['video_down_url'] = video_down
                        else:
                            item['video_down_url'] = 'Null'

                        print('{}视频信息开始保存到数据库!'.format(item['video_id']))
                        self.save_video_info(item)
                        print('{}视频信息已经保存到数据库!'.format(item['video_id']))
                        # Fetch the comments of this video.
                        print('{}.mp4开始获取评论!'.format(item['video_id']))
                        self.get_comment_content(item['video_id'])
                        # Download the video file.
                        self.download_video(item['video_down_url'],
                                            item['video_id'])
                    # Blank line between the output of consecutive videos.
                    print()

            time.sleep(random.uniform(0.5, 2))
示例#3
0
    def get_video_info(self):
        """
        Consume batches from ``self.video_queue`` and store each new video.

        Every queue item is an iterable of dict-like video records. A record
        whose id is already in ``self.save_video_id`` is only reported;
        otherwise its id is remembered, its fields are collected into a dict
        and the dict is passed to ``self.save_video_info``. Any exception
        raised while handling a single record is printed and the batch
        continues with the next record.

        While the queue is empty and ``self.flag`` is truthy the method
        sleeps 5-10 seconds and polls again; it returns once the queue is
        empty and ``self.flag`` is falsy.
        """
        # Fields copied verbatim from the record, with their fallbacks,
        # in the order they are inserted into the result.
        plain_fields = (
            ('client_id', 'Null'),
            ('caption', 'Null'),     # caption text
            ('url', 'Null'),         # video page link
            ('category', 'Null'),    # category
            ('time', 0),             # video duration
            ('is_long', 0),          # whether this is a long video
            ('pic_size', 0),         # resolution
            ('created_at', 'Null'),  # publication time
        )
        # Counters (comments, likes, reposts) that may arrive wrapped in
        # an <em> tag.
        counter_fields = ('comments_count', 'likes_count', 'reposts_count')

        def strip_em(raw):
            # Convert to str and drop the surrounding <em> markup.
            text = str(raw)
            text = text.replace('<em class="my-count-em">', '')
            return text.replace('</em>', '')

        while True:
            if self.video_queue.empty():
                if not self.flag:
                    break
                time.sleep(random.uniform(5, 10))
                continue

            batch = self.video_queue.get()
            for content in batch:
                try:
                    video_id = content.get('id', 'Null')
                    record = {'video_id': video_id}

                    # Already recorded: report it and move on.
                    if video_id in self.save_video_id:
                        print("{}视频信息已在数据库中!".format(video_id))
                        continue

                    # Remember the id so it is not processed twice.
                    self.save_video_id.append(video_id)

                    # User id (left unset when the record has no user)
                    author = content.get('user', 'Null')
                    if author != 'Null':
                        record['user_id'] = author.get('id', 0)

                    for key, fallback in plain_fields:
                        record[key] = content.get(key, fallback)

                    for key in counter_fields:
                        record[key] = strip_em(content.get(key, 0))

                    # Extract and decode the video download URL; a return
                    # value of 0 from js2py_main is treated as "no URL".
                    decoded = js2py_main(content.get('video', 'Null'))
                    record['video_down_url'] = decoded if decoded != 0 else 'Null'

                    # Persist the video information.
                    self.save_video_info(record)
                except Exception as e:
                    print('一条视频信息出错{}'.format(e))

            time.sleep(random.uniform(0.5, 1.5))