def put_download_info(self):
    """Consume batches from ``self.download_queue`` and download the videos.

    The queue is polled in a loop. Each batch taken from the queue is an
    iterable of dict-like entries; for every downloadable entry a thread
    running ``self.download_video(video_down_url, video_id)`` is started,
    and all threads of the batch are joined before the next poll.

    An entry is reported as failed, and no thread is started, when it has
    no ``id``, when ``js2py_main`` returns ``0`` for its ``video`` field,
    or when ``<id>.mp4`` is already present in ``self.file_list``.

    When the queue is empty the method sleeps and polls again while
    ``self.flag`` is truthy, and returns once ``self.flag`` is falsy.
    """
    while True:
        if self.download_queue.empty():
            if not self.flag:
                break
            # Nothing queued yet, but the flag is still set: wait and retry.
            time.sleep(random.uniform(5, 10))
            continue

        workers = []
        for content in self.download_queue.get():
            video_id = content.get('id', 'Null')
            # Extract and decode the real download URL; 0 signals failure.
            video_down = js2py_main(content.get('video', 'Null'))
            video_down_url = video_down if video_down != 0 else 'Null'

            downloadable = (
                video_id != 'Null'
                and video_down_url != 'Null'
                and '{}.mp4'.format(video_id) not in self.file_list
            )
            if not downloadable:
                # The original formatted an unbound name (``url``) here,
                # which raised NameError; report the video id instead.
                print('{}下载失败!'.format(video_id))
                continue

            worker = Thread(target=self.download_video,
                            args=(video_down_url, video_id))
            worker.start()
            workers.append(worker)

        # Finish the whole batch before taking the next one off the queue.
        for worker in workers:
            worker.join()
        time.sleep(random.uniform(0.5, 1))
def get_info(self, save_dir=r'E:\Study\项目\005爬虫\Spiders\美拍\下载'):
    """Fetch the video/user JSON feed and process every video not seen yet.

    ``self.url`` is requested ``self.num`` times. For each entry under the
    ``medias`` key of a 200 response whose id is neither in
    ``self.save_video_id`` nor already on disk as ``<id>.mp4``, the method
    records the id, collects the metadata, then calls in order
    ``self.get_user``, ``self.save_video_info``,
    ``self.get_comment_content`` and ``self.download_video``, and finally
    sleeps for a short random interval.

    Args:
        save_dir: Directory that is listed once, up front, to find videos
            that were already downloaded. Defaults to the location the
            original code had hard-coded.

    Raises:
        OSError: If ``save_dir`` cannot be listed.
    """
    # Snapshot of the files already downloaded; a set keeps lookups cheap.
    existing_files = set(os.listdir(save_dir))

    for _ in range(self.num):
        # NOTE(review): the Cookie below is a hard-coded session value;
        # consider loading it from configuration instead of source code.
        headers = {
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': "MUSID=7tbos07jmslt847697h7tbsml7; sid=7tbos07jmslt847697h7tbsml7; UM_distinctid=171e49370c4f5-0a8941f624bb11-c373667-144000-171e49370c54cf; MP_WEB_GID=365077837234755; virtual_device_id=f0223988f9b30ab2f30f2d9d81247731; pvid=Ef8Wjj2%2FatZsiVFvt%2FIOVi1bbz2voMqm; CNZZDATA1256786412=910314024-1588676805-%7C1588995742",
            'Host': 'www.meipai.com',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '******',
            'Upgrade-Insecure-Requests': '1',
        }
        json_page = requests.get(self.url, headers=headers)
        # Only a 200 response carries a payload worth parsing.
        if json_page.status_code != 200:
            continue

        all_json = json_page.json()
        # 'medias' may be absent or null; treat that as an empty page
        # instead of failing with TypeError when iterating None.
        contents = all_json.get('medias') or []

        for content in contents:
            item = dict()
            # Video id
            item['video_id'] = content.get('id', 'Null')
            file_name = '{}.mp4'.format(item['video_id'])

            # Skip videos already recorded in this run or saved on disk.
            if item['video_id'] in self.save_video_id or file_name in existing_files:
                print("{}.mp4已存在!".format(item['video_id']))
                continue

            # Remember the id so the same video is not processed twice.
            self.save_video_id.append(item['video_id'])

            # User id
            user = content.get('user', 'Null')
            if user != 'Null':
                item['user_id'] = user.get('id', 0)
            item['client_id'] = content.get('client_id', 'Null')
            # Caption text
            item['caption'] = content.get('caption', 'Null')
            # Video page link
            item['url'] = content.get('url', 'Null')
            # Category
            item['category'] = content.get('category', 'Null')
            # Duration
            item['time'] = content.get('time', 0)
            # Long-video marker
            item['is_long'] = content.get('is_long', 0)
            # Publication time
            item['created_at'] = content.get('created_at', 'Null')
            # Comment count
            item['comments_count'] = content.get('comments_count', 0)
            # Like count; strip the <em> wrapper if present
            item['likes_count'] = str(content.get('likes_count', 0)).replace(
                '<em class="my-count-em">', '').replace('</em>', '')
            # Repost count
            item['reposts_count'] = content.get('reposts_count', 0)

            # Extract and decode the real download URL; 0 signals failure.
            video_down = js2py_main(content.get('video', 'Null'))
            print('{}视频信息获取完成!'.format(item['video_id']))

            # Fetch the uploader's profile.
            self.get_user(user)

            if video_down != 0:
                print('{}视频下载url解析完成!'.format(item['video_id']))
                item['video_down_url'] = video_down
            else:
                item['video_down_url'] = 'Null'

            print('{}视频信息开始保存到数据库!'.format(item['video_id']))
            self.save_video_info(item)
            print('{}视频信息已经保存到数据库!'.format(item['video_id']))

            # Fetch the comments of this video.
            print('{}.mp4开始获取评论!'.format(item['video_id']))
            self.get_comment_content(item['video_id'])

            # Download the video file.
            self.download_video(item['video_down_url'], item['video_id'])

            # Blank line between the console output of two videos.
            print()
            time.sleep(random.uniform(0.5, 2))
def get_video_info(self):
    """Drain ``self.video_queue`` and store the metadata of each new video.

    Every batch taken from the queue is iterated entry by entry. An entry
    whose id is already in ``self.save_video_id`` is only reported;
    otherwise the id is recorded, the metadata dict is assembled and
    handed to ``self.save_video_info``. Any exception raised while
    handling one entry is printed and the next entry is processed.

    When the queue is empty the method sleeps and polls again while
    ``self.flag`` is truthy, and returns once ``self.flag`` is falsy.
    """

    def strip_count_markup(raw):
        # Stringify the counter and drop the <em> wrapper if it is there.
        return str(raw).replace(
            '<em class="my-count-em">', '').replace('</em>', '')

    while True:
        if self.video_queue.empty():
            if not self.flag:
                break
            # Queue is empty but the flag is still set: wait and poll again.
            time.sleep(random.uniform(5, 10))
            continue

        batch = self.video_queue.get()
        for entry in batch:
            try:
                record = dict()
                # Video id
                record['video_id'] = entry.get('id', 'Null')

                # Already recorded: report it and move on.
                if record['video_id'] in self.save_video_id:
                    print("{}视频信息已在数据库中!".format(record['video_id']))
                    continue

                # First time this id is seen: remember it.
                self.save_video_id.append(record['video_id'])

                # User id
                uploader = entry.get('user', 'Null')
                if uploader != 'Null':
                    record['user_id'] = uploader.get('id', 0)
                record['client_id'] = entry.get('client_id', 'Null')
                # Caption text
                record['caption'] = entry.get('caption', 'Null')
                # Video page link
                record['url'] = entry.get('url', 'Null')
                # Category
                record['category'] = entry.get('category', 'Null')
                # Duration
                record['time'] = entry.get('time', 0)
                # Long-video marker
                record['is_long'] = entry.get('is_long', 0)
                # Resolution
                record['pic_size'] = entry.get('pic_size', 0)
                # Publication time
                record['created_at'] = entry.get('created_at', 'Null')
                # Comment, like and repost counters
                record['comments_count'] = strip_count_markup(
                    entry.get('comments_count', 0))
                record['likes_count'] = strip_count_markup(
                    entry.get('likes_count', 0))
                record['reposts_count'] = strip_count_markup(
                    entry.get('reposts_count', 0))

                # Extract and decode the real download URL; 0 means failure.
                decoded = js2py_main(entry.get('video', 'Null'))
                record['video_down_url'] = decoded if decoded != 0 else 'Null'

                # Persist the assembled record.
                self.save_video_info(record)
            except Exception as err:
                print('一条视频信息出错{}'.format(err))

        time.sleep(random.uniform(0.5, 1.5))