def getVideoList(self, seterId, setLink):
    """Fetch the list of all videos that belong to one video set.

    Delegates to ``self._taskObj.getVideoList`` and logs before and after
    the call.

    Args:
        seterId: identifier of the stored video set.
        setLink: link of the video set page.
    """
    startMessage = "Task:{} getVideoList seterId {}".format(
        self._taskName, seterId)
    Util.info(startMessage)

    self._taskObj.getVideoList(seterId, setLink)

    endMessage = "Task:{} getVideoList end".format(self._taskName)
    Util.info(endMessage)
def exists(self, name, setId):
    """Tell whether a video with this name is already stored for the set.

    Args:
        name: video name, converted with the ``name`` field definition.
        setId: owning set id, converted with the ``setId`` field definition.

    Returns:
        True when ``find_one`` yields a document, otherwise False.
    """
    criteria = {
        "name": Util.conv2(name, self.videoListFields['name']),
        'setId': Util.conv2(setId, self.videoListFields['setId']),
    }
    found = self._db.find_one(criteria)
    return bool(found)
def exists(self, title, platform):
    """Tell whether a video set with this title is stored for the platform.

    Args:
        title: set title, converted with the ``title`` field definition.
        platform: platform value, converted with the ``platform`` field
            definition.

    Returns:
        True when ``find_one`` yields a document, otherwise False.
    """
    criteria = {
        "title": Util.conv2(title, self.videoSetFields['title']),
        'platform': Util.conv2(platform, self.videoSetFields['platform']),
    }
    found = self._db.find_one(criteria)
    return bool(found)
def __init__(self, args):
    """Log which downloader is used and keep the arguments.

    Command-line options noted for this downloader:

        -O FILE, --output-filename FILE   Set output filename
        -o DIR,  --output-dir DIR         Set output directory

    Args:
        args: stored unchanged on ``self._args``.
    """
    banner = 'Do {} downloader'.format(self._downloader)
    Util.info(banner)
    self._args = args
def getSetContent(self, link):
    """Fetch and store one video set, then fetch its video list.

    The set itself is handled by ``self._taskObj.getSetContent``; when that
    call does not report ``False`` its result is used as the set id for
    ``self.getVideoList``.

    Args:
        link: link of the video set page.
    """
    Util.info("Task:{} getSetContent link {}".format(self._taskName, link))
    seterId = self._taskObj.getSetContent(link)
    Util.info("Task:{} getSetContent end".format(self._taskName))

    # Equality (not identity) against False is kept on purpose so the
    # comparison behaves exactly as before for every returned value.
    if seterId == False:  # noqa: E712
        return
    self.getVideoList(seterId, link)
def getSetContent(self, link):
    """Scrape one video-set page and store the set.

    Args:
        link: URL of the video-set page.

    Returns:
        The value returned by the VideoSet model's ``newSet`` when the set
        is stored. ``False`` when the set already exists for this platform
        or when the page lacks the title, summary or cover image.
    """
    r = Util.getPage(link, proxy=self._isProxy)
    html = BeautifulSoup(r.text, self.getConfig('FETCH_LIBRARY'))

    titleNode = html.find('a', {'class': 'info-intro-title'})
    if titleNode is None:
        Util.error('Set title not found on {}'.format(link))
        return False
    title = titleNode.text

    # A set is stored only once per platform. Checking right after the title
    # is known avoids the remaining parsing and the extra score request for
    # sets that are already stored.
    if self.getModel('VideoSet').exists(title, self.platform):
        Util.info('Set exists {}'.format(title))
        return False

    movieInfo = {
        'link': link,  # original link
        'title': title,
        # VIP-only flag: the page carries this icon for VIP titles
        'is_vip': 1 if html.find_all('img', {'class': 'icon-viedo-mr'}) else 0,
    }

    # Several summary spans may be present; the last one is used (the
    # complete text when a complete one exists).
    summaryList = html.find_all('span', {'class': "briefIntroTxt"})
    if not summaryList:
        Util.error('Set summary not found on {}'.format(link))
        return False
    movieInfo['summary'] = summaryList[-1].text

    imgBox = html.find('div', {'class': 'info-img'})
    imgTag = imgBox.img if imgBox is not None else None
    imgSrc = imgTag.get('src') if imgTag is not None else None
    if not imgSrc:
        Util.error('Set image not found on {}'.format(link))
        return False
    movieInfo['img'] = imgSrc  # small image
    movieInfo['img_large'] = imgSrc.replace('195_260', '480_360')  # large image

    # Optional attributes: a missing element simply leaves the key unset.
    try:
        movieInfo['area'] = html.find("p", {'class': "episodeIntro-area"}).a.text
    except AttributeError:
        pass
    try:
        movieInfo['lang'] = html.find('p', {'class': 'episodeIntro-lang'}).a.text
    except AttributeError:
        pass
    try:
        movieInfo['category'] = [
            x.text
            for x in html.find('p', {'class': "episodeIntro-type"}).find_all('a')
        ]
    except AttributeError:
        pass
    try:
        movieInfo['hot'] = html.find(
            "span", {"class": "heat-info"}).text.replace('热度', '').strip()
    except AttributeError:
        pass

    # Score of the set; falls back to 0 when the page exposes no tvid.
    scoreNode = html.find("span", {'class': "effect-score"})
    tvid = scoreNode.get('data-score-tvid') if scoreNode is not None else None
    movieInfo['score'] = self.getSetScore(tvid, tvid) if tvid else 0

    # Always stored with the same episode_over value (single import).
    movieInfo['episode_over'] = 2

    return self.getModel('VideoSet').newSet(movieInfo, self.platform)
def parseArgs(self):
    """Parse ``sys.argv`` into the task settings of this runner.

    Options:
        -h                          Print usage and exit
        -d NAME, --download=NAME    Run a download task
        -t                          Test mode: nothing is written, only the
                                    fetch is tested and it returns at once
        --task=TASKNAME             Run a task
        --background=BGNAME         Run a background task
        --process=PROCESSNAME       Method to call (optional)
        --params=PARAMS             Parameter for that method (optional)

    Sets ``_taskType``/``_taskName``, ``_process``, ``_args`` and ``_test``
    for the options that are present. Exits with status 2 on an option
    error.
    """
    try:
        # "d:" -> -d takes the download name; "t" -> -t is a plain flag.
        # "params=" must be listed, otherwise --params is rejected.
        opts, args = getopt.getopt(
            sys.argv[1:], "hd:t",
            ["download=", 'task=', 'background=', 'process=', 'params='])
    except getopt.GetoptError:
        print('Key -h see keymap.')
        sys.exit(2)

    Util.info((opts, args))
    for opt, arg in opts:
        if opt == '-h':
            print('Run.py')
            print('Add a tasker')
            print(' -d <download> --download=<download name>')
            print(' --task=<task name>')
            print(' --background=<background name>')
            print('Optional add a process')
            print(' --process=<process name>')
            print('Optional add some process params')
            print(' --params=<params>')
            sys.exit()
        elif opt in ("-d", "--download"):
            self._taskType = 'download'
            self._taskName = arg
        elif opt == "--task":
            self._taskType = 'task'
            self._taskName = arg
        elif opt == "--background":
            self._taskType = 'background'
            self._taskName = arg
        elif opt == "--process":
            self._process = arg
        elif opt == "--params":
            self._args = arg
        elif opt == '-t':
            # test mode
            self._test = True
def getUnDlVideo(self, setId, uid):
    """Find one video of the set that has no ``plays.<uid>`` entry.

    Args:
        setId: set id, converted with the ``setId`` field definition.
        uid: id whose ``plays`` sub-key must be absent.

    Returns:
        Whatever ``find_one`` returns for that query.
    """
    criteria = {"setId": Util.conv2(setId, self.videoListFields['setId'])}
    criteria["plays." + str(uid)] = {'$exists': False}
    return self._db.find_one(criteria)
def getCategoryList(self, fLink):
    """Collect the video-set links listed on one category page.

    Only the links are taken here; every other piece of information is
    read later from the set page itself.

    Args:
        fLink: URL of the category page (page number already filled in).

    Returns:
        A list of stripped ``href`` strings, one per list item that has a
        link. Empty when the page has no listing container.
    """
    Util.info('Task:Iqiyi getCategoryList')
    Util.info("Do on Cate: {}".format(fLink))

    r = Util.getPage(fLink, proxy=self._isProxy)
    html = BeautifulSoup(r.text, self.getConfig('FETCH_LIBRARY'))
    mainContent = html.find(
        'ul', {'class': 'site-piclist site-piclist-180236 site-piclist-auto'})
    if mainContent is None:
        # Without the container there is nothing to iterate over.
        Util.info('Category list container not found on {}'.format(fLink))
        return []

    cateData = []
    for li in mainContent.find_all('li'):
        try:
            cateData.append(li.a['href'].strip())
        except (KeyError, TypeError):
            # KeyError: the <a> has no href; TypeError: the <li> has no <a>.
            Util.info("未找到影片集链接跳过")
    return cateData
def createRepo(self, lastRepoId):
    """Create the remote repository named after ``lastRepoId``.

    Posts the create-repo request and interprets the status code:
    422 is logged as "already exists", 201 is followed by ``createPage``
    and ``setRepoCname``; both count as success.

    Args:
        lastRepoId: used (formatted as text) as the repository name.

    Returns:
        True for status 422 or 201, False for any other status.
    """
    # repo name
    self._apiCreateRepo['params']['name'] = '{}'.format(lastRepoId)

    endpoint = self._apiCreateRepo['url'].format(
        self._config.WAREHOUSE['token'])
    payload = json.dumps(self._apiCreateRepo['params'])
    info = Util.postPage(endpoint, payload)

    status = info.status_code
    if status == 422:
        # already exists
        Util.info('仓库已存在')
        return True
    if status == 201:
        self.createPage(lastRepoId)
        self.setRepoCname(lastRepoId)
        return True

    Util.error('本地仓库创建失败')
    return False
def setRepoCname(self, repoName):
    """Send the CNAME update request for a repository.

    The request body is ``self._apiUpdateCname['params']`` as-is (the
    assignment of the ``cname`` parameter is currently disabled).

    Args:
        repoName: repository name inserted into the request URL.

    Returns:
        True when the response status is 400, otherwise False.
    """
    # NOTE(review): status 400 is treated as success here - confirm this is
    # intended for the endpoint being called.
    warehouse = self._config.WAREHOUSE
    endpoint = self._apiUpdateCname['url'].format(
        warehouse['repoName'], repoName, warehouse['token'])
    payload = json.dumps(self._apiUpdateCname['params'])
    previewHeaders = {
        'Accept': 'application/vnd.github.mister-fantastic-preview+json'
    }
    cnameRes = Util.putPage(endpoint, payload, headers=previewHeaders)

    if cnameRes.status_code != 400:
        Util.info('仓库创建失败 {}'.format(cnameRes))
        return False
    Util.info('仓库创建成功')
    return True
def getDledVideoListCount(self, setId, uid):
    """Count the videos of a set that have a ``plays.<uid>`` entry.

    Args:
        setId: set id, converted with the ``setId`` field definition.
        uid: id whose ``plays`` sub-key must exist.

    Returns:
        The result of ``count()`` on the matching cursor.
    """
    criteria = {"setId": Util.conv2(setId, self.videoListFields['setId'])}
    criteria["plays." + str(uid)] = {'$exists': True}
    cursor = self._db.find(criteria)
    return cursor.count()
def __init__(self):
    """Parse the command line, build the requested task and run it."""
    super().__init__()
    Util.info('Run init')
    self.parseArgs()

    # --- test values ---
    # NOTE(review): these assignments run unconditionally, so they replace
    # whatever parseArgs() stored for the same attributes - confirm whether
    # they should still be active.
    # Alternative test setup that was used before:
    #   self._taskType = 'Download'
    #   self._taskName = ''
    #   self._process = 'dlFile'
    #   self._args = {'videoId': '5c19fddde203c64bdc19299b'}
    self._taskType = 'Background'
    self._taskName = 'ToWarehouse'
    self._process = 'process'
    self._args = {
        'file': '0125/VEYCaSRIBFn.mp4',
        'id': '5c19fddde203c64bdc19299b',
    }
    # --- test values end ---

    # Build the task selected by type and name ...
    factory = self.getFactory(self._taskType)
    taskFactory = factory(self._taskName, isTest=self._test)
    # ... and call the selected method on it with the arguments.
    handler = getattr(taskFactory, self._process)
    handler(self._args)
def __init__(self, args):
    """Log which downloader is used and keep the arguments.

    Command-line options noted for this downloader:

        --get-duration          Simulate, quiet but print video length
        --get-filename          Simulate, quiet but print output filename
        --get-format            Simulate, quiet but print output format
        --proxy URL             Use the specified HTTP/HTTPS/SOCKS proxy.
                                To enable SOCKS proxy, specify a proper
                                scheme, for example
                                socks5://127.0.0.1:1080/. Pass in an empty
                                string (--proxy "") for direct connection
        -f, --format FORMAT     Video format code, see the "FORMAT
                                SELECTION" for all the info
        --all-formats           Download all available video formats
        --prefer-free-formats   Prefer free video formats unless a specific
                                one is requested
        -o, --output TEMPLATE   Output filename template, see the "OUTPUT
                                TEMPLATE" for all the info

    Args:
        args: stored unchanged on ``self._args``.
    """
    banner = 'Do {} downloader'.format(self._downloader)
    Util.info(banner)
    self._args = args
def modifyEpisode(self, data, _id):
    """Update the ``episode`` field of one stored video set.

    Args:
        data: dict carrying the new value under ``'episode'``.
        _id: document id; anything that is not an ``ObjectId`` is wrapped
            in one.

    Returns:
        False when ``data`` has no usable ``episode`` value or no document
        matched ``_id``; True when a document matched.
    """
    episode = data.get('episode')
    if not episode:
        return False
    if not isinstance(_id, ObjectId):
        _id = ObjectId(_id)

    result = self._db.update_one(
        {"_id": _id},
        {"$set": {
            "episode": Util.conv2(episode, self.videoSetFields['episode'])
        }})
    # The result object itself is always truthy, so look at how many
    # documents matched. matched_count (not modified_count) keeps
    # "value already up to date" a success.
    return result.matched_count > 0
def getCategoryList(self, args):
    """Walk every category link page by page and process each set found.

    Args:
        args: sequence ``(startPage, endPage)``. A missing or falsy
            ``startPage`` defaults to 1, a missing or falsy ``endPage`` to
            10. ``endPage`` is used as the number of pages to fetch: the
            pages visited are ``startPage`` .. ``startPage + endPage - 1``.

    Returns:
        None. Stops early (after logging an error) when the task object
        has no category links.
    """
    Util.info("Task:{} getCategoryList args {}".format(self._taskName, args))

    cateLinks = self._taskObj.cateLinks
    if not cateLinks:
        Util.error('Task:{} cateLinks can not empty on getCategoryList'.format(
            self._taskName))
        # Nothing to iterate over (and iterating None would raise).
        return

    startPage = args[0] if args and args[0] else 1
    endPage = args[1] if args and len(args) > 1 and args[1] else 10

    # every category link
    for link in cateLinks:
        # every page
        for page in range(startPage, startPage + endPage):
            print("Do page {}".format(page))
            # link with the page number filled in
            fLink = link.format(str(page))
            # links of the sets listed on that page
            setList = self._taskObj.getCategoryList(fLink)
            if not setList:
                # nothing to process on this page
                Util.info('Category {} with page {} empty'.format(link, page))
                continue
            # process every set
            for setInfo in setList:
                self.getSetContent(setInfo)
def getSetScore(self, tvid, qipu_ids):
    """Fetch the score of a video set.

    The response text is searched for the ``"sns_score":`` marker; the
    score is the text between that marker and the next ``}]});``.

    Args:
        tvid: second value inserted into ``self._scoreUrl``.
        qipu_ids: first value inserted into ``self._scoreUrl``.

    Returns:
        The score as the raw text found in the response, or ``0`` when the
        marker or the terminator is missing, or when a KeyError is raised.
    """
    startTxt = '"sns_score":'
    endTxt = "}]});"
    try:
        res = Util.getPage(
            self._scoreUrl.format(qipu_ids, tvid), proxy=self._isProxy)
        resText = res.text

        markerPos = resText.find(startTxt)
        if markerPos == -1:
            # No score in the response; slicing from -1 would return junk.
            return 0
        valueStart = markerPos + len(startTxt)
        # Search for the terminator only after the marker.
        valueEnd = resText.find(endTxt, valueStart)
        if valueEnd == -1:
            return 0
        return resText[valueStart:valueEnd]
    except KeyError:
        return 0
def commitFiles(self, file, repoName, fileSize):
    '''
    Add a file to the local repository checkout and push it.

    Files larger than ``self._maxSize`` are cut into segments with the
    ``self._doSegment`` command; smaller files are copied as they are.

    file        file to commit, relative to TASK['fileDir']
    repoName    repository (directory under TASK['repoDir']) to commit into
    fileSize    size of the file in MB

    Returns the list of https URLs built for the committed file(s).

    NOTE(review): this changes the process working directory and never
    restores it.
    '''
    os.chdir(os.path.join(self._config.TASK['repoDir'], repoName))
    os.system('git pull origin gh-pages')
    fileList = []
    # Files above the per-file size limit are split
    if fileSize > self._maxSize:
        # NOTE(review): the command is passed as a single formatted string
        # without shell=True - confirm this works on the target platform.
        fileDuration = subprocess.check_output(
            self._getDuration.format(
                os.path.join(self._config.TASK['fileDir'], file)))
        fileDuration = fileDuration.decode('UTF-8')
        # Keep the text between 'duration=' (9 characters long) and the
        # following 'size='
        fileDuration = fileDuration[
            fileDuration.find('duration=', 0) + 9:fileDuration.
            find('size=', fileDuration.find('duration=', 0))]
        # Segment length chosen so that one segment is about _maxSize MB
        segmentTime = float(fileDuration) / float(fileSize) * self._maxSize
        Util.info('Files duration')
        # NOTE(review): Util.info receives four positional arguments here,
        # every other call in view passes one - confirm it accepts them.
        Util.info(segmentTime, float(fileDuration), float(fileSize), self._maxSize)
        # New base name for the segment files: newFileName_000.mp4, ...
        newFileName = Util.genRandName(11)
        os.system(
            self._doSegment.format(
                os.path.join(self._config.TASK['fileDir'], file),
                segmentTime, newFileName))
        # Upper bound for the number of segment files
        fileNum = math.ceil(float(fileDuration) / segmentTime)
        # Collect the segment files that actually exist
        print('filenummmmm', fileNum)
        for num in range(fileNum):
            # Relative path: resolved against the repository directory
            # entered by os.chdir above
            if True == os.path.isfile("%s_%03d.mp4" % (newFileName, num)):
                fileList.append("https://%s/%s/%s_%03d.mp4" %
                                (self._config.WAREHOUSE['host'], repoName,
                                 newFileName, num))
    else:
        # Small enough: copy the file into the repository unchanged
        shutil.copyfile(
            os.path.join(self._config.TASK['fileDir'], file),
            os.path.join(self._config.TASK['repoDir'], repoName,
                         os.path.basename(file)))
        fileList.append("https://{}/{}/{}".format(
            self._config.WAREHOUSE['host'], repoName,
            os.path.basename(file)))
    os.system(
        'git add *.mp4 && git add *.jpg && git add *.png && git add *.jpeg'
    )
    # NOTE(review): the file name is put into the shell command unquoted.
    os.system('git commit -m {}'.format(os.path.basename(file)))
    os.system('git push origin gh-pages')
    Util.info("文件添加至仓库成功")
    return fileList
def dlFile(self, link, rdlPath, rfileName, dlfileName=None):
    """Download one video with the configured you-get executable.

    Args:
        link: URL of the video.
        rdlPath: output directory (passed as ``-o``).
        rfileName: file name; used as the output name when ``dlfileName``
            is not given, so the method can also be called with three
            arguments.
        dlfileName: output file name (passed as ``-O``).

    Returns:
        The output file name on success, ``False`` when the command exits
        with a non-zero status.
    """
    if dlfileName is None:
        dlfileName = rfileName

    Util.info("Dl without proxy")
    command = [
        self._args['params']['youGet'], link, '-o', rdlPath, '-O', dlfileName
    ]
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as err:
        Util.error(err)
        # TODO: record the error in the database
        Util.info('Youget:影片未成功下载')
        return False
    return dlfileName
def newList(self, data):
    """Insert a batch of video documents.

    Args:
        data: list of video dicts. The required-field check covers
            ``setId``, ``name``, ``summary``, ``link`` and ``img``.

    Returns:
        The result of ``insert_many`` on success; ``False`` when ``data``
        is not a list or the required-field check does not return True.
    """
    if not isinstance(data, list):
        Util.error('Data must be a list')
        return False

    requireFields = ['setId', 'name', 'summary', 'link', 'img']
    checkResult = Util.checkRequire(data, requireFields)
    if checkResult == True:  # noqa: E712 - same equality test as before
        cleaned = Util.removeUnsafeFields(
            data, self.videoListFields.keys(), self.videoListFields)
        return self._db.insert_many(cleaned)

    Util.error('{} Require field {} not found'.format(
        'saveVideoList', checkResult))
    return False
def dlFile(self, link, rdlPath, dlfileName):
    """Download one video with the configured youtube-dl executable.

    First asks the tool for the file name it will produce
    (``--get-filename``), then runs the real download.

    Args:
        link: URL of the video.
        rdlPath: output directory.
        dlfileName: output template / file name.

    Returns:
        The file name printed by the tool (decoded and stripped) on
        success, ``False`` when either command exits with a non-zero status.
    """
    Util.info("Dl without proxy")
    # regular (no proxy) download
    try:
        probeCommand = [
            self._args['params']['youtubeDl'], link, '--get-filename', '-o',
            '{}'.format(dlfileName)
        ]
        filename = subprocess.check_output(probeCommand)

        downloadCommand = [
            self._args['params']['youtubeDl'], link, '-o',
            '{}/{}'.format(rdlPath, dlfileName), '-f',
            'bestvideo+bestaudio/best'
        ]
        subprocess.check_call(downloadCommand)
    except subprocess.CalledProcessError as err:
        Util.error(err)
        # TODO: record the error in the database
        Util.info('Youtubedl:影片未成功下载')
        return False
    else:
        return filename.decode('UTF-8').strip()
def newSet(self, data, platform):
    """Insert one video-set document.

    Args:
        data: dict describing the set. The required-field check covers
            ``title``, ``link``, ``summary``, ``img``, ``episode_over``,
            ``is_vip``, ``area`` and ``lang``.
        platform: platform value; stored as ``int(platform)``.

    Returns:
        The ``_id`` of the inserted document, or ``False`` when ``data`` is
        not a dict or the required-field check does not return True.
    """
    if not isinstance(data, dict):
        Util.error('Data must be a dict')
        return False

    requireFields = [
        'title', 'link', 'summary', 'img', 'episode_over', 'is_vip', 'area',
        'lang'
    ]
    requireCheckRe = Util.checkRequire(data, requireFields)
    if True != requireCheckRe:
        Util.error('{} Require field {} not found'.format(
            'saveVideoSet', requireCheckRe))
        return False

    data = Util.removeUnsafeFields(data, self.videoSetFields.keys(),
                                   self.videoSetFields)
    # which platform the set belongs to
    data['platform'] = int(platform)
    # insert_one replaces the deprecated Collection.insert; inserted_id is
    # the same _id that insert() used to return.
    return self._db.insert_one(data).inserted_id
def process(self, args):
    """Commit a downloaded file to the warehouse and record its URLs.

    Args:
        args: mapping with ``'file'`` (path relative to TASK['fileDir'])
            and ``'id'`` (id of the video the file belongs to).

    Returns:
        True when the file was committed and the play record written;
        False when an argument is missing, the file does not exist or the
        repository could not be created.
    """
    if 'file' not in args:
        Util.error('File can not found')
        return False
    if 'id' not in args:
        Util.error("Video id can not found")
        return False

    filePath = os.path.join(self._config.TASK['fileDir'], args['file'])
    if not os.path.isfile(filePath):
        # Same error convention as the argument checks above instead of an
        # exception from getsize().
        Util.error('File {} does not exist'.format(filePath))
        return False
    # Size in MB; commitFiles compares it with its per-file limit.
    sizeMb = os.path.getsize(filePath) / 1024 / 1024

    warehouseUid = self._config.WAREHOUSE['uid']
    settingInfo = self.getModel('Setting').getSetting(warehouseUid)
    # No settings document at all is treated like "no lastRepoId yet".
    if not settingInfo or 'lastRepoId' not in settingInfo:
        lastRepoId = "1"
    else:
        lastRepoId = str(settingInfo['lastRepoId'] + 1)

    # Create the repository when its local directory does not exist yet.
    repoDir = os.path.join(self._config.TASK['repoDir'], lastRepoId)
    if not os.path.exists(repoDir):
        if not self.createRepo(lastRepoId):
            # repository creation failed
            return False

    # Commit the file.
    fileList = self.commitFiles(args['file'], lastRepoId, sizeMb)
    Util.info('更新远程地址 {}'.format(fileList))
    # Store the remote URLs under the warehouse uid.
    self.getModel('VideoList').newPlay(args['id'], warehouseUid, fileList)
    return True
def setFreeSpace(self, uid, space):
    """Store the free-space value for one uid.

    Args:
        uid: matched as ``str(uid)`` against the ``uid`` field.
        space: value converted with the ``freeSpace`` field definition.

    Returns:
        The result of ``update_one``.
    """
    selector = {"uid": str(uid)}
    change = {
        "$set": {
            'freeSpace': Util.conv2(space, self.settingFields['freeSpace'])
        }
    }
    return self._db.update_one(selector, change)
def getVideoList(self, seterId, setLink):
    """Fetch every episode of a video set and store the new ones.

    The album id is cut out of the set page (the text between ``albumId:``
    and ``tvId:``); the episode list is then read page by page from
    ``self._moviceListUrl``.

    Args:
        seterId: id of the stored video set the episodes belong to.
        setLink: URL of the set page.

    Returns:
        False when the album id cannot be found in the page, otherwise
        True. When no new episode was collected the set is removed again.
    """
    albumHtml = Util.getPage(setLink, proxy=self._isProxy).text
    try:
        # 8 == len("albumId:")
        albumId = albumHtml[albumHtml.index("albumId:") + 8:
                            albumHtml.index("tvId:")]
    except ValueError:
        return False
    albumId = albumId.replace(',', '').replace("\"", "").strip()

    # All episodes collected from the JSON list.
    videoList = []
    page = 0
    while True:
        page = page + 1
        url = self._moviceListUrl.format(str(albumId), str(page))
        Util.info("Set video list : {}".format(url))
        r = Util.getPage(url, proxy=self._isProxy)
        # The payload is a JavaScript assignment; strip it to get JSON.
        r = json.loads(r.text.replace('var tvInfoJs=', ''))
        if not r or 'data' not in r:
            Util.info('Video is empty {}'.format(url))
            # Stop here: asking for the next page instead would never
            # terminate when no page carries data.
            break

        # Entries of data['vlist'] are read for: vn (name), desc, vurl,
        # vpic, timeLength (split into minutes and seconds below) and
        # publishTime.
        if 0 == r['data']['pn']:
            break

        # Update the total number of episodes once, from the first page.
        if page == 1:
            self.getModel('VideoSet').modifyEpisode(
                {'episode': r['data']['allNum'] if r['data']['allNum'] else 1},
                seterId)

        for data in r['data']['vlist']:
            # Entries without link or name cannot be stored (and the name
            # is needed for the lookup below).
            if 'vurl' not in data or 'vn' not in data:
                continue
            # Skip episodes that are already stored for this set.
            if self.getModel("VideoList").exists(data['vn'], seterId):
                Util.info('单集 {} 已存在于 setId {}'.format(data['vn'], seterId))
                continue
            videoList.append({
                'setId': seterId,
                'name': data['vn'],
                'summary': data['desc'],
                'link': data['vurl'],
                'img': data['vpic'],
                'duration': '%02d:%02d' % (data['timeLength'] // 60,
                                           data['timeLength'] % 60),
                'created_at': int(data['publishTime'])
            })

    if len(videoList) > 0:
        self.getModel("VideoList").newList(videoList)
        Util.info('成功保存单集 {} 部'.format(len(videoList)))
    else:
        Util.info('setId: {} 没有找到任何影片,删除影片集'.format(seterId))
        # No episode was collected: remove the set again.
        self.getModel("VideoSet").remove(seterId)
    return True
def dlFile(self, args=None):
    """Download one video, convert it and record it as playable.

    Args:
        args: optional mapping. ``videoId`` selects the video to download;
            without it one not-yet-downloaded resource is fetched for this
            device.

    Returns:
        False when there is nothing to download or no other downloader is
        left after a failed download; the result of ``getNewMatchine`` when
        the download failed and another downloader is tried; None after a
        successful download.
    """
    if args is None:
        args = {}

    if 'videoId' in args:
        # a specific video
        videoInfo = self.getModel('VideoList').getVideo(args['videoId'])
    else:
        # one video that has not been downloaded yet
        # TODO: revise platform / selection rule
        videoInfo = self.getModel('VideoSet').getUnDlRes(
            self.configList['uid'], 1)
    if not videoInfo:
        Util.info('该设备 {} 没有需要下载的资源'.format(self.configList['uid']))
        return False

    Util.info("Download:{} dlFile".format(self._taskName))
    Util.info("正在下载影片 {}, videoId: {} setId: {}".format(
        videoInfo['name'], videoInfo['_id'], videoInfo['setId']))

    # Folder named after month and day, below the configured directory.
    dlPath = time.strftime("%m%d", time.localtime())
    rdlPath = os.path.join(self.configList['params']['dir'], dlPath)
    os.makedirs(rdlPath, exist_ok=True)

    # Files get a new random name of 11 characters.
    fileName = Util.genRandName(11)

    # Platforms listed in proxyIds are downloaded through the proxy.
    doDl = 'dlFile'
    if int(videoInfo['platform']) in self.configList['proxyIds']:
        doDl = 'dlFileWithProxy'
    Util.info("Download to {}".format(os.path.join(rdlPath, fileName)))

    dlFileName = getattr(self._taskObj, doDl)(videoInfo['link'], rdlPath,
                                              fileName)

    # After the download, first make sure the file really exists. A failed
    # download returns no usable name, so there is nothing to look for.
    videoExists = False
    if dlFileName:
        for ext in self._videoExt:
            candidate = "{}.{}".format(dlFileName, ext)
            if os.path.exists(os.path.join(rdlPath, candidate)):
                dlFileName = candidate
                videoExists = True
                break

    if not videoExists:
        # Try the download again with another downloader. The current one
        # is removed from the list in place, as before.
        # NOTE(review): the list is shared state; presumably the removal is
        # what stops downloaders from switching back and forth - confirm.
        tmpMatchine = self._dlMatchines
        current = self._taskName.lower()
        if current in tmpMatchine:
            tmpMatchine.remove(current)
        if not tmpMatchine:
            Util.info('Download:{} no other downloader left'.format(
                self._taskName))
            return False
        return self.getNewMatchine(tmpMatchine[0].capitalize())

    # Convert to a format usable on the web.
    webVideo = self.getService('Background.Convert').toMp4({
        'dlPath': dlPath,
        'inputFile': dlFileName
    })
    # Record the finished download.
    self.getModel('VideoList').newPlay(videoInfo['_id'],
                                       self.configList['uid'], webVideo)
    # Total number of downloaded videos of the set + 1.
    self.getModel("VideoSet").setCanPlayNum(videoInfo['setId'],
                                            self.configList['uid'])
    # TODO: download to the warehouse (placeholder, nothing happens yet)
    if self.configList['uid'] == self.configList['warehouse']['uid']:
        pass

    Util.info("Download:{} dlFile end".format(self._taskName))
    self.getFreeDisk()