async def old_version_download_callback(self, response):
    log.debug("Old version download callback")
    file_path = response.request.file_path
    metadata = response.request.metadata
    if file_path.endswith("video.flv"):
        order = file_path.split("/")[-1][0]
        log.info("Video fragment [{}] downloaded: AV<{}>".format(order, metadata["id"]))
        self.old_version_records[metadata["id"]]["fragments"].append(file_path)
        if len(self.old_version_records[metadata["id"]]["fragments"]) == len(metadata["video_info"]):
            fragments_path = self.old_version_records[metadata["id"]]["fragments"]
            fragments_path = sorted(fragments_path,
                                    key=lambda x: int(x.split("/")[-1][0]))
            ts_path_list = []
            for fragment_path in fragments_path:
                # Remux each FLV fragment into an MPEG-TS file so the pieces can
                # be joined with ffmpeg's concat protocol.
                ts_path = fragment_path[:-3] + "ts"
                os.system(
                    "ffmpeg -i {flv_path} -y -loglevel warning -c copy -bsf:v h264_mp4toannexb -f mpegts {ts_path}"
                    .format(flv_path=fragment_path, ts_path=ts_path))
                ts_path_list.append(ts_path)
            output_path = os.path.join(metadata["storage_dir"],
                                       "{}.mp4".format(metadata["video_name"]))
            # Concatenate the TS fragments and repackage them as a single MP4.
            os.system(
                'ffmpeg -i "concat:{concat_param}" -y -loglevel warning -c copy -bsf:a aac_adtstoasc -movflags +faststart {output_path}'
                .format(concat_param="|".join(ts_path_list), output_path=output_path))
            # Remove the intermediate TS files and the original FLV fragments.
            for temp in ts_path_list + fragments_path:
                os.remove(temp)
            log.info("Video fragments merged: AV<{}>".format(metadata["id"]))
async def handle_request(self, request):
    log.debug("Handle request {}".format(request))
    try:
        if isinstance(request, FileRequest):
            response = await self.download(request)
        else:
            response = await self.fetch(request)
        # Note: the callback must return either an awaitable or an async generator.
        response_processed_result = request.callback(response)
        if isinstance(response_processed_result, typing.AsyncGenerator):
            async for result in response_processed_result:
                if isinstance(result, Request):
                    await self.add_request(result)
        else:
            result = await response_processed_result
            if isinstance(result, Request):
                await self.add_request(result)
    except Exception as e:
        log.exception(e)
        log.error("Catch exception: {} {}".format(e, request))
        self.failed_request_count += 1
    else:
        log.debug("Fetched response from {}".format(request))
        self.finished_request_count += 1
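# A minimal sketch of the two callback shapes handle_request() accepts: a plain
# coroutine that returns a follow-up Request (or None), and an async generator
# that yields any number of follow-up Requests. The function names and URLs
# below are illustrative assumptions, not part of the project.
async def example_plain_callback(response):
    log.debug("Handled response with code {}".format(response.code))
    return None

async def example_generator_callback(response):
    for url in ("https://example.com/a", "https://example.com/b"):
        yield Request(url, callback=example_plain_callback)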
def _register_all_service():
    from media_downloader.application.manager import CrawlerManager
    from media_downloader.application.manager import DownloaderManager
    from media_downloader.application.service import DownloadingService
    Mapper.register(DownloadingService, CrawlerManager(), DownloaderManager())
    log.debug("Inversion Of Control Registered DownloadingService")
def _register_all_manager():
    from media_downloader.application.manager import CrawlerManager
    from media_downloader.application.manager import DownloaderManager
    Mapper.register(CrawlerManager)
    log.debug("Inversion Of Control Registered CrawlerManager")
    Mapper.register(DownloaderManager)
    log.debug("Inversion Of Control Registered DownloaderManager")
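# Sketch of how the two registration helpers above could be invoked once at
# application startup, so the IoC Mapper knows about the managers and the
# DownloadingService before anything resolves them. The bootstrap() name is an
# assumption; only _register_all_manager() and _register_all_service() come
# from the code above.
def bootstrap():
    _register_all_manager()
    _register_all_service()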
async def fetch(self, request: Request):
    '''Send an HTTP request described by the given request object.'''
    log.debug("Send request {}".format(request))
    try:
        response = await self.http_client.fetch(request)
    except HTTPClientError as e:
        return Response(e.response, request)
    else:
        return Response(response, request)
async def status_print_loop(self):
    while True:
        running_time = int(time.time()) - self.start_time
        if running_time > 0 and running_time % 60 == 0:
            log.info(self.get_running_messages())
        await asyncio.sleep(1)
        if self.QUIT is True:
            log.info(self.get_running_messages())
            break
    log.debug("Quit status print loop")
async def run(self, downloader_name=None):
    downloader_name = downloader_name if downloader_name else ""
    log.debug("Run downloader {}".format(downloader_name))
    while self.is_running:
        try:
            request = self.request_queue.get_nowait()
        except QueueEmpty:
            await asyncio.sleep(self.__polling_interval)
        else:
            await self.handle_request(request)
            if self.pushed_request_count == self.finished_request_count + self.failed_request_count:
                self.is_running = False
            await asyncio.sleep(self.delay)
    log.debug("Quit downloader {}".format(downloader_name))
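# Sketch of driving several run() workers concurrently against one downloader
# instance. How many workers the real DownloaderManager spawns, and whether it
# uses asyncio.gather at all, are assumptions made for illustration.
async def run_workers(downloader, worker_count=4):
    await asyncio.gather(
        *(downloader.run("worker-{}".format(index)) for index in range(worker_count))
    )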
async def task_scheduling_loop(self):
    while True:
        try:
            crawler_name, crawler_params = self.schedule_queue.get_nowait()
        except QueueEmpty:
            await asyncio.sleep(1)
        else:
            crawler = self.crawler_manager.create_crawler(crawler_name, crawler_params)
            downloader = self.downloader_manager.get_or_create_downloader(crawler)
            await self.downloader_manager.schedule(crawler, downloader)
        finally:
            if self.QUIT and self.schedule_queue.qsize() == 0:
                break
    log.debug("Quit task scheduling loop")
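# Sketch of how a crawl task might be queued for task_scheduling_loop(): items
# on schedule_queue are (crawler_name, crawler_params) tuples. The method name,
# crawler name, and params below are placeholders, not identifiers from the
# project.
async def submit_task(self, crawler_name="bilibili_detail", crawler_params=None):
    await self.schedule_queue.put((crawler_name, crawler_params or {}))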
async def download(self, request: FileRequest):
    log.debug("Downloading '{}'".format(request.file_path))
    actual_length = 0
    file = open(request.file_path, "wb")

    def write_data(data):
        # Track the downloaded size and stream each chunk straight to disk.
        nonlocal actual_length
        actual_length += len(data)
        file.write(data)

    tornado_request = Request(
        url=request.url,
        callback=None,
        headers=request.headers,
        streaming_callback=write_data,
        request_timeout=60 * 10,
        connect_timeout=60 * 10,
    )
    tornado_response = await self.http_client.fetch(tornado_request)
    file.close()
    # Verify completeness by comparing the received size with the
    # Content-Length announced by the server.
    if actual_length == int(tornado_response.headers["Content-Length"]):
        log.debug("Successfully downloaded '{}'".format(request.file_path))
    else:
        raise Exception("Failed to download '{}': content-length mismatch".format(request.file_path))
    return FileResponse(tornado_response, request,
                        file_size=actual_length,
                        file_path=request.file_path)
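# Hypothetical usage of download(): build a FileRequest that points at a local
# target path and await the helper. The keyword arguments mirror the attributes
# read above (url, file_path, headers, callback), but the exact FileRequest
# signature and the URL/path values are assumptions.
async def save_file(self):
    request = FileRequest(
        url="https://example.com/cover.jpg",
        file_path="/tmp/cover.jpg",
        headers={"Referer": "https://example.com/"},
        callback=None,
    )
    response = await self.download(request)
    log.info("Saved {} bytes to {}".format(response.file_size, response.file_path))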
def callback(_):
    log.debug("Downloading Mission Done!")
    self.QUIT = True
async def detail_callback(self, response):
    media_headers = deepcopy(self.media_headers)
    # Set the Referer header for subsequent media download requests.
    media_headers["Referer"] = response.request.url
    metadata = response.request.metadata
    if "DIRECTLY_PARSING" in metadata:
        content = json.loads(response.body)
        log.info("Detail page parsed successfully: [{}]".format(response.request.url))
        log.debug(metadata)
        async for request in self.content_parser(content, metadata, media_headers):
            yield request
    else:
        # _content = re.search(r"<script>window.__playinfo__=(.*?)</script>", response.body.decode())
        # if _content is None:
        _videoData = re.search(r"\"videoData\":(.*?),\"upData\"", response.body.decode())
        if _videoData is None:
            # Nothing matched: log a warning and return without yielding anything.
            log.warning("Detail page parsing failed: <{}>[{}]".format(response.code, response.request.url))
        else:
            videoData = json.loads(_videoData.group(1))
            storage_dir = os.path.join(
                metadata["base_storage_dir"],
                "av{}-{}".format(
                    metadata["av_number"],
                    videoData["title"].replace("/", "-").replace(" ", "")))
            if not os.path.exists(storage_dir):
                os.mkdir(storage_dir)
            metadata["storage_dir"] = storage_dir
            content_base_url = "https://api.bilibili.com/x/player/playurl?avid={aid}&cid={cid}&qn=0&type=&otype=json&fnver=0&fnval=16"
            pages = videoData["pages"]
            if len(pages) == 1:
                url = content_base_url.format(aid=videoData["aid"], cid=videoData["cid"])
                metadata["DIRECTLY_PARSING"] = ("Marks that this request fetches the JSON content directly, "
                                                "using the aid and cid extracted from videoData")
                metadata["video_name"] = videoData["title"].replace("/", "-").replace(" ", "")
                metadata["id"] = "{}-{}".format(videoData["aid"], videoData["cid"])
                yield Request(
                    url,
                    headers=media_headers,
                    callback=self.detail_callback,
                    metadata=metadata,
                    # proxy_host=PROXY_HOST,
                    # proxy_port=PROXY_PORT,
                    # proxy_username=PROXY_USERNAME,
                    # proxy_password=PROXY_PASSWORD,
                )
            else:
                for page in pages:
                    url = content_base_url.format(aid=videoData["aid"], cid=page["cid"])
                    metadata["DIRECTLY_PARSING"] = ("Marks that this request fetches the JSON content directly, "
                                                    "using the aid and cid extracted from videoData")
                    metadata["video_name"] = "{}-{}".format(
                        page["page"], page["part"].replace("/", "-").replace(" ", ""))
                    metadata["id"] = "{}-{}".format(videoData["aid"], page["cid"])
                    yield Request(
                        url,
                        headers=media_headers,
                        callback=self.detail_callback,
                        metadata=deepcopy(metadata),
                        # proxy_host=PROXY_HOST,
                        # proxy_port=PROXY_PORT,
                        # proxy_username=PROXY_USERNAME,
                        # proxy_password=PROXY_PASSWORD,
                    )
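# Sketch of a seed request that detail_callback() above can consume on its
# first pass (no "DIRECTLY_PARSING" key in the metadata yet). The av number,
# URL, method name, and the base_storage_dir attribute are placeholders and
# assumptions; only the metadata keys read by detail_callback are grounded.
async def example_seed(self):
    yield Request(
        "https://www.bilibili.com/video/av170001",
        headers=self.media_headers,
        callback=self.detail_callback,
        metadata={
            "base_storage_dir": self.base_storage_dir,
            "av_number": 170001,
        },
    )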