Exemplo n.º 1
0
    async def old_version_download_callback(self, response):
        """Record a downloaded FLV fragment and merge once all fragments exist.

        Reads ``file_path`` and ``metadata`` from ``response.request``. When the
        path ends with ``video.flv`` it is appended to
        ``self.old_version_records[metadata["id"]]["fragments"]``. As soon as
        the number of recorded fragments equals ``len(metadata["video_info"])``
        every fragment is remuxed to MPEG-TS with ffmpeg, the TS files are
        concatenated into ``<storage_dir>/<video_name>.mp4`` and the
        intermediate ``.ts`` and ``.flv`` files are removed.

        ffmpeg is invoked with argument lists instead of shell strings: the
        output name derives from ``metadata["video_name"]``, so spaces or shell
        metacharacters in a path must not be interpreted by a shell.
        """
        # Local import keeps this block self-contained; the file's top-level
        # import section is not visible from here.
        import subprocess

        log.debug("Old version download callback")
        file_path = response.request.file_path
        metadata = response.request.metadata
        video_id = metadata["id"]

        if file_path.endswith("video.flv"):
            # NOTE(review): only the first character of the file name is used
            # as the fragment index, which assumes single-digit indices.
            order = file_path.split("/")[-1][0]
            log.info("视频片段[{}]下载完毕: AV<{}>".format(order, video_id))
            self.old_version_records[video_id]["fragments"].append(file_path)

        fragments = self.old_version_records[video_id]["fragments"]
        if len(fragments) != len(metadata["video_info"]):
            # Still waiting for other fragments of this video.
            return

        fragments_path = sorted(
            fragments, key=lambda path: int(path.split("/")[-1][0]))

        ts_path_list = []
        for fragment_path in fragments_path:
            # "xxx.flv" -> "xxx.ts"
            ts_path = fragment_path[:-3] + "ts"
            # Like the previous os.system call, this blocks until ffmpeg exits
            # and does not inspect its exit status.
            subprocess.run([
                "ffmpeg", "-i", fragment_path, "-y", "-loglevel", "warning",
                "-c", "copy", "-bsf:v", "h264_mp4toannexb", "-f", "mpegts",
                ts_path,
            ])
            ts_path_list.append(ts_path)

        output_path = os.path.join(metadata["storage_dir"],
                                   "{}.mp4".format(metadata["video_name"]))
        subprocess.run([
            "ffmpeg", "-i", "concat:" + "|".join(ts_path_list), "-y",
            "-loglevel", "warning", "-c", "copy", "-bsf:a", "aac_adtstoasc",
            "-movflags", "+faststart", output_path,
        ])

        for temp in ts_path_list + fragments_path:
            os.remove(temp)
        log.info("视频分片合并完毕: AV<{}>".format(video_id))
Exemplo n.º 2
0
    async def handle_request(self, request):
        """Execute one request, run its callback and enqueue follow-ups.

        ``FileRequest`` instances go through ``self.download``; everything else
        goes through ``self.fetch``. The callback result is either an async
        generator (each yielded ``Request`` is queued) or an awaitable (its
        result is queued when it is a ``Request``). Any exception is logged and
        counted in ``failed_request_count``; otherwise
        ``finished_request_count`` is incremented.
        """
        log.debug("Handle request {}".format(request))
        try:
            perform = (self.download
                       if isinstance(request, FileRequest) else self.fetch)
            response = await perform(request)
            # NOTE: the callback must return an awaitable (or an async
            # generator, which is handled separately below).
            outcome = request.callback(response)

            if not isinstance(outcome, typing.AsyncGenerator):
                follow_up = await outcome
                if isinstance(follow_up, Request):
                    await self.add_request(follow_up)
            else:
                async for follow_up in outcome:
                    if isinstance(follow_up, Request):
                        await self.add_request(follow_up)
        except Exception as error:
            log.exception(error)
            log.error("Catch exception: {} {}".format(error, request))
            self.failed_request_count += 1
        else:
            log.debug("Fetched response from {}".format(request))
            self.finished_request_count += 1
Exemplo n.º 3
0
    def _register_all_service():
        """Register ``DownloadingService`` with ``Mapper``.

        New ``CrawlerManager`` and ``DownloaderManager`` instances are passed
        along with the service class. The imports are function-local
        (presumably to avoid import cycles; confirm before hoisting them).
        """
        from media_downloader.application.manager import (
            CrawlerManager,
            DownloaderManager,
        )
        from media_downloader.application.service import DownloadingService

        crawler_manager = CrawlerManager()
        downloader_manager = DownloaderManager()
        Mapper.register(DownloadingService, crawler_manager,
                        downloader_manager)
        log.debug("Inversion Of Controller Registered DownloadingService")
Exemplo n.º 4
0
    def _register_all_manager():
        """Register ``CrawlerManager`` then ``DownloaderManager`` with ``Mapper``.

        A debug message is logged after each registration.
        """
        from media_downloader.application.manager import (
            CrawlerManager,
            DownloaderManager,
        )

        registrations = (
            (CrawlerManager,
             "Inversion Of Controller Registered CrawlerManager"),
            (DownloaderManager,
             "Inversion Of Controller Registered DownloaderManager"),
        )
        for manager_cls, message in registrations:
            Mapper.register(manager_cls)
            log.debug(message)
Exemplo n.º 5
0
 async def fetch(self, request: Request):
     """Send the HTTP request described by ``request`` and wrap the outcome.

     When the HTTP client raises ``HTTPClientError``, the response attached
     to that error is wrapped instead of letting the error propagate.
     """
     log.debug("Send request {}".format(request))
     try:
         raw_response = await self.http_client.fetch(request)
     except HTTPClientError as error:
         raw_response = error.response
     return Response(raw_response, request)
Exemplo n.º 6
0
 async def status_print_loop(self):
     """Log running status about once a minute until ``self.QUIT`` is True.

     Each pass computes whole seconds elapsed since ``self.start_time``,
     logs ``get_running_messages()`` when that is a positive multiple of 60,
     then sleeps one second. Once ``QUIT`` is True a final status message is
     logged and the loop ends.
     """
     while True:
         elapsed = int(time.time()) - self.start_time
         on_minute_mark = elapsed > 0 and elapsed % 60 == 0
         if on_minute_mark:
             log.info(self.get_running_messages())
         await asyncio.sleep(1)
         if self.QUIT is not True:
             continue
         # Emit one last snapshot before leaving.
         log.info(self.get_running_messages())
         break
     log.debug("Quit status print loop")
Exemplo n.º 7
0
 async def run(self, downloader_name=None):
     """Consume the request queue until every pushed request is settled.

     While ``self.is_running`` is true, one request is taken from
     ``self.request_queue`` without blocking and handled. An empty queue
     triggers a sleep of the polling interval instead. After handling a
     request, the loop is stopped when ``pushed_request_count`` equals the
     sum of finished and failed counts. Every pass ends with a sleep of
     ``self.delay``.

     ``downloader_name`` is only used in the log messages.
     """
     downloader_name = downloader_name or ""
     log.debug("Run downloader {}".format(downloader_name))
     while self.is_running:
         try:
             pending = self.request_queue.get_nowait()
         except QueueEmpty:
             await asyncio.sleep(self.__polling_interval)
         else:
             await self.handle_request(pending)
             settled = (self.finished_request_count
                        + self.failed_request_count)
             if self.pushed_request_count == settled:
                 self.is_running = False
         await asyncio.sleep(self.delay)
     log.debug("Quit downloader {}".format(downloader_name))
Exemplo n.º 8
0
 async def task_scheduling_loop(self):
     """Turn queued ``(crawler_name, crawler_params)`` entries into scheduled work.

     Each pass takes one entry from ``self.schedule_queue`` without blocking.
     On an empty queue it sleeps for one second; otherwise it creates the
     crawler, obtains a downloader for it and awaits
     ``downloader_manager.schedule``. The loop ends once ``self.QUIT`` is
     truthy and the queue is empty.

     The exit check runs after the ``try`` statement rather than in a
     ``finally`` clause: a ``break`` inside ``finally`` would silently
     discard any exception (including task cancellation) that was
     propagating at that moment.
     """
     while True:
         try:
             crawler_name, crawler_params = self.schedule_queue.get_nowait()
         except QueueEmpty:
             await asyncio.sleep(1)
         else:
             crawler = self.crawler_manager.create_crawler(
                 crawler_name, crawler_params)
             downloader = self.downloader_manager.get_or_create_downloader(
                 crawler)
             await self.downloader_manager.schedule(crawler, downloader)
         if self.QUIT and self.schedule_queue.qsize() == 0:
             break
     log.debug("Quit task scheduling loop")
Exemplo n.º 9
0
    async def download(self, request: FileRequest):
        """Stream ``request.url`` into ``request.file_path`` and verify its size.

        The body is written chunk by chunk through a streaming callback while
        the number of received bytes is counted. The target file is opened in
        a ``with`` block so the handle is closed even when the fetch raises.

        Returns:
            A ``FileResponse`` wrapping the client response, carrying the
            byte count and the file path.

        Raises:
            Exception: when the received byte count differs from the
                ``Content-Length`` response header.
            Whatever ``self.http_client.fetch`` raises is propagated as is.
            NOTE(review): a response without a ``Content-Length`` header will
            fail at the header lookup below; confirm that is intended.
        """
        log.debug("Downloading '{}'".format(request.file_path))
        actual_length = 0

        with open(request.file_path, "wb") as output:

            def write_data(data):
                # Invoked by the HTTP client for every received chunk.
                nonlocal actual_length
                actual_length += len(data)
                output.write(data)

            tornado_request = Request(
                url=request.url,
                callback=None,
                headers=request.headers,
                streaming_callback=write_data,
                # Both timeouts are 10 minutes (values are in seconds,
                # presumably; confirm against the client in use).
                request_timeout=60*10,
                connect_timeout=60*10
            )
            tornado_response = await self.http_client.fetch(tornado_request)

        if actual_length != int(tornado_response.headers["Content-Length"]):
            raise Exception("Error download '{}': content-length error".format(request.file_path))
        log.debug("Success download '{}'".format(request.file_path))
        return FileResponse(tornado_response, request, file_size=actual_length, file_path=request.file_path)
Exemplo n.º 10
0
 def callback(_):
     """Log that the download mission is done and set ``self.QUIT`` to True.

     The single positional argument is ignored. ``self`` comes from the
     enclosing scope, which is not visible in this snippet.
     """
     log.debug("Downloading Mission Done!")
     # Presumably polled by the long-running loops to shut down; confirm
     # against the code that reads QUIT.
     self.QUIT = True
Exemplo n.º 11
0
    async def detail_callback(self, response):
        """Parse a detail response and yield the follow-up requests.

        This async generator handles two kinds of responses, told apart by the
        ``"DIRECTLY_PARSING"`` key in ``response.request.metadata``:

        * Key present: the body is JSON; it is decoded and handed to
          ``self.content_parser``, whose requests are re-yielded.
        * Key absent: the body is a page from which the ``videoData`` JSON
          object is extracted with a regex. A storage directory named
          ``av<av_number>-<title>`` is created under
          ``metadata["base_storage_dir"]`` and one playurl API ``Request`` is
          yielded per video part, each calling back into this method with the
          marker key set. Nothing is yielded (a warning is logged) when
          ``videoData`` cannot be found.

        Every yielded ``Request`` receives its own deep copy of the metadata
        so later mutations cannot leak between requests.
        """
        media_headers = deepcopy(self.media_headers)
        # Media download requests use the detail page URL as their Referer.
        media_headers["Referer"] = response.request.url

        metadata = response.request.metadata

        if "DIRECTLY_PARSING" in metadata:
            content = json.loads(response.body)
            log.info("详情页解析成功:[{}]".format(response.request.url))
            log.debug(metadata)
            async for request in self.content_parser(content, metadata,
                                                     media_headers):
                yield request
            return

        _videoData = re.search(r"\"videoData\":(.*?),\"upData\"",
                               response.body.decode())
        if _videoData is None:
            # Nothing matched: log it and stop without yielding anything.
            log.warning("详情页解析失败: <{}>[{}]".format(response.code,
                                                   response.request.url))
            return

        videoData = json.loads(_videoData.group(1))
        # "/" would split the path and spaces are stripped from names.
        safe_title = videoData["title"].replace("/", "-").replace(" ", "")

        storage_dir = os.path.join(
            metadata["base_storage_dir"],
            "av{}-{}".format(metadata["av_number"], safe_title))
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(storage_dir, exist_ok=True)
        metadata["storage_dir"] = storage_dir

        content_base_url = "https://api.bilibili.com/x/player/playurl?avid={aid}&cid={cid}&qn=0&type=&otype=json&fnver=0&fnval=16"
        # Marker value; only the presence of the key is checked above.
        directly_parsing = "标示该请求是通过提取 videoData 中的 aid 和 cid 来发起单独的请求解析到 JSON Content 的"

        pages = videoData["pages"]
        if len(pages) == 1:
            # Single-part video: use the top-level cid and the video title.
            targets = [(videoData["cid"], safe_title)]
        else:
            # Multi-part video: one request per part, named "<page>-<part>".
            targets = [
                (page["cid"], "{}-{}".format(
                    page["page"],
                    page["part"].replace("/", "-").replace(" ", "")))
                for page in pages
            ]

        for cid, video_name in targets:
            metadata["DIRECTLY_PARSING"] = directly_parsing
            metadata["video_name"] = video_name
            metadata["id"] = "{}-{}".format(videoData["aid"], cid)
            yield Request(
                content_base_url.format(aid=videoData["aid"], cid=cid),
                headers=media_headers,
                callback=self.detail_callback,
                metadata=deepcopy(metadata),
                # proxy_host=PROXY_HOST,
                # proxy_port=PROXY_PORT,
                # proxy_username=PROXY_USERNAME,
                # proxy_password=PROXY_PASSWORD,
            )