def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
    """
    Return the YouTube info for `self.url`, using the JSON cache when possible.

    The info is first looked up in the JSON file at `self.cache_path` (only
    when `use_cache` is true and the file exists). If nothing usable is
    cached, the info is requested through `YouTubeResource` and, when
    non-empty, written to `self.cache_path`.

    Args:
        use_proxy (bool): forwarded to `YouTubeResource` as `useproxy`.
        use_cache (bool): when False the cache file is not read; it is still
            (re)written after a successful fetch.
        options: forwarded unchanged to `YouTubeResource.get_resource_info`.

    Returns:
        The info loaded from the cache or returned by `get_resource_info`,
        or None when the resource could not be created or the info could
        not be retrieved.
    """
    label = str(self)
    youtube_info = None

    # 1. Try to get from cache if allowed:
    if use_cache and os.path.exists(self.cache_path):
        LOGGER.info("==> [%s] Retrieving cached information...", label)
        try:
            with open(self.cache_path, encoding="utf-8") as cache_file:
                youtube_info = json.load(cache_file)
        except (OSError, ValueError) as e:
            # An unreadable or corrupt cache file (e.g. left behind by an
            # interrupted write) must not be fatal: fall through and refetch.
            LOGGER.warning("==> [%s] Ignoring unreadable cache file %s: %s",
                           label, self.cache_path, e)
            youtube_info = None

    # 2. Fetch info from youtube_dl
    if not youtube_info:
        LOGGER.info("==> [%s] Requesting info from youtube...", label)
        os.makedirs(self.cache_dir, exist_ok=True)
        try:
            youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
        except youtube_dl.utils.ExtractorError as e:
            # Every extractor failure ends here; without the `else` branch a
            # non-"unavailable" error would leave `youtube_resource` unbound.
            if "unavailable" in str(e):
                LOGGER.error("==> [%s] Resource unavailable for URL: %s",
                             label, self.url)
            else:
                LOGGER.error("==> [%s] Failed to create resource for URL %s: %s",
                             label, self.url, e)
            return None

        if youtube_resource:
            try:
                # Save YouTube info to JSON cache file
                youtube_info = youtube_resource.get_resource_info(options)
                if youtube_info:
                    # ensure_ascii=False emits raw non-ASCII characters, so
                    # the file encoding has to be pinned explicitly.
                    with open(self.cache_path, 'w', encoding="utf-8") as cache_file:
                        json.dump(youtube_info,
                                  cache_file,
                                  indent=4,
                                  ensure_ascii=False,
                                  sort_keys=True)
                else:
                    LOGGER.error("==> [%s] Failed to extract YouTube info", label)
            except Exception as e:
                LOGGER.error("==> [%s] Failed to get YouTube info: %s", label, e)
                return None

    return youtube_info
def download_info(self, use_cache=True):
    """
    Obtain the video info for `self.url` and populate this object from it.

    The YouTube id is extracted from `self.url` with `YOUTUBE_ID_REGEX`. The
    info is read from `<YOUTUBE_CACHE_DIR>/<youtube_id>.json` when `use_cache`
    is true and that file holds usable data; otherwise it is requested through
    `YouTubeResource` and written to that JSON file.

    On success `self.uid`, `self.title` and `self.description` are set, and
    either `self.license` or `self.license_common` is set depending on the
    license text.

    Args:
        use_cache (bool): when False the cached JSON file is not read.

    Returns:
        bool: True when the info was obtained and the attributes were set,
        False otherwise.
    """
    match = YOUTUBE_ID_REGEX.match(self.url)
    if not match:
        LOGGER.error('==> URL %s does not match YOUTUBE_ID_REGEX', self.url)
        return False
    youtube_id = match.group('youtube_id')

    os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)
    vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id + '.json')

    # First try to get from cache:
    vinfo = None
    if use_cache and os.path.exists(vinfo_json_path):
        try:
            with open(vinfo_json_path, encoding="utf-8") as cache_file:
                vinfo = json.load(cache_file)
            LOGGER.info("Retrieving cached video information...")
        except (OSError, ValueError) as e:
            # Unreadable or corrupt cache: ignore it and download again.
            LOGGER.warning("Ignoring unreadable cache file %s: %s",
                           vinfo_json_path, e)
            vinfo = None

    # else get using youtube_dl:
    if not vinfo:
        LOGGER.info("Downloading %s from youtube...", self.url)
        try:
            video = YouTubeResource(self.url)
        except youtube_dl.utils.ExtractorError as e:
            # Return on every extractor failure so `video` is never used
            # while unbound.
            if "unavailable" in str(e):
                LOGGER.error("Video not found at URL: %s", self.url)
            else:
                LOGGER.error("Failed to create resource for URL %s: %s",
                             self.url, e)
            return False

        if not video:
            return False

        try:
            vinfo = video.get_resource_info()
            # ensure_ascii=False emits raw non-ASCII characters, so the file
            # encoding has to be pinned explicitly.
            with open(vinfo_json_path, 'w', encoding="utf-8") as cache_file:
                json.dump(vinfo,
                          cache_file,
                          indent=4,
                          ensure_ascii=False,
                          sort_keys=True)
        except Exception as e:
            LOGGER.error("Failed to get video info: %s", e)
            return False

        if not vinfo:
            LOGGER.error("No video info returned for URL: %s", self.url)
            return False
        # Deliberately fall through instead of returning here, so a fresh
        # download populates the same attributes as a cache hit.

    self.uid = vinfo['id']  # video must have id because required to set youtube_id later
    self.title = vinfo.get('title', '')
    self.description = vinfo.get('description', '')

    # The 'license' key may be absent, so read it with .get().
    license_text = vinfo.get('license')
    if not license_text:
        self.license = "Licensed not available"
    elif "Creative Commons" in license_text:
        self.license_common = True
    else:
        self.license = license_text

    return True
def download_info(self):
    """
    Obtain the video info for `self.url` and populate this object from it.

    The YouTube id is extracted from `self.url` with `YOUTUBE_ID_REGEX`. The
    info is read from `<YOUTUBE_CACHE_DIR>/<youtube_id>.json` when that file
    exists; otherwise it is requested through `YouTubeResource` and written
    to that JSON file. A cached JSON `null` marks an unavailable video and
    makes the method return False without contacting YouTube again.

    On success `self.uid`, `self.title` and `self.description` are set, and
    either `self.license` or `self.license_common` is set depending on the
    license text.

    Returns:
        bool: True when the info was obtained and the attributes were set,
        False otherwise.
    """
    match = YOUTUBE_ID_REGEX.match(self.url)
    if not match:
        print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
        return False
    youtube_id = match.group('youtube_id')

    os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)
    vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id + '.json')

    # First try to get from cache:
    vinfo = None
    if os.path.exists(vinfo_json_path):
        cache_readable = True
        try:
            with open(vinfo_json_path, encoding="utf-8") as cache_file:
                vinfo = json.load(cache_file)
        except (OSError, ValueError) as e:
            # Unreadable or corrupt cache: ignore it and download again.
            print("Ignoring unreadable cache file {}: {}".format(vinfo_json_path, e))
            cache_readable = False
        if cache_readable:
            if not vinfo:
                # the json data for "Video unavailable" is `null` so can skip them
                return False
            print("Using cached video info for youtube_id", youtube_id)

    # else get using YouTubeResource
    if not vinfo:
        print("Downloading {} from youtube...".format(self.url))
        try:
            video = YouTubeResource(self.url)
        except youtube_dl.utils.ExtractorError as e:
            # Return on every extractor failure so `video` is never used
            # while unbound.
            if "unavailable" in str(e):
                print("Video not found at URL: {}".format(self.url))
            else:
                print(e)
            return False

        if not video:
            return False

        try:
            vinfo = video.get_resource_info()
            # Cache whatever was returned (including `null`, which marks the
            # video as unavailable for later runs). ensure_ascii=False emits
            # raw non-ASCII characters, so the encoding is pinned explicitly.
            with open(vinfo_json_path, 'w', encoding="utf-8") as cache_file:
                json.dump(vinfo,
                          cache_file,
                          indent=4,
                          ensure_ascii=False,
                          sort_keys=True)
        except Exception as e:
            print(e)
            return False

        if not vinfo:
            # Same outcome as a cached `null`: nothing to populate from.
            return False

    self.uid = vinfo['id']  # video must have id because required to set youtube_id later
    self.title = vinfo.get('title', '')
    self.description = vinfo.get('description', '')

    # The 'license' key may be absent, so read it with .get().
    license_text = vinfo.get('license')
    if not license_text:
        self.license = "Licensed not available"
    elif "Creative Commons" in license_text:
        self.license_common = True
    else:
        self.license = license_text

    return True
def get_playlist_info(self):
    """
    Get playlist info from either local json cache or URL.

    The cache file `self.playlist_info_json_path` is read when
    `self.use_cache` is true and the file exists. Otherwise the info is
    requested through `YouTubeResource` for the playlist URL built from
    `self.playlist_id`, entries under 'children' with a repeated 'id' are
    dropped (first occurrence kept, order preserved), and the result is
    written to the cache file.

    Returns:
        The playlist info from the cache or from the fresh request, or None
        when fetching, de-duplicating or saving the info fails.
    """
    os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)

    playlist_info = None
    if self.use_cache and os.path.exists(self.playlist_info_json_path):
        LOGGER.info(
            "[Playlist %s] Retrieving cached playlist information...",
            self.playlist_id)
        try:
            with open(self.playlist_info_json_path, encoding="utf-8") as cache_file:
                playlist_info = json.load(cache_file)
        except (OSError, ValueError) as e:
            # Unreadable or corrupt cache: ignore it and fetch again.
            LOGGER.warning(
                "[Playlist %s] Ignoring unreadable cache file: %s",
                self.playlist_id, e)
            playlist_info = None

    if playlist_info:
        return playlist_info

    playlist_url = YOUTUBE_PLAYLIST_URL_FORMAT.format(self.playlist_id)
    playlist_resource = YouTubeResource(playlist_url)
    if not playlist_resource:
        return playlist_info

    try:
        # A None result raises on `.get` below and is reported by the
        # `except` branch like any other failure.
        playlist_info = playlist_resource.get_resource_info(
            dict(ignoreerrors=True, skip_download=True))

        # Remove duplicates by building a new list: removing from the list
        # being iterated skips elements and can leave duplicates behind.
        videos = playlist_info.get('children')
        if videos:
            seen_ids = set()
            unique_videos = []
            for video in videos:
                video_id = video['id']
                if video_id not in seen_ids:
                    seen_ids.add(video_id)
                    unique_videos.append(video)
            playlist_info['children'] = unique_videos

        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding has to be pinned explicitly.
        with open(self.playlist_info_json_path, 'w', encoding="utf-8") as cache_file:
            json.dump(playlist_info,
                      cache_file,
                      indent=4,
                      ensure_ascii=False,
                      sort_keys=False)
        LOGGER.info("[Playlist %s] Successfully get playlist info",
                    self.playlist_id)
        return playlist_info
    except Exception as e:
        LOGGER.error(
            "[Playlist %s] Failed to get playlist info: %s",
            self.playlist_id, e)
        return None
def download_from_web(web_url, download_settings, file_format=file_formats.MP4, ext="", download_ext=""):
    """
    Download `web_url` using YoutubeDL using `download_settings` options.

    Args:
        web_url (str): URL of the video or subtitles to download
        download_settings (dict): options to pass onto YoutubeDL (not modified)
        file_format (str): one of "mp4" or "vtt"
        ext (str): extensions to use as part of `outtmpl` given to YoutubeDL
        download_ext (str): extensions to append to `outtmpl` after downloading

    This function operates differently when downloading videos and subtitles.
    For videos we set the `outtmpl` to the actual filename that will be
    downloaded, and the function must be called with ext=".mp4" and
    download_ext="". For subtitles we set the `outtmpl` to an extension-less
    string, and YoutubeDL automatically appends the language code and vtt
    extension, so the function must be called with ext="" and
    download_ext=".{youtube_lang}.vtt"

    Raises:
        youtube_dl.utils.DownloadError: when the resource info cannot be
            obtained or the expected file is missing after the download.

    :return: the cached filename when one exists for this request, otherwise
        a filename derived from the hash of the file contents,
        {get_hash(file)}.{file_format}
    """
    key = generate_key("DOWNLOADED", web_url, settings=download_settings)
    cache_file = get_cache_filename(key)
    if cache_file:
        return cache_file

    # Work on a private copy: the caller's dict also feeds the cache key
    # above, so writing "outtmpl"/"writethumbnail" into it would change the
    # key of any later call that reuses the same dict.
    download_settings = dict(download_settings)

    # Get hash of web_url to act as temporary storage name
    url_digest = hashlib.md5(web_url.encode('utf-8')).hexdigest()
    tempfilename = "{}{ext}".format(url_digest, ext=ext)
    outtmpl_path = os.path.join(tempfile.gettempdir(), tempfilename)
    download_settings["outtmpl"] = outtmpl_path
    destination_path = outtmpl_path + download_ext  # file dest. after download

    # Delete files in case previously downloaded
    for stale_path in (outtmpl_path, destination_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # Download the web_url which can be either a video or subtitles
    if not config.USEPROXY:
        # Connect to YouTube directly
        with youtube_dl.YoutubeDL(download_settings) as ydl:
            ydl.download([web_url])
            if not os.path.exists(destination_path):
                raise youtube_dl.utils.DownloadError('Failed to download ' + web_url)
    else:
        # Connect to YouTube via an HTTP proxy
        yt_resource = YouTubeResource(web_url, useproxy=True, options=download_settings)
        result1 = yt_resource.get_resource_info()
        if result1 is None:
            raise youtube_dl.utils.DownloadError('Failed to get resource info')
        download_settings["writethumbnail"] = False  # overwrite default behaviour
        if file_format == file_formats.VTT:
            # We need to use the proxy when downloading subtitles
            result2 = yt_resource.download(options=download_settings, useproxy=True)
        else:
            # For video files we can skip the proxy for faster download speed
            result2 = yt_resource.download(options=download_settings)
        if result2 is None or not os.path.exists(destination_path):
            raise youtube_dl.utils.DownloadError(
                'Failed to download resource ' + web_url)

    # Write file to local storage
    filename = "{}.{}".format(get_hash(destination_path), file_format)
    with open(destination_path, "rb") as dlf, open(config.get_storage_path(filename), 'wb') as destf:
        shutil.copyfileobj(dlf, destf)

    FILECACHE.set(key, bytes(filename, "utf-8"))
    return filename