def __init__(self, source_id, name=None, type_name="Youtube", lang="ar", embeded=False, section_title=None, author=None, license=None):
    """Build a YouTube resource node after normalizing its source URL."""
    # Embedded-player URLs get rewritten to canonical form; everything else
    # is just cleaned. The `is True` comparison is kept deliberately.
    source_id = (YouTubeResourceNode.transform_embed(source_id)
                 if embeded is True
                 else self.clean_url(source_id))

    # Cooperative init of both bases (multiple inheritance without super()).
    YouTubeResource.__init__(self, source_id)
    Node.__init__(self, title=None, source_id=source_id, lang=lang,
                  author=author, license=license)

    LOGGER.info(" + Resource Type: {}".format(type_name))
    LOGGER.info(" - URL: {}".format(self.source_id))

    # Descriptive metadata.
    self.name = name
    self.type_name = type_name
    self.section_title = section_title
    # Download state — populated later by the download step.
    self.filename = None
    self.filepath = None
    self.file_format = file_formats.MP4
    self.is_valid = False
def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
    """Return YouTube metadata for this resource, preferring the JSON cache.

    Args:
        use_proxy (bool): pass requests through the HTTP proxy layer
        use_cache (bool): read/serve a previously cached JSON info file
        options (dict): extra options forwarded to get_resource_info
    Returns:
        dict of YouTube info, or None on failure/unavailable resource.
    """
    youtube_info = None
    # 1. Try to get from cache if allowed:
    if use_cache and os.path.exists(self.cache_path):
        LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
        with open(self.cache_path) as cache_f:  # close handle (was a bare open())
            youtube_info = json.load(cache_f)
    # 2. Fetch info from youtube_dl
    if not youtube_info:
        LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
        os.makedirs(self.cache_dir, exist_ok=True)
        # BUG FIX: initialize so the name is bound even when ExtractorError
        # is raised without "unavailable" in its message (was a NameError).
        youtube_resource = None
        try:
            youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                # BUG FIX: was `self.__str__` (bound method object logged
                # instead of the string) — call it.
                LOGGER.error("==> [%s] Resource unavailable for URL: %s",
                             self.__str__(), self.url)
                return None
        if youtube_resource:
            try:
                # Save YouTube info to JSON cache file
                youtube_info = youtube_resource.get_resource_info(options)
                if youtube_info:
                    with open(self.cache_path, 'w') as cache_f:
                        json.dump(youtube_info, cache_f,
                                  indent=4, ensure_ascii=False, sort_keys=True)
                else:
                    LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
            except Exception as e:
                LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                return None
    return youtube_info
def build_video_nodes(self, base_path, content):
    """Yield a YouTubeResourceNode for each YouTube video URL found in *content*.

    Args:
        base_path: nominally the download directory — NOTE(review): this
            parameter is immediately overwritten below and never honored;
            kept for signature compatibility, confirm callers' expectations.
        content: parsed HTML (BeautifulSoup-like) to scan for video links.
    Yields:
        YouTubeResourceNode instances, already downloaded if DOWNLOAD_VIDEOS.
    """
    videos_url = self.get_videos_urls(content)
    base_path = build_path([DATA_DIR])
    # (removed unused `video_nodes` accumulator — this is a generator)
    for video_url in videos_url:
        # Skip channel URLs; only individual videos become nodes.
        if YouTubeResource.is_youtube(video_url) and not YouTubeResource.is_channel(video_url):
            video = YouTubeResourceNode(video_url, lang=self.lang)
            video.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
            yield video
def download_info(self, use_cache=True):
    """Download video info to a JSON cache file and set metadata attributes.

    Sets self.uid/title/description/license(_common) from the video info.
    Returns:
        True on success, False if the URL is invalid or info is unavailable.
    """
    match = YOUTUBE_ID_REGEX.match(self.url)
    if not match:
        LOGGER.error('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
        return False
    youtube_id = match.group('youtube_id')
    if not os.path.isdir(YOUTUBE_CACHE_DIR):
        os.mkdir(YOUTUBE_CACHE_DIR)
    vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id + '.json')
    # First try to get from cache:
    vinfo = None
    if use_cache and os.path.exists(vinfo_json_path):
        with open(vinfo_json_path) as vinfo_f:  # close handle (was a bare open())
            vinfo = json.load(vinfo_f)
        if vinfo is None:
            # Cached JSON is `null` for previously-unavailable videos; skip
            # (consistent with the sibling print-based download_info).
            return False
        LOGGER.info("Retrieving cached video information...")
    # else get using youtube_dl:
    if not vinfo:
        LOGGER.info("Downloading %s from youtube...", self.url)
        # BUG FIX: bind the name so an ExtractorError without "unavailable"
        # no longer causes a NameError below.
        video = None
        try:
            video = YouTubeResource(self.url)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                LOGGER.error("Video not found at URL: %s", self.url)
                return False
        if not video:
            return False
        try:
            vinfo = video.get_resource_info()
            with open(vinfo_json_path, 'w') as vinfo_f:
                json.dump(vinfo, vinfo_f,
                          indent=4, ensure_ascii=False, sort_keys=True)
            # BUG FIX: do NOT `return True` here — previously the metadata
            # attributes below were only set on cache hits, never after a
            # fresh download (sibling download_info sets them in both cases).
            if vinfo is None:
                return False  # video unavailable; nothing to set
        except Exception as e:
            LOGGER.error("Failed to get video info: %s", e)
            return False
    self.uid = vinfo['id']  # video must have id because required to set youtube_id later
    self.title = vinfo.get('title', '')
    self.description = vinfo.get('description', '')
    if not vinfo['license']:
        self.license = "Licensed not available"
    elif "Creative Commons" in vinfo['license']:
        self.license_common = True
    else:
        self.license = vinfo['license']
    return True
def test_proxy_playlist_download(tmp_path):
    """Playlist download should fetch every video + thumbnail in the playlist."""
    resource = YouTubeResource(YOUTUBE_TEST_PLAYLIST)
    resource.download(tmp_path)
    downloaded = set(os.listdir(os.path.join(tmp_path, 'Playlist')))
    assert downloaded == {
        'zbkizy-Y3qw.jpg',
        'zbkizy-Y3qw.mp4',
        'oXnzstpBEOg.jpg',
        'oXnzstpBEOg.mp4',
    }
def download_info(self):
    """Download video info (cached as JSON) and set metadata attributes.

    Returns:
        True on success, False if the URL is invalid or the video is unavailable.
    """
    match = YOUTUBE_ID_REGEX.match(self.url)
    if not match:
        print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
        return False
    youtube_id = match.group('youtube_id')
    if not os.path.isdir(YOUTUBE_CACHE_DIR):
        os.mkdir(YOUTUBE_CACHE_DIR)
    vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id+'.json')
    # First try to get from cache:
    vinfo = None
    if os.path.exists(vinfo_json_path):
        with open(vinfo_json_path) as vinfo_f:  # close handle (was a bare open())
            vinfo = json.load(vinfo_f)
        if not vinfo:  # the json data for "Video unavailable" is `null` so can skip them
            return False
        print("Using cached video info for youtube_id", youtube_id)
    # else get using YouTubeResource
    if not vinfo:
        print("Downloading {} from youtube...".format(self.url))
        # BUG FIX: bind the name so an ExtractorError whose message lacks
        # "unavailable" no longer causes a NameError at `if video:` below.
        video = None
        try:
            video = YouTubeResource(self.url)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                print("Video not found at URL: {}".format(self.url))
                return False
        if not video:
            return False
        try:
            vinfo = video.get_resource_info()
            # Save the remaining "temporary scraped values" of attributes
            # with actual values from the video metadata.
            with open(vinfo_json_path, 'w') as vinfo_f:
                json.dump(vinfo, vinfo_f,
                          indent=4, ensure_ascii=False, sort_keys=True)
            if vinfo is None:
                return False  # unavailable video: cached as null, nothing to set
        except Exception as e:
            print(e)
            return False
    self.uid = vinfo['id']  # video must have id because required to set youtube_id later
    self.title = vinfo.get('title', '')
    self.description = vinfo.get('description', '')
    if not vinfo['license']:
        self.license = "Licensed not available"
    elif "Creative Commons" in vinfo['license']:
        self.license_common = True
    else:
        self.license = vinfo['license']
    return True
def test_proxy_download(tmp_path):
    """A single video download through the proxy layer yields an .mp4 file."""
    proxy.get_proxies(refresh=True)
    assert len(proxy.PROXY_LIST) > 1
    resource = YouTubeResource(YOUTUBE_TEST_VIDEO)
    resource.download(tmp_path)
    downloaded = os.listdir(os.path.join(tmp_path, 'Watch'))
    has_video = any(name.endswith('.mp4') for name in downloaded)
    assert has_video, 'Video file not found'
def test_bad_proxies_get_banned(tmp_path):
    """Seeding PROXY_LIST with unreachable proxies should get them all banned."""
    # Addresses that cannot possibly serve traffic.
    fake_proxies = [
        '123.123.123.123:1234',
        '142.123.1.234:123345',
        '156.245.233.211:12323',
        '11.22.33.44:123',
    ]
    # Initialize PROXY_LIST to known-bad proxies to check that they get banned.
    proxy.PROXY_LIST = fake_proxies.copy()
    video = YouTubeResource(YOUTUBE_TEST_VIDEO)
    video.download(tmp_path)
    # Every fake proxy should now appear in BROKEN_PROXIES.
    assert all(p in proxy.BROKEN_PROXIES for p in fake_proxies)
def get_videos_urls(self, content):
    """Collect YouTube video URLs referenced by anchors and iframes in *content*.

    Args:
        content: parsed HTML tree (BeautifulSoup-like) or None.
    Returns:
        set of video URLs (embed iframes are rewritten to watch URLs).
    """
    urls = set()
    if content is not None:
        # BUG FIX: the original lambda read `tag.name == "a" and A or B or C`,
        # which — due to and/or precedence — also matched non-anchor tags and
        # added their (empty) hrefs. Restrict the whole OR chain to anchors.
        def _is_video_anchor(tag):
            href = tag.attrs.get("href", "")
            return tag.name == "a" and (
                href.find("youtube") != -1
                or href.find("youtu.be") != -1
                or tag.text.lower() == "youtube")
        for anchor in content.find_all(_is_video_anchor):
            urls.add(anchor.get("href", ""))
        for iframe in content.find_all("iframe"):
            # robustness: an iframe without a src no longer raises KeyError
            url = iframe.get("src", "")
            if YouTubeResource.is_youtube(url) and not YouTubeResource.is_channel(url):
                urls.add(YouTubeResource.transform_embed(url))
    return urls
def get_playlist_info(self):
    """ Get playlist info from either local json cache or URL.

    Returns:
        dict of playlist info with de-duplicated 'children', or None on failure.
    """
    if not os.path.isdir(YOUTUBE_CACHE_DIR):
        os.mkdir(YOUTUBE_CACHE_DIR)
    playlist_info = None
    if self.use_cache and os.path.exists(self.playlist_info_json_path):
        LOGGER.info("[Playlist %s] Retrieving cached playlist information...",
                    self.playlist_id)
        with open(self.playlist_info_json_path) as cache_f:  # close handle
            playlist_info = json.load(cache_f)
    if not playlist_info:
        playlist_url = YOUTUBE_PLAYLIST_URL_FORMAT.format(self.playlist_id)
        playlist_resource = YouTubeResource(playlist_url)
        if playlist_resource:
            try:
                playlist_info = playlist_resource.get_resource_info(
                    dict(ignoreerrors=True, skip_download=True))
                # BUG FIX: the original removed duplicates with list.remove()
                # while iterating the same list, which skips the element after
                # each removal. Build a filtered list instead.
                seen_ids = set()
                unique_videos = []
                for video in playlist_info.get('children') or []:
                    if video['id'] not in seen_ids:
                        seen_ids.add(video['id'])
                        unique_videos.append(video)
                playlist_info['children'] = unique_videos
                with open(self.playlist_info_json_path, 'w') as cache_f:
                    json.dump(playlist_info, cache_f,
                              indent=4, ensure_ascii=False, sort_keys=False)
                LOGGER.info("[Playlist %s] Successfully get playlist info",
                            self.playlist_id)
                return playlist_info
            except Exception as e:
                LOGGER.error("[Playlist %s] Failed to get playlist info: %s",
                             self.playlist_id, e)
                return None
    return playlist_info
def get_subtitles_using_youtube_dl(youtube_id):
    """Return the language codes that have vtt subtitles for *youtube_id*."""
    youtube_url = 'https://youtube.com/watch?v=' + youtube_id
    yt_resource = YouTubeResource(youtube_url)
    lang_codes = []
    try:
        result = yt_resource.get_resource_subtitles()
        # TODO(ivan) Consider including auto-generated subtitles to increase
        # coverage and handle edge cases of videos that are translated
        # but no metadata: https://www.youtube.com/watch?v=qlGjA9p1UAM
        if result:
            for lang_code, subs in result['subtitles'].items():
                # Only count languages that ship at least one vtt track.
                has_vtt = any(sub.get('ext') == 'vtt' for sub in subs)
                if has_vtt and lang_code not in lang_codes:
                    lang_codes.append(lang_code)
    except Exception as e:
        LOGGER.error('get_subtitles_using_youtube_dl failed for ' + youtube_url)
        LOGGER.error(str(e))
    return lang_codes
def download_from_web(web_url, download_settings, file_format=file_formats.MP4, ext="", download_ext=""):
    """
    Download `web_url` using YoutubeDL using `download_settings` options.
    Args:
        download_settings (dict): options to pass onto YoutubeDL
        file_format (str): one of "mp4" or "vtt"
        ext (str): extensions to use as part of `outtmpl` given to YoutubeDL
        download_ext (str): extensions to append to `outtmpl` after downloading
    This function operates differently when downloading videos and subtitles.
    For videos we set the `outtmpl` to the actual filename that will be
    downloaded, and the function must be called with ext = ".mp4" and
    download_ext="".
    For subtitles we set the `outtmpl` to extension-less string, and YoutubeDL
    automatically appends the language code and vtt extension, so the function
    must be called with ext="" and download_ext=".{youtube_lang}.vtt"
    :return: filename derived from hash of file contents {md5hash(file)}.ext
    """
    # Serve from the file cache when this URL+settings combo was seen before.
    key = generate_key("DOWNLOADED", web_url, settings=download_settings)
    cache_file = get_cache_filename(key)
    if cache_file:
        return cache_file
    # Get hash of web_url to act as temporary storage name
    url_hash = hashlib.md5()
    url_hash.update(web_url.encode('utf-8'))
    tempfilename = "{}{ext}".format(url_hash.hexdigest(), ext=ext)
    outtmpl_path = os.path.join(tempfile.gettempdir(), tempfilename)
    download_settings["outtmpl"] = outtmpl_path
    destination_path = outtmpl_path + download_ext  # file dest. after download
    # Delete files in case previously downloaded
    if os.path.exists(outtmpl_path):
        os.remove(outtmpl_path)
    if os.path.exists(destination_path):
        os.remove(destination_path)
    # Download the web_url which can be either a video or subtitles
    if not config.USEPROXY:
        # Connect to YouTube directly
        with youtube_dl.YoutubeDL(download_settings) as ydl:
            ydl.download([web_url])
        if not os.path.exists(destination_path):
            raise youtube_dl.utils.DownloadError('Failed to download ' + web_url)
    else:
        # Connect to YouTube via an HTTP proxy
        yt_resource = YouTubeResource(web_url, useproxy=True, options=download_settings)
        result1 = yt_resource.get_resource_info()
        if result1 is None:
            raise youtube_dl.utils.DownloadError('Failed to get resource info')
        download_settings["writethumbnail"] = False  # overwrite default behaviour
        if file_format == file_formats.VTT:
            # We need to use the proxy when downloading subtitles
            result2 = yt_resource.download(options=download_settings, useproxy=True)
        else:
            # For video files we can skip the proxy for faster download speed
            result2 = yt_resource.download(options=download_settings)
        if result2 is None or not os.path.exists(destination_path):
            raise youtube_dl.utils.DownloadError(
                'Failed to download resource ' + web_url)
    # Write file to local storage under a content-hash-derived filename.
    filename = "{}.{}".format(get_hash(destination_path), file_format)
    with open(destination_path, "rb") as dlf, open(config.get_storage_path(filename), 'wb') as destf:
        shutil.copyfileobj(dlf, destf)
    FILECACHE.set(key, bytes(filename, "utf-8"))
    return filename