class Youtube2Zim:
    def __init__(
        self,
        collection_type,
        youtube_id,
        api_key,
        video_format,
        low_quality,
        nb_videos_per_page,
        all_subtitles,
        autoplay,
        output_dir,
        no_zim,
        fname,
        debug,
        tmp_dir,
        keep_build_dir,
        max_concurrency,
        youtube_store,
        language,
        locale_name,
        tags,
        dateafter,
        use_any_optimized_version,
        s3_url_with_credentials,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        name=None,
        profile_image=None,
        banner_image=None,
        main_color=None,
        secondary_color=None,
    ):
        # data-retrieval info
        self.collection_type = collection_type
        self.youtube_id = youtube_id
        self.api_key = api_key
        self.dateafter = dateafter

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # options & zim params
        self.nb_videos_per_page = nb_videos_per_page
        self.all_subtitles = all_subtitles
        self.autoplay = autoplay
        self.fname = fname
        self.language = language
        self.tags = [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.profile_image = profile_image
        self.banner_image = banner_image
        self.main_color = main_color
        self.secondary_color = secondary_color

        # directory setup
        self.output_dir = Path(output_dir).expanduser().resolve()
        if tmp_dir:
            tmp_dir = Path(tmp_dir).expanduser().resolve()
            tmp_dir.mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))

        # process-related
        self.playlists = []
        self.uploads_playlist_id = None
        self.videos_ids = []
        self.main_channel_id = None  # used for branding

        # debug/devel options
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.max_concurrency = max_concurrency

        # update youtube credentials store
        youtube_store.update(
            build_dir=self.build_dir, api_key=self.api_key, cache_dir=self.cache_dir
        )

        # optimization-cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.video_quality = "low" if self.low_quality else "high"
        self.s3_storage = None

        # set and record locale for translations
        locale_name = locale_name or get_language_details(self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "Defaulting to en_US."
            )
            self.locale = setlocale(ROOT_DIR, "en")

    @property
    def root_dir(self):
        return ROOT_DIR

    @property
    def templates_dir(self):
        return self.root_dir.joinpath("templates")

    @property
    def assets_src_dir(self):
        return self.templates_dir.joinpath("assets")

    @property
    def assets_dir(self):
        return self.build_dir.joinpath("assets")

    @property
    def channels_dir(self):
        return self.build_dir.joinpath("channels")

    @property
    def cache_dir(self):
        return self.build_dir.joinpath("cache")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def profile_path(self):
        return self.build_dir.joinpath("profile.jpg")

    @property
    def banner_path(self):
        return self.build_dir.joinpath("banner.jpg")

    @property
    def is_user(self):
        return self.collection_type == USER

    @property
    def is_channel(self):
        return self.collection_type == CHANNEL

    @property
    def is_playlist(self):
        return self.collection_type == PLAYLIST

    @property
    def is_single_channel(self):
        if self.is_channel or self.is_user:
            return True
        return len({pl.creator_id for pl in self.playlists}) == 1

    @property
    def sorted_playlists(self):
        """sorted list of playlists (by title), with the Uploads one first if any"""
        if len(self.playlists) < 2:
            return self.playlists

        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
        index = 0
        # make sure our special Uploads playlist is first
        if self.uploads_playlist_id:
            try:
                index = [
                    index
                    for index, p in enumerate(sorted_playlists)
                    if p.playlist_id == self.uploads_playlist_id
                ][-1]
            except Exception:
                index = 0
        return (
            [sorted_playlists[index]]
            + sorted_playlists[0:index]
            + sorted_playlists[index + 1 :]
        )

    def run(self):
        """execute the scraper step by step"""
        self.validate_id()

        # validate dateafter input
        self.validate_dateafter_input()

        logger.info(
            f"starting youtube scraper for {self.collection_type}#{self.youtube_id}"
        )
        logger.info("preparing build folder at {}".format(self.build_dir.resolve()))
        self.prepare_build_folder()

        logger.info("testing Youtube credentials")
        if not credentials_ok():
            raise ValueError("Unable to connect to Youtube API v3. Check `API_KEY`.")

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")

        # fail early if supplied branding files are missing
        self.check_branding_values()

        logger.info("compute playlists list to retrieve")
        self.extract_playlists()
        logger.info(
            ".. {} playlists:\n {}".format(
                len(self.playlists),
                "\n ".join([p.playlist_id for p in self.playlists]),
            )
        )

        logger.info("compute list of videos")
        self.extract_videos_list()
        nb_videos_msg = f".. {len(self.videos_ids)} videos"
        if self.dateafter.start.year != 1:
            nb_videos_msg += (
                f" in date range: {self.dateafter.start} - {datetime.date.today()}"
            )
        logger.info(f"{nb_videos_msg}.")

        # download videos (and recompress)
        logger.info(
            f"downloading all videos, subtitles and thumbnails (concurrency={self.max_concurrency})"
        )
        logger.info(f"  format: {self.video_format}")
        logger.info(f"  quality: {self.video_quality}")
        logger.info(f"  generated-subtitles: {self.all_subtitles}")
        if self.s3_storage:
            logger.info(
                f"  using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )
        succeeded, failed = self.download_video_files(
            max_concurrency=self.max_concurrency
        )
        if failed:
            logger.error(f"{len(failed)} video(s) failed to download: {failed}")
            if len(failed) >= len(succeeded):
                logger.critical("Half or more of the videos failed. Exiting.")
                raise IOError("Too many videos failed to download")

        logger.info("retrieve channel-info for all videos (author details)")
        get_videos_authors_info(succeeded)

        logger.info("download all authors' profile pictures")
        self.download_authors_branding()

        logger.info("update general metadata")
        self.update_metadata()

        logger.info("creating HTML files")
        self.make_html_files(succeeded)

        # make zim file
        os.makedirs(self.output_dir, exist_ok=True)
        if not self.no_zim:
            period = datetime.datetime.now().strftime("%Y-%m")
            self.fname = (
                self.fname.format(period=period)
                if self.fname
                else f"{self.name}_{period}.zim"
            )
            logger.info("building ZIM file")
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir / self.fname,
                name=self.name,
                main_page="home.html",
                favicon="favicon.jpg",
                title=self.title,
                description=self.description,
                language=self.language,
                creator=self.creator,
                publisher=self.publisher,  # honor --publisher (defaulted to "Kiwix" in update_metadata)
                tags=self.tags,
                scraper=SCRAPER,
            )

            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("all done!")

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True, failsafe=True
        ):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def validate_dateafter_input(self):
        try:
            self.dateafter = youtube_dl.DateRange(self.dateafter)
        except Exception as exc:
            logger.error(
                "Invalid dateafter input. Valid dateafter format: "
                "YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s)."
            )
            raise ValueError(f"Invalid dateafter input: {exc}")

    def validate_id(self):
        # space not allowed in youtube-ID
        self.youtube_id = self.youtube_id.replace(" ", "")
        if self.collection_type == "channel" and len(self.youtube_id) > 24:
            raise ValueError("Invalid ChannelId")
        if "," in self.youtube_id and self.collection_type != "playlist":
            raise ValueError("Invalid YoutubeId")

    def prepare_build_folder(self):
        """prepare build folder before we start downloading data"""

        # copy assets
        shutil.copytree(self.assets_src_dir, self.assets_dir)
        fix_source_dir(self.assets_dir, "assets")

        # cache folder to store youtube-api results
        self.cache_dir.mkdir(exist_ok=True)

        # make videos placeholder
        self.videos_dir.mkdir(exist_ok=True)

        # make channels placeholder (profile files)
        self.channels_dir.mkdir(exist_ok=True)

    def check_branding_values(self):
        """checks that user-supplied images and colors are valid (so we fail early)

        Images are checked for existence or downloaded, then resized
        Colors are checked for validity"""

        # skip this step if none of the related values were supplied
        if not any(
            (
                self.profile_image,
                self.banner_image,
                self.main_color,
                self.secondary_color,
            )
        ):
            return
        logger.info("checking your branding files and values")
        if self.profile_image:
            if self.profile_image.startswith("http"):
                stream_file(self.profile_image, self.profile_path)
            else:
                if not self.profile_image.exists():
                    raise IOError(
                        f"--profile image could not be found: {self.profile_image}"
                    )
                shutil.move(self.profile_image, self.profile_path)
            resize_image(self.profile_path, width=100, height=100, method="thumbnail")
        if self.banner_image:
            if self.banner_image.startswith("http"):
                stream_file(self.banner_image, self.banner_path)
            else:
                if not self.banner_image.exists():
                    raise IOError(
                        f"--banner image could not be found: {self.banner_image}"
                    )
                shutil.move(self.banner_image, self.banner_path)
            resize_image(self.banner_path, width=1060, height=175, method="thumbnail")

        if self.main_color and not is_hex_color(self.main_color):
            raise ValueError(
                f"--main-color is not a valid hex color: {self.main_color}"
            )

        if self.secondary_color and not is_hex_color(self.secondary_color):
            raise ValueError(
                f"--secondary-color is not a valid hex color: {self.secondary_color}"
            )

    def extract_playlists(self):
        """prepare a list of Playlist from user request

        USER: we fetch the hidden channel associated with it
        CHANNEL (and USER): we grab all playlists + the `uploads` playlist
        PLAYLIST: we retrieve from the playlist Id(s)"""
        (
            self.playlists,
            self.main_channel_id,
            self.uploads_playlist_id,
        ) = extract_playlists_details_from(self.collection_type, self.youtube_id)

    def extract_videos_list(self):
        all_videos = load_json(self.cache_dir, "videos")
        if all_videos is None:
            all_videos = {}

            # we only return video_ids that we'll use later on;
            # per-playlist JSON is stored in cache
            for playlist in self.playlists:
                videos_json = get_videos_json(playlist.playlist_id)
                # keep videos within the date range, filter away deleted videos
                skip_outofrange = functools.partial(
                    skip_outofrange_videos, self.dateafter
                )
                filter_videos = filter(skip_outofrange, videos_json)
                filter_videos = filter(skip_deleted_videos, filter_videos)
                all_videos.update(
                    {v["contentDetails"]["videoId"]: v for v in filter_videos}
                )
            save_json(self.cache_dir, "videos", all_videos)
        self.videos_ids = [*all_videos.keys()]  # unpacking so it's subscriptable

    def download_video_files(self, max_concurrency):
        audext, vidext = {"webm": ("webm", "webm"), "mp4": ("m4a", "mp4")}[
            self.video_format
        ]

        # prepare options which are shared with every downloader
        options = {
            "cachedir": self.videos_dir,
            "writethumbnail": True,
            "write_all_thumbnails": False,
            "writesubtitles": True,
            "allsubtitles": True,
            "subtitlesformat": "vtt",
            "keepvideo": False,
            "ignoreerrors": False,
            "retries": 20,
            # youtube_dl option keys use underscores (CLI-style dashed keys are ignored)
            "fragment_retries": 50,
            "skip_unavailable_fragments": True,
            # "external_downloader": "aria2c",
            # "external_downloader_args": ["--max-tries=20", "--retry-wait=30"],
            "outtmpl": str(self.videos_dir.joinpath("%(id)s", "video.%(ext)s")),
            "preferredcodec": self.video_format,
            "format": f"best[ext={vidext}]/bestvideo[ext={vidext}]+bestaudio[ext={audext}]/best",
            "y2z_videos_dir": self.videos_dir,
        }
        if self.all_subtitles:
            options.update({"writeautomaticsub": True})

        # find the number of actual parallel workers
        nb_videos = len(self.videos_ids)
        concurrency = nb_videos if nb_videos < max_concurrency else max_concurrency

        # short-circuit concurrency if we have only one thread (can help debug)
        if concurrency <= 1:
            return self.download_video_files_batch(options, self.videos_ids)

        # prepare our videos_ids batches
        def get_slot():
            n = 0
            while True:
                yield n
                n += 1
                if n >= concurrency:
                    n = 0

        batches = [[] for _ in range(0, concurrency)]
        slot = get_slot()
        for video_id in self.videos_ids:
            batches[next(slot)].append(video_id)

        overall_succeeded = []
        overall_failed = []
        # execute the batches concurrently
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=concurrency
        ) as executor:
            fs = [
                executor.submit(self.download_video_files_batch, options, videos_ids)
                for videos_ids in batches
            ]
            done, not_done = concurrent.futures.wait(
                fs, return_when=concurrent.futures.ALL_COMPLETED
            )

            # we have some `not_done` batches, indicating errors within
            if not_done:
                logger.critical(
                    "Not all video-processing batches completed. Cancelling…"
                )
                for future in not_done:
                    exc = future.exception()
                    if exc:
                        logger.exception(exc)
                        raise exc

            # retrieve our list of successful/failed video_ids
            for future in done:
                succeeded, failed = future.result()
                overall_succeeded += succeeded
                overall_failed += failed

        # remove left-over files for failed downloads
        logger.debug(
            f"removing left-over files of {len(overall_failed)} failed videos"
        )
        for video_id in overall_failed:
            shutil.rmtree(self.videos_dir.joinpath(video_id), ignore_errors=True)

        return overall_succeeded, overall_failed

    def download_from_cache(self, key, video_path, encoder_version):
        """whether it successfully downloaded from cache"""
        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key, self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                key, tag="encoder_version", value=f"v{encoder_version}"
            ):
                return False
        video_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, video_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {video_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, video_path, encoder_version):
        """whether it successfully uploaded to cache"""
        try:
            self.s3_storage.upload_file(
                video_path, key, meta={"encoder_version": f"v{encoder_version}"}
            )
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {video_path} to cache at {key}")
        return True

    def download_video(self, video_id, options):
        """download the video from cache/youtube and return True if successful"""
        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        video_path = video_location.joinpath(f"video.{self.video_format}")

        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            logger.debug(
                f"Attempting to download video file for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, video_path, preset.VERSION):
                return True

        try:
            # skip downloading the thumbnails
            options_copy.update(
                {
                    "writethumbnail": False,
                    "writesubtitles": False,
                    "allsubtitles": False,
                    "writeautomaticsub": False,
                }
            )
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            post_process_video(
                video_location,
                video_id,
                preset,
                self.video_format,
                self.low_quality,
            )
        except (
            youtube_dl.utils.DownloadError,
            FileNotFoundError,
            subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Video file for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:
            # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(f"Uploading video file for {video_id} to cache ...")
                self.upload_to_cache(s3_key, video_path, preset.VERSION)
            return True

    def download_thumbnail(self, video_id, options):
        """download the thumbnail from cache/youtube and return True if successful"""
        preset = WebpHigh()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        thumbnail_path = video_location.joinpath("video.webp")

        if self.s3_storage:
            s3_key = f"thumbnails/high/{video_id}"
            logger.debug(
                f"Attempting to download thumbnail for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, thumbnail_path, preset.VERSION):
                return True

        try:
            # skip downloading the video
            options_copy.update(
                {
                    "skip_download": True,
                    "writesubtitles": False,
                    "allsubtitles": False,
                    "writeautomaticsub": False,
                }
            )
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            process_thumbnail(thumbnail_path, preset)
        except (
            youtube_dl.utils.DownloadError,
            FileNotFoundError,
            subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Thumbnail for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:
            # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(f"Uploading thumbnail for {video_id} to cache ...")
                self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)
            return True

    def download_subtitles(self, video_id, options):
        """download subtitles for a video"""
        options_copy = options.copy()
        options_copy.update({"skip_download": True, "writethumbnail": False})
        try:
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
        except Exception:
            logger.error(f"Could not download subtitles for {video_id}")

    def download_video_files_batch(self, options, videos_ids):
        """download video file and thumbnail for all videos in batch

        returns succeeded and failed video ids"""
        succeeded = []
        failed = []
        for video_id in videos_ids:
            if self.download_video(video_id, options) and self.download_thumbnail(
                video_id, options
            ):
                self.download_subtitles(video_id, options)
                succeeded.append(video_id)
            else:
                failed.append(video_id)
        return succeeded, failed

    def download_authors_branding(self):
        videos_channels_json = load_json(self.cache_dir, "videos_channels")
        uniq_channel_ids = list(
            set([chan["channelId"] for chan in videos_channels_json.values()])
        )
        for channel_id in uniq_channel_ids:
            save_channel_branding(self.channels_dir, channel_id, save_banner=False)
            self.copy_default_banner(channel_id)

    def copy_default_banner(self, channel_id):
        banner_path = self.channels_dir / channel_id / "banner.jpg"
        if not banner_path.exists():
            shutil.copy(
                self.templates_dir / "assets" / "banner.jpg",
                self.channels_dir / channel_id / "banner.jpg",
            )

    def update_metadata(self):
        # we use title, description, profile and banner of channel/user
        # or channel of first playlist
        try:
            main_channel_json = get_channel_json(self.main_channel_id)
        except KeyError:
            main_channel_json = {"snippet": {"title": "Unknown", "description": ""}}
        else:
            save_channel_branding(
                self.channels_dir, self.main_channel_id, save_banner=True
            )
        self.copy_default_banner(self.main_channel_id)

        # if a single playlist was requested, use it for names;
        # otherwise, use main_channel's details.
        auto_title = (
            self.playlists[0].title
            if self.is_playlist and len(self.playlists) == 1
            else main_channel_json["snippet"]["title"].strip()
        )
        auto_description = (
            clean_text(self.playlists[0].description)
            if self.is_playlist and len(self.playlists) == 1
            else clean_text(main_channel_json["snippet"]["description"])
        )
        self.title = self.title or auto_title or "-"
        self.description = self.description or auto_description or "-"

        if self.creator is None:
            if self.is_single_channel:
                self.creator = _("Youtube Channel “{title}”").format(
                    title=main_channel_json["snippet"]["title"]
                )
            else:
                self.creator = _("Youtube Channels")

        self.publisher = self.publisher or "Kiwix"

        self.tags = self.tags or ["youtube"]
        if "_videos:yes" not in self.tags:
            self.tags.append("_videos:yes")

        # copy our main_channel branding into /(profile|banner).jpg if not supplied
        if not self.profile_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "profile.jpg"),
                self.profile_path,
            )
        if not self.banner_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "banner.jpg"),
                self.banner_path,
            )

        # set colors from images if not supplied
        if self.main_color is None or self.secondary_color is None:
            profile_main, profile_secondary = get_colors(self.profile_path)
            self.main_color = self.main_color or profile_main
            self.secondary_color = self.secondary_color or profile_secondary

        resize_image(
            self.profile_path,
            width=48,
            height=48,
            method="thumbnail",
            dst=self.build_dir.joinpath("favicon.jpg"),
        )

    def make_html_files(self, actual_videos_ids):
        """make up HTML structure to read the content

        /home.html                                  Homepage

        for each video:
            - <slug-title>.html                     HTML article
            - videos/<videoId>/video.<ext>          video file
            - videos/<videoId>/video.<lang>.vtt     subtitle(s)
            - videos/<videoId>/video.webp           thumbnail
        """

        def remove_unused_videos(videos):
            video_ids = [video["contentDetails"]["videoId"] for video in videos]
            for path in self.videos_dir.iterdir():
                if path.is_dir() and path.name not in video_ids:
                    logger.debug(f"Removing unused video {path.name}")
                    shutil.rmtree(path, ignore_errors=True)

        def is_present(video):
            """whether this video has actually been successfully downloaded"""
            return video["contentDetails"]["videoId"] in actual_videos_ids

        def video_has_channel(videos_channels, video):
            return video["contentDetails"]["videoId"] in videos_channels

        def get_subtitles(video_id):
            video_dir = self.videos_dir.joinpath(video_id)
            languages = [
                x.stem.split(".")[1]
                for x in video_dir.iterdir()
                if x.is_file() and x.name.endswith(".vtt")
            ]

            def to_jinja_subtitle(lang):
                try:
                    subtitle = get_language_details(YOUTUBE_LANG_MAP.get(lang, lang))
                except Exception:
                    logger.error(f"Failed to get language details for {lang}")
                    raise
                return {
                    "code": lang,
                    # Youtube.com uses the `English - code` format.
                    # Note: videojs displays it lowercased anyway
                    "name": f"{subtitle['english'].title()} - {subtitle['query']}",
                }

            # Youtube.com sorts subtitles by English name
            return sorted(map(to_jinja_subtitle, languages), key=lambda x: x["name"])

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )

        videos = load_json(self.cache_dir, "videos").values()
        # filter videos so we only include the ones we could retrieve
        videos = list(filter(is_present, videos))
        videos_channels = load_json(self.cache_dir, "videos_channels")
        has_channel = functools.partial(video_has_channel, videos_channels)
        # filter videos to exclude those for which we have no channel (#76)
        videos = list(filter(has_channel, videos))

        for video in videos:
            video_id = video["contentDetails"]["videoId"]
            title = video["snippet"]["title"]
            slug = get_slug(title)
            description = video["snippet"]["description"]
            publication_date = dt_parser.parse(
                video["contentDetails"]["videoPublishedAt"]
            )
            author = videos_channels[video_id]
            subtitles = get_subtitles(video_id)
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            html = env.get_template("article.html").render(
                video_id=video_id,
                video_format=self.video_format,
                author=author,
                title=title,
                description=description,
                date=format_date(
                    publication_date, format="medium", locale=self.locale
                ),
                subtitles=subtitles,
                url=video_url,
                channel_id=video["snippet"]["channelId"],
                color=self.main_color,
                background_color=self.secondary_color,
                autoplay=self.autoplay,
            )
            with open(
                self.build_dir.joinpath(f"{slug}.html"), "w", encoding="utf-8"
            ) as fp:
                fp.write(html)

        # build homepage
        html = env.get_template("home.html").render(
            playlists=self.playlists,
            video_format=self.video_format,
            title=self.title,
            description=self.description,
            color=self.main_color,
            background_color=self.secondary_color,
            page_label=_("Page {current}/{total}"),
            back_label=_("Back to top"),
        )
        with open(self.build_dir.joinpath("home.html"), "w", encoding="utf-8") as fp:
            fp.write(html)

        # rewrite app.js including `format`
        with open(self.assets_dir.joinpath("app.js"), "w", encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/app.js").render(
                    video_format=self.video_format
                )
            )

        # rewrite db.js including `pagination`
        with open(self.assets_dir.joinpath("db.js"), "w", encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/db.js").render(
                    NB_VIDEOS_PER_PAGE=self.nb_videos_per_page
                )
            )

        # write list of videos in data.js
        def to_data_js(video):
            return {
                "id": video["contentDetails"]["videoId"],
                "title": video["snippet"]["title"],
                "slug": get_slug(video["snippet"]["title"]),
                "description": video["snippet"]["description"],
                "subtitles": get_subtitles(video["contentDetails"]["videoId"]),
                "thumbnail": str(
                    Path("videos").joinpath(
                        video["contentDetails"]["videoId"], "video.webp"
                    )
                ),
            }

        with open(self.assets_dir.joinpath("data.js"), "w", encoding="utf-8") as fp:
            # write all playlists as they are
            for playlist in self.playlists:
                # retrieve the list of videos for this playlist
                playlist_videos = load_json(
                    self.cache_dir, f"playlist_{playlist.playlist_id}_videos"
                )
                # filter out missing ones (deleted or not downloaded)
                playlist_videos = list(filter(skip_deleted_videos, playlist_videos))
                playlist_videos = list(filter(is_present, playlist_videos))
                playlist_videos = list(filter(has_channel, playlist_videos))
                # sort them by playlist position
                playlist_videos.sort(key=lambda v: v["snippet"]["position"])

                fp.write(
                    "var json_{slug} = {json_str};\n".format(
                        slug=playlist.slug,
                        json_str=json.dumps(
                            list(map(to_data_js, playlist_videos)), indent=4
                        ),
                    )
                )

        # write a metadata.json file with some content-related data
        with open(
            self.build_dir.joinpath("metadata.json"), "w", encoding="utf-8"
        ) as fp:
            json.dump({"video_format": self.video_format}, fp, indent=4)

        # clean videos left out in the videos directory
        remove_unused_videos(videos)
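# ---------------------------------------------------------------------------
# Illustration (editor's sketch, not part of the scraper): the round-robin
# batching used by Youtube2Zim.download_video_files above, reduced to a
# self-contained example. The names `spread_across` and `process_batch` are
# hypothetical; only the slot-cycling logic mirrors the scraper, here written
# with a modulo instead of the get_slot() generator.
# ---------------------------------------------------------------------------
import concurrent.futures


def spread_across(items, concurrency):
    """Distribute items over `concurrency` batches, round-robin."""
    batches = [[] for _ in range(concurrency)]
    for index, item in enumerate(items):
        batches[index % concurrency].append(item)
    return batches


def process_batch(batch):
    """Stand-in worker: returns (succeeded, failed) like download_video_files_batch."""
    return batch, []


if __name__ == "__main__":
    video_ids = [f"video-{n}" for n in range(10)]
    concurrency = min(3, len(video_ids))
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(process_batch, batch)
            for batch in spread_across(video_ids, concurrency)
        ]
        done, _ = concurrent.futures.wait(futures)
    for future in done:
        succeeded, failed = future.result()
        print(succeeded, failed)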
class Ted2Zim:
    def __init__(
        self,
        topics,
        debug,
        name,
        video_format,
        low_quality,
        output_dir,
        no_zim,
        fname,
        languages,
        locale_name,
        title,
        description,
        creator,
        publisher,
        tags,
        keep_build_dir,
        autoplay,
        use_any_optimized_version,
        s3_url_with_credentials,
        playlist,
        subtitles_enough,
        subtitles_setting,
        tmp_dir,
        threads,
    ):
        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # zim params
        self.fname = fname
        self.languages = (
            []
            if languages is None
            else [lang.strip() for lang in languages.split(",")]
        )
        self.tags = [] if tags is None else [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name

        # directory setup
        self.output_dir = pathlib.Path(output_dir).expanduser().resolve()
        if tmp_dir:
            pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)
        self.build_dir = pathlib.Path(tempfile.mkdtemp(dir=tmp_dir))

        # scraper options
        self.topics = (
            []
            if not topics
            else [c.strip().replace(" ", "+") for c in topics.split(",")]
        )
        self.autoplay = autoplay
        self.playlist = playlist
        self.subtitles_enough = subtitles_enough
        self.subtitles_setting = (
            subtitles_setting
            if subtitles_setting in (ALL, MATCHING, NONE)
            else self.to_ted_langcodes(
                [lang.strip() for lang in subtitles_setting.split(",")]
            )
        )
        self.threads = threads
        self.yt_downloader = None

        # optimization cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.s3_storage = None
        self.video_quality = "low" if self.low_quality else "high"

        # debug/developer options
        self.no_zim = no_zim
        self.keep_build_dir = keep_build_dir
        self.debug = debug

        # class members
        self.videos = []
        self.playlist_title = None
        self.playlist_description = None
        self.source_languages = (
            [] if not self.languages else self.to_ted_langcodes(self.languages)
        )
        self.zim_lang = None
        self.already_visited = []

        # set and record locale for translations
        locale_details = get_language_details(locale_name)
        if locale_details["querytype"] != "locale":
            locale_name = locale_details["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "Defaulting to en_US."
            )
            self.locale = setlocale(ROOT_DIR, "en")
        # locale's language code (to_ted_langcodes expects a list of queries)
        self.locale_name = self.to_ted_langcodes([locale_name])

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def ted_videos_json(self):
        return self.build_dir.joinpath("ted_videos.json")

    @property
    def ted_topics_json(self):
        return self.build_dir.joinpath("ted_topics.json")

    @property
    def talks_base_url(self):
        return BASE_URL + "talks"

    @property
    def playlists_base_url(self):
        return BASE_URL + "playlists"

    def append_part1_or_part3(self, lang_code_list, lang_info):
        """Fills in missing ISO language codes for all in the list

        lang_code_list: list of lang codes
        lang_info: see zimscraperlib.i18n"""

        # ignore extra language mappings if the supplied query was an iso-639-1 code
        if "part1" in lang_info["iso_types"]:
            lang_code_list.append(lang_info["iso-639-1"])

        # supplied query was not iso-639-1
        else:
            if lang_info["iso-639-1"]:
                lang_code_list.append(lang_info["iso-639-1"])
                # check for extra language codes to include
                if lang_info["iso-639-1"] in TEDLANGS["mappings"]:
                    for code in TEDLANGS["mappings"][lang_info["iso-639-1"]]:
                        lang_code_list.append(code)
            elif lang_info["iso-639-3"]:
                lang_code_list.append(lang_info["iso-639-3"])
            else:
                supplied_lang = lang_info["query"]
                logger.error(f"Language {supplied_lang} is not supported by TED")

    def to_ted_langcodes(self, languages):
        """Converts language queries into TED language codes

        Examples:
            ["English", "fr", "hin"] => ["en", "fr", "hi"]
            ["chi", "fake"] => ["zh", "zh-cn", "zh-tw"]
        """
        lang_code_list = []
        for lang in languages:
            lang_info = get_language_details(lang, failsafe=True)
            if lang_info:
                if lang_info["querytype"] == "purecode":
                    self.append_part1_or_part3(lang_code_list, lang_info)
                elif lang_info["querytype"] == "locale":
                    query = lang_info["query"].replace("_", "-")
                    if query in TEDLANGS["locales"]:
                        lang_code_list.append(query)
                    else:
                        self.append_part1_or_part3(lang_code_list, lang_info)
                else:
                    self.append_part1_or_part3(lang_code_list, lang_info)
        return list(set(lang_code_list))

    def extract_videos_from_playlist(self, playlist):
        """extracts metadata for all videos in the given playlist

        calls extract_info_from_video_page on all links to get this data
        """
        playlist_url = f"{self.playlists_base_url}/{playlist}"
        logger.debug(f"extract_videos_from_playlist: {playlist_url}")
        soup = BeautifulSoup(download_link(playlist_url).text, features="html.parser")
        video_elements = soup.find_all("a", attrs={"class": "group"})
        self.playlist_title = soup.find("h1").string
        self.playlist_description = soup.find("p", attrs={"class": "text-base"}).string

        for element in video_elements:
            relative_path = element.get("href")
            url = urllib.parse.urljoin(self.talks_base_url, relative_path)
            if self.extract_info_from_video_page(url):
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for the video in {len(other_lang_urls)} other language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {relative_path}")
        logger.debug(f"Total videos found on playlist: {len(video_elements)}")
        if not video_elements:
            raise ValueError("Wrong playlist ID supplied. No videos found")

    def generate_search_result_and_scrape(self, topic_url, total_videos_scraped):
        """generates a search result and returns the total number of videos scraped"""
        page = 1
        while True:
            logger.debug(f"generate_search_result_and_scrape: {topic_url}&page={page}")
            html = download_link(f"{topic_url}&page={page}").text
            nb_videos_extracted, nb_videos_on_page = self.extract_videos_on_topic_page(
                html
            )
            if nb_videos_on_page == 0:
                break
            total_videos_scraped += nb_videos_extracted
            page += 1
        return total_videos_scraped

    def extract_videos_from_topics(self, topic):
        """extracts metadata for the required number of videos on different topics"""
        logger.debug(f"Fetching video links for topic: {topic}")
        topic_url = f"{self.talks_base_url}?topics%5B%5D={topic}"
        total_videos_scraped = 0

        if self.source_languages:
            for lang in self.source_languages:
                topic_url = topic_url + f"&language={lang}"
                total_videos_scraped = self.generate_search_result_and_scrape(
                    topic_url, total_videos_scraped
                )
        else:
            total_videos_scraped = self.generate_search_result_and_scrape(
                topic_url, total_videos_scraped
            )

        logger.info(f"Total video links found in {topic}: {total_videos_scraped}")
        if total_videos_scraped == 0:
            return False
        return True

    def update_zim_metadata(self):
        if not self.languages:
            self.zim_lang = "eng"
        else:
            if len(self.source_languages) > 1:
                self.zim_lang = "mul"
            else:
                lang_info = get_language_details(
                    self.source_languages[0], failsafe=True
                )
                if lang_info:
                    self.zim_lang = lang_info["iso-639-3"]
                else:
                    self.zim_lang = "eng"

        if self.playlist:
            if not self.title:
                self.title = self.playlist_title.strip()
            if not self.description:
                self.description = self.playlist_description.strip()
        else:
            if len(self.topics) > 1:
                if not self.title:
                    self.title = "TED Collection"
                if not self.description:
                    self.description = "A selection of TED videos from several topics"
            else:
                topic_str = self.topics[0].replace("+", " ")
                if not self.title:
                    self.title = f"{topic_str.capitalize()} from TED"
                if not self.description:
                    self.description = f"A selection of {topic_str} videos from TED"

    def get_display_name(self, lang_code, lang_name):
        """Display name for language"""
        lang_info = get_language_details(lang_code, failsafe=True)
        if lang_code != "en" and lang_info:
            return lang_info["native"] + " - " + lang_name
        return lang_name

    def get_subtitle_dict(self, lang):
        """dict of language name and code from a larger dict lang

        Example:
        {
            'languageCode': 'en',
            'languageName': 'English'
        }
        """
        return {
            "languageName": self.get_display_name(
                lang["languageCode"], lang["languageName"]
            ),
            "languageCode": lang["languageCode"],
        }

    def generate_subtitle_list(self, video_id, langs, page_lang, audio_lang):
        """List of all subtitle languages with links to their pages"""
        subtitles = []
        if self.subtitles_setting == ALL or (
            not self.source_languages
            and self.topics
            and self.subtitles_setting != NONE
        ):
            subtitles = [self.get_subtitle_dict(lang) for lang in langs]
        elif self.subtitles_setting == MATCHING or (
            self.subtitles_enough
            and self.subtitles_setting == NONE
            and page_lang != audio_lang
        ):
            subtitles = [
                self.get_subtitle_dict(lang)
                for lang in langs
                if lang["languageCode"] == page_lang
            ]
        elif self.subtitles_setting and self.subtitles_setting != NONE:
            if not self.subtitles_enough and self.topics:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                ]
            else:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                    or lang["languageCode"] in self.source_languages
                ]
        return update_subtitles_list(video_id, subtitles)

    def generate_urls_for_other_languages(self, url):
        """Possible URLs for other requested languages based on a video URL"""
        urls = []
        page_lang, query = self.get_lang_code_from_url(url, with_full_query=True)
        url_parts = list(urllib.parse.urlparse(url))

        # update the language query field value with other languages and form URLs
        for language in self.source_languages:
            if language != page_lang:
                query.update({"language": language})
                url_parts[4] = urllib.parse.urlencode(query)
                urls.append(urllib.parse.urlunparse(url_parts))
        return urls

    def extract_videos_on_topic_page(self, page_html):
        # all videos are embedded in a <div> with the class name 'row'.
        # we are searching for the <div> inside this <div> that has an <a>-tag
        # with the class name 'media__image', because this contains the relative
        # link to the representative TED talk. We turn this relative link into
        # an absolute link and call extract_info_from_video_page for each of them
        soup = BeautifulSoup(page_html, features="html.parser")
        video_links = soup.select("div.row div.media__image a")
        nb_extracted = 0
        nb_listed = len(video_links)
        logger.debug(f"{nb_listed} video(s) found on current page")
        for video_link in video_links:
            url = urllib.parse.urljoin(self.talks_base_url, video_link["href"])
            if self.extract_info_from_video_page(url):
                nb_extracted += 1
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for video in {len(other_lang_urls)} other language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {video_link['href']}")
        return nb_extracted, nb_listed

    def get_lang_code_from_url(self, url, with_full_query=False):
        """gets the queried language code from a TED talk URL"""

        # sample - https://www.ted.com/talks/alex_rosenthal_the_gauntlet_think_like_a_coder_ep_8?language=ja
        url_parts = list(urllib.parse.urlparse(url))

        # explode url to extract the `language` query field value
        query = dict(urllib.parse.parse_qsl(url_parts[4]))
        current_lang = query.get("language")
        if with_full_query:
            return current_lang, query
        return current_lang

    def extract_download_link(self, talk_data):
        """Returns the h264 download link for a TED video, or None"""
        if (
            isinstance(talk_data.get("resources", {}).get("h264"), list)
            and len(talk_data["resources"]["h264"])
            and talk_data["resources"]["h264"][0].get("file")
        ):
            logger.debug(
                "Using h264 resource link for bitrate="
                f"{talk_data['resources']['h264'][0].get('bitrate')}"
            )
            return talk_data["resources"]["h264"][0]["file"]

        logger.error("No download link found for the video")
        return None

    def update_videos_list(
        self,
        video_id,
        lang_code,
        lang_name,
        title,
        description,
        speaker,
        speaker_profession,
        speaker_bio,
        speaker_picture,
        date,
        thumbnail,
        video_link,
        length,
        subtitles,
    ):
        # append to self.videos and return True if not already present
        if not [video for video in self.videos if video.get("id", None) == video_id]:
            self.videos.append(
                {
                    "id": video_id,
                    "languages": [
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(
                                lang_code, lang_name
                            ),
                        }
                    ],
                    "title": [{"lang": lang_code, "text": title}],
                    "description": [{"lang": lang_code, "text": description}],
                    "speaker": speaker,
                    "speaker_profession": speaker_profession,
                    "speaker_bio": speaker_bio,
                    "speaker_picture": speaker_picture,
                    "date": date,
                    "thumbnail": thumbnail,
                    "video_link": video_link,
                    "length": length,
                    "subtitles": subtitles,
                }
            )
            logger.debug(f"Successfully inserted video {video_id} into video list")
            return True

        # update localized meta for video if already in self.videos
        # based on --subtitles=matching
        logger.debug(f"Video {video_id} already present in video list")
        for index, video in enumerate(self.videos):
            if video.get("id", None) == video_id:
                if {"lang": lang_code, "text": title} not in video["title"]:
                    self.videos[index]["title"].append(
                        {"lang": lang_code, "text": title}
                    )
                    self.videos[index]["description"].append(
                        {"lang": lang_code, "text": description}
                    )
                    self.videos[index]["languages"].append(
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(
                                lang_code, lang_name
                            ),
                        }
                    )
                if self.subtitles_setting in (MATCHING, NONE):
                    self.videos[index]["subtitles"] += subtitles
        return False

    def extract_video_info_from_json(self, json_data):
        player_data = json.loads(json_data["playerData"])
        lang_code = json_data["language"]
        lang_name = [
            lang["languageName"]
            for lang in player_data["languages"]
            if lang["languageCode"] == lang_code
        ][-1]
        native_talk_language = player_data["nativeLanguage"]
        if (
            not self.subtitles_enough
            and self.source_languages
            and native_talk_language != lang_code
            and self.topics
        ):
            return False

        # extract the speaker of the TED talk
        if len(json_data["speakers"]):
            speaker_info = json_data["speakers"][0]
            # filter out missing name parts so join() never receives None
            speaker = " ".join(
                filter(
                    None,
                    [
                        speaker_info.get("firstName"),
                        speaker_info.get("middleName"),
                        speaker_info.get("lastName"),
                    ],
                )
            )
        else:
            # fallback keys aligned with the lookups below
            speaker_info = {
                "description": "None",
                "whoTheyAre": "None",
                "avatar": "",
            }
            if "presenterDisplayName" in json_data:
                speaker = json_data["presenterDisplayName"]
            else:
                speaker = "None"

        # extract the TED talk details from the JSON
        video_id = json_data["id"]
        speaker_profession = speaker_info["description"]
        speaker_bio = speaker_info.get("whoTheyAre", "-")
        speaker_picture = speaker_info.get("avatar", "-")
        title = json_data["title"]
        description = json_data["description"]
        date = dateutil.parser.parse(json_data["recordedOn"]).strftime("%d %B %Y")
        length = int(json_data["duration"]) // 60
        thumbnail = player_data["thumb"]
        video_link = self.extract_download_link(player_data)
        if not video_link:
            logger.error("No suitable download link found. Skipping video")
            return False

        langs = player_data["languages"]
        subtitles = self.generate_subtitle_list(
            video_id, langs, lang_code, native_talk_language
        )
        return self.update_videos_list(
            video_id=video_id,
            lang_code=lang_code,
            lang_name=lang_name,
            title=title,
            description=description,
            speaker=speaker,
            speaker_profession=speaker_profession,
            speaker_bio=speaker_bio,
            speaker_picture=speaker_picture,
            date=date,
            thumbnail=thumbnail,
            video_link=video_link,
            length=length,
            subtitles=subtitles,
        )

    def extract_info_from_video_page(self, url, retry_count=0):
        """extract all info from a TED video page url and update self.videos"""

        # Every TED video page has a <script>-tag with a JavaScript object
        # carrying JSON in it. We just strip away the object signature and
        # load the JSON to extract the metadata out of it.
        # returns True if a new video was successfully scraped

        # don't scrape if URL already visited
        if urllib.parse.urlparse(url)[2] in self.already_visited:
            return False

        # don't scrape if the maximum retry count is reached
        if retry_count > 5:
            logger.error("Max retries exceeded. Skipping video")
            return False

        logger.debug(f"extract_info_from_video_page: {url}")
        soup = BeautifulSoup(download_link(url).text, features="html.parser")
        json_data = json.loads(
            soup.find("script", attrs={"id": "__NEXT_DATA__"}).string
        )["props"]["pageProps"]["videoData"]

        requested_lang_code = self.get_lang_code_from_url(url)
        if requested_lang_code and json_data["language"] != requested_lang_code:
            logger.error(
                f"Video has not yet been translated into {requested_lang_code}"
            )
            return False
        return self.extract_video_info_from_json(json_data)

    def add_default_language(self):
        """add metadata in the default language (English or first available) on all videos"""
        for video in self.videos:
            en_found = False
            for index, lang in enumerate(video["languages"]):
                if lang["languageCode"] == "en":
                    en_found = True
                    video["title"] = [
                        {"lang": "default", "text": video["title"][index]["text"]}
                    ] + video["title"]
                    video["description"] = [
                        {
                            "lang": "default",
                            "text": video["description"][index]["text"],
                        }
                    ] + video["description"]
            if not en_found:
                video["title"] = [
                    {"lang": "default", "text": video["title"][0]["text"]}
                ] + video["title"]
                video["description"] = [
                    {"lang": "default", "text": video["description"][0]["text"]}
                ] + video["description"]

            # update video slug
            video["slug"] = slugify(video["title"][0]["text"], separator="-")

    def render_video_pages(self):
        # render static html pages from the scraped video data and
        # save each page in build_dir/<video-slug>
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        for video in self.videos:
            titles = video["title"]
            html = env.get_template("article.html").render(
                speaker=video["speaker"],
                languages=video["subtitles"],
                speaker_bio=video["speaker_bio"].replace("Full bio", ""),
                speaker_img=video["speaker_picture"],
                date=video["date"],
                profession=video["speaker_profession"],
                video_format=self.video_format,
                autoplay=self.autoplay,
                video_id=str(video["id"]),
                title=get_main_title(titles, self.locale_name),
                titles=titles,
                descriptions=video["description"],
                back_to_list=_("Back to the list"),
            )
            html_path = self.build_dir.joinpath(video["slug"])
            with open(html_path, "w", encoding="utf-8") as html_page:
                html_page.write(html)

    def render_home_page(self):
        # render the homepage
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        all_langs = {
            language["languageCode"]: language["languageName"]
            for video in self.videos
            for language in video["subtitles"] + video["languages"]
        }
        languages = [
            {"languageName": value, "languageCode": key}
            for key, value in all_langs.items()
        ]
        languages = sorted(languages, key=lambda x: x["languageName"])
        html = env.get_template("home.html").render(
            languages=languages,
            page_title=_("TED Talks"),
            language_filter_text=_("Filter by language"),
            back_to_top=_("Back to the top"),
            pagination_text=_("Page"),
        )
        home_page_path = self.build_dir.joinpath("index")
        with open(home_page_path, "w", encoding="utf-8") as html_page:
            html_page.write(html)

    def copy_files_to_build_directory(self):
        # copy files from templates_dir to build_dir
        assets_dir = self.templates_dir.joinpath("assets")
        if assets_dir.exists():
            shutil.copytree(
                assets_dir, self.build_dir.joinpath("assets"), dirs_exist_ok=True
            )
        shutil.copy(
            self.templates_dir.joinpath("favicon.png"),
            self.build_dir.joinpath("favicon.png"),
        )

    def generate_datafile(self):
        """Generate data.js inside the assets folder"""
        video_list = []
        for video in self.videos:
            lang_codes = [lang["languageCode"] for lang in video["subtitles"]] + [
                lang["languageCode"] for lang in video["languages"]
            ]
            json_data = {
                "languages": list(set(lang_codes)),
                "id": video["id"],
                "description": video["description"],
                "title": video["title"],
                "speaker": video["speaker"],
                "slug": video["slug"],
            }
            video_list.append(json_data)
        assets_path = self.build_dir.joinpath("assets")
        if not assets_path.exists():
            assets_path.mkdir(parents=True)
        with open(assets_path.joinpath("data.js"), "w", encoding="utf-8") as data_file:
            data_file.write("json_data = " + json.dumps(video_list, indent=4))

    def download_jpeg_image_and_convert(self, url, fpath, preset_options=None, resize=None):
        """downloads a JPEG image, then converts and optimizes it into the
        desired format detected from fpath"""
        preset_options = preset_options or {}  # avoid a mutable default argument
        org_jpeg_path = pathlib.Path(
            tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
        )
        save_large_file(url, org_jpeg_path)
        if resize is not None:
            resize_image(
                org_jpeg_path,
                width=resize[0],
                height=resize[1],
                method="cover",
            )
        optimize_image(
            org_jpeg_path, fpath, convert=True, delete_src=True, **preset_options
        )
        logger.debug(f"Converted {org_jpeg_path} to {fpath} and optimized")

    def download_speaker_image(self, video_id, video_title, video_speaker, speaker_path):
        """downloads the speaker image"""
        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"speaker_image/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, speaker_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download an image of the speaker
                if not video_speaker:
                    logger.debug("Speaker doesn't have an image")
                else:
                    logger.debug(f"Downloading speaker image for {video_title}")
                    self.download_jpeg_image_and_convert(
                        video_speaker, speaker_path, preset_options=preset.options
                    )
            except Exception:
                logger.error(f"Could not download speaker image for {video_title}")
            else:
                if self.s3_storage and video_speaker:
                    self.upload_to_cache(s3_key, speaker_path, preset.VERSION)

    def download_thumbnail(self, video_id, video_title, video_thumbnail, thumbnail_path):
        """download the thumbnail"""
        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"thumbnail/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, thumbnail_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download the thumbnail of the video
                logger.debug(f"Downloading thumbnail for {video_title}")
                self.download_jpeg_image_and_convert(
                    video_thumbnail,
                    thumbnail_path,
                    preset_options=preset.options,
                    resize=(248, 187),
                )
            except Exception:
                logger.error(f"Could not download thumbnail for {video_title}")
            else:
                if self.s3_storage:
                    self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)

    def download_video_files(self, video):
        """download all video files (video, thumbnail, speaker)"""

        # Download each TED talk video and its related images.
        # Save the video in videos_dir/{video id}/video.<format>.
        # Save the thumbnail in videos_dir/{video id}/thumbnail.webp.
        # Save the image of the speaker in videos_dir/{video id}/speaker.webp.

        # set up variables
        video_id = str(video["id"])
        # take the English version of the title, or else whatever language it's available in
        video_title = video["title"][0]["text"]
        video_link = video["video_link"]
        video_speaker = video["speaker_picture"]
        video_thumbnail = video["thumbnail"]
        video_dir = self.videos_dir.joinpath(video_id)
        org_video_file_path = video_dir.joinpath("video.mp4")
        req_video_file_path = video_dir.joinpath(f"video.{self.video_format}")
        speaker_path = video_dir.joinpath("speaker.webp")
        thumbnail_path = video_dir.joinpath("thumbnail.webp")

        # ensure that the video directory exists
        if not video_dir.exists():
            video_dir.mkdir(parents=True)

        # set preset
        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()

        # download video
        downloaded_from_cache = False
        logger.debug(f"Downloading {video_title}")
        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, req_video_file_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                if "https://" not in video_link:
                    options = (
                        BestWebm if self.video_format == "webm" else BestMp4
                    ).get_options(
                        target_dir=video_dir, filepath=pathlib.Path("video.%(ext)s")
                    )
                    self.yt_downloader.download(video_link, options)
                else:
                    save_large_file(video_link, org_video_file_path)
            except Exception:
                logger.error(f"Could not download {org_video_file_path}")

        # download speaker and thumbnail images
        self.download_speaker_image(video_id, video_title, video_speaker, speaker_path)
        self.download_thumbnail(video_id, video_title, video_thumbnail, thumbnail_path)

        # recompress if necessary
        try:
            if not downloaded_from_cache:
                post_process_video(
                    video_dir,
                    video_id,
                    preset,
                    self.video_format,
                    self.low_quality,
                )
        except Exception as e:
            logger.error(f"Failed to post-process video {video_id}")
            logger.debug(e)
        else:
            # upload to cache only if the recompress was successful
            if self.s3_storage and not downloaded_from_cache:
                self.upload_to_cache(s3_key, req_video_file_path, preset.VERSION)

    def download_video_files_parallel(self):
        """download videos and images in parallel"""
        self.yt_downloader = YoutubeDownloader(threads=1)
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_video_files, video)
                for video in self.videos
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
        self.yt_downloader.shutdown()

    def download_subtitles(self, index, video):
        """download, convert and write VTT subtitles for a video
        at a specific index in self.videos"""

        # download the subtitle files, generate a WebVTT file and save the
        # subtitles in videos_dir/{video id}/subs/subs_{language code}.vtt
        if not video["subtitles"]:
            return
        video_dir = self.videos_dir.joinpath(video["id"])
        subs_dir = video_dir.joinpath("subs")
        if not subs_dir.exists():
            subs_dir.mkdir(parents=True)
        else:
            logger.debug("Subs dir already exists")

        # download subtitles
        logger.debug(f"Downloading subtitles for {video['title'][0]['text']}")
        valid_subs = []
        for subtitle in video["subtitles"]:
            time.sleep(0.5)  # throttling
            vtt_subtitle = WebVTT(subtitle["link"]).convert()
            if not vtt_subtitle:
                logger.error(
                    f"Subtitle file for {subtitle['languageCode']} could not be created"
                )
                continue
            valid_subs.append(subtitle)
            vtt_path = subs_dir.joinpath(f"subs_{subtitle['languageCode']}.vtt")
            with open(vtt_path, "w", encoding="utf-8") as sub_file:
                sub_file.write(vtt_subtitle)
        self.videos[index]["subtitles"] = valid_subs

    def download_subtitles_parallel(self):
        """download subtitles for all videos in parallel"""
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_subtitles, index, video)
                for index, video in enumerate(self.videos)
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)

    def s3_credentials_ok(self):
        logger.info("Testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True, failsafe=True
        ):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_from_cache(self, key, object_path, encoder_version):
        """whether it successfully downloaded from the S3 cache"""
        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key, self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                key, tag="encoder_version", value=f"v{encoder_version}"
            ):
                return False
        object_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, object_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {object_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, object_path, encoder_version):
        """whether it successfully uploaded to the S3 cache"""
        try:
            self.s3_storage.upload_file(
                object_path, key, meta={"encoder_version": f"v{encoder_version}"}
            )
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {object_path} to cache at {key}")
        return True

    def remove_failed_topics_and_check_extraction(self, failed_topics):
        """removes failed topics from the topics list

        raises an error if the scraper cannot continue"""
        for topic in failed_topics:
            self.topics.remove(topic)
        if not self.topics:
            if self.source_languages:
                raise ValueError(
                    "No videos found for any topic in the language(s) requested. "
                    "Check the topic(s) and/or language code(s) supplied to --languages"
                )
            raise ValueError("Wrong topic(s) were supplied. No videos found")

    def run(self):
        logger.info(
            f"Starting scraper with:\n"
            f"  langs: {', '.join(self.source_languages)}\n"
            f"  subtitles: "
            f"{', '.join(self.subtitles_setting) if isinstance(self.subtitles_setting, list) else self.subtitles_setting}\n"
            f"  video format: {self.video_format}"
        )

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
        if self.s3_storage:
            logger.info(
                f"Using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )

        # playlist mode requested
        if self.playlist:
            self.extract_videos_from_playlist(self.playlist)
        # topic(s) mode requested
        else:
            failed = []
            for topic in self.topics:
                if not self.extract_videos_from_topics(topic):
                    failed.append(topic)
                else:
                    logger.debug(f"Successfully scraped {topic}")
            self.remove_failed_topics_and_check_extraction(failed)

        self.add_default_language()
        self.update_zim_metadata()
        self.download_video_files_parallel()
        self.download_subtitles_parallel()
        self.render_home_page()
        self.render_video_pages()
        self.copy_files_to_build_directory()
        self.generate_datafile()

        # zim creation and cleanup
        if not self.no_zim:
            self.fname = (
                self.fname or f"{self.name.replace(' ', '-')}_{{period}}.zim"
            ).format(period=datetime.datetime.now().strftime("%Y-%m"))
            logger.info("building ZIM file")
            if not self.output_dir.exists():
                self.output_dir.mkdir(parents=True)
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir.joinpath(self.fname),
                name=self.name,
                main_page="index",
                favicon="favicon.png",
                title=self.title,
                description=self.description,
                language=self.zim_lang,
                creator=self.creator,
                publisher=self.publisher,
                tags=self.tags + ["_category:ted", "ted", "_videos:yes"],
                scraper=SCRAPER,
            )
            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("Done with everything!")
def download_image_s3(self):
    self.logger.info("Starting S3 image download")

    # add credentials to URL
    url = urllib.parse.urlparse(self.task["download_uri"])
    qs = urllib.parse.parse_qs(url.query)
    qs["keyId"] = Setting.s3_access_key
    qs["secretAccessKey"] = Setting.s3_secret_key

    # setup download logging
    downloader_log = io.StringIO()
    downloader_logger = logging.getLogger("downloader_log")
    downloader_logger.propagate = True
    downloader_logger.setLevel(logging.DEBUG)
    downloader_logger.addHandler(logging.StreamHandler(stream=downloader_log))

    # init and test storage
    downloader_logger.info("initializing S3")
    s3_storage = KiwixStorage(
        urllib.parse.SplitResult(
            "https",
            url.netloc,
            url.path,
            urllib.parse.urlencode(qs, doseq=True),
            url.fragment,
        ).geturl()
    )
    downloader_logger.debug(
        f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}"
    )

    # download
    downloader_logger.info(f"Downloading {self.img_path.name}")
    try:
        hook = ImageTransferHook(
            output=downloader_log,
            size=s3_storage.get_object_stat(key=self.img_path.name).size,
            name=self.img_path.name,
        )
        s3_storage.download_file(
            key=self.img_path.name, fpath=str(self.img_path), Callback=hook
        )
        downloaded = True
    except Exception as exc:
        downloaded = False
        downloader_logger.error(f"downloader failed: {exc}")
        downloader_logger.exception(exc)
    else:
        downloader_logger.info("downloader ran successfully.")

    if downloaded:
        # image downloaded, mark it for autodeletion
        try:
            autodelete_on = datetime.datetime.now() + datetime.timedelta(days=1)
            downloader_logger.info(f"Setting autodelete to {autodelete_on}")
            s3_storage.set_object_autodelete_on(
                key=self.img_path.name, on=autodelete_on
            )
        except Exception as exc:
            downloader_logger.error(
                "Failed to set autodelete (normal if before bucket retention)"
            )
            downloader_logger.exception(exc)

    self.logger.info("collecting downloader log")
    try:
        self.logs["downloader_log"] = downloader_log.getvalue()
        downloader_log.close()
    except Exception as exc:
        self.logger.error(f"Failed to collect logs: {exc}")

    if not downloaded:
        raise subprocess.SubprocessError("S3 download failed")
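# ---------------------------------------------------------------------------
# Illustration (editor's sketch, not part of the worker): the in-memory
# log-capture pattern used by download_image_s3 above, reduced to the stdlib.
# A StringIO-backed StreamHandler lets a task ship its own log text elsewhere
# (here, the worker stores it in self.logs["downloader_log"]). The function
# and logger names are hypothetical.
# ---------------------------------------------------------------------------
import io
import logging


def capture_logs():
    """Run some logged work and return everything it logged as a string."""
    buffer = io.StringIO()
    log = logging.getLogger("demo_capture")
    log.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(stream=buffer)
    log.addHandler(handler)
    try:
        log.info("doing work…")
        log.error("something failed")
    finally:
        log.removeHandler(handler)  # avoid leaking the handler between runs
    return buffer.getvalue()


if __name__ == "__main__":
    print(capture_logs())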