def build_video_nodes(self, base_path, content):
    """Yield a YouTube resource node for every single-video URL in *content*.

    This is a generator: nothing runs until the caller iterates it. URLs are
    collected with ``self.get_videos_urls(content)``; each one that
    ``YouTubeResource`` recognises as a YouTube URL and not as a channel is
    wrapped in a ``YouTubeResourceNode``, has ``download()`` called on it, and
    is then yielded.

    Args:
        base_path: Accepted for interface compatibility but currently
            ignored; the download directory is always rebuilt from
            ``DATA_DIR`` (see the NOTE below).
        content: Parsed HTML node handed to ``get_videos_urls``; may be
            ``None``, in which case nothing is yielded.

    Yields:
        ``YouTubeResourceNode`` instances, after ``download()`` was invoked.
    """
    video_urls = self.get_videos_urls(content)
    # NOTE(review): the caller-supplied base_path is deliberately left
    # overwritten here to preserve existing behavior -- confirm whether the
    # argument was ever meant to be honored.
    base_path = build_path([DATA_DIR])
    for video_url in video_urls:
        # Only single videos are handled here; channel URLs are skipped.
        if not YouTubeResource.is_youtube(video_url):
            continue
        if YouTubeResource.is_channel(video_url):
            continue
        video = YouTubeResourceNode(video_url, lang=self.lang)
        video.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
        yield video
def get_videos_urls(self, content):
    """Collect YouTube video URLs referenced by links and iframes in *content*.

    Two sources are scanned:

    * ``<a>`` tags whose ``href`` contains ``"youtube"`` or ``"youtu.be"``,
      or whose text (case-insensitively) equals ``"youtube"``. The ``href``
      is added as-is; anchors without an ``href`` are skipped.
    * ``<iframe>`` tags whose ``src`` is a YouTube, non-channel URL according
      to ``YouTubeResource``; the ``src`` is passed through
      ``YouTubeResource.transform_embed`` before being added. Iframes with
      no ``src`` are skipped.

    Args:
        content: Parsed HTML node exposing ``find_all`` (looks like a
            BeautifulSoup tag -- confirm against callers), or ``None``.

    Returns:
        A ``set`` of URL strings; empty when *content* is ``None``.
    """
    urls = set()
    if content is None:
        return urls

    def is_video_anchor(tag):
        # The anchor check must guard *all* alternatives. The previous
        # one-line ``a and b or c or d`` expression only guarded the first,
        # so non-anchor tags could match and contribute empty URLs.
        if tag.name != "a":
            return False
        href = tag.attrs.get("href", "")
        return (
            "youtube" in href
            or "youtu.be" in href
            or tag.text.lower() == "youtube"
        )

    for anchor in content.find_all(is_video_anchor):
        href = anchor.get("href", "")
        # An anchor matched only by its text may have no href at all.
        if href:
            urls.add(href)

    for iframe in content.find_all("iframe"):
        # Iframes without a src (e.g. srcdoc or lazy-loaded) are skipped
        # instead of raising KeyError and aborting the whole scan.
        src = iframe.get("src")
        if not src:
            continue
        if YouTubeResource.is_youtube(src) and not YouTubeResource.is_channel(src):
            urls.add(YouTubeResource.transform_embed(src))
    return urls