def fetch_video(video):
    """Create a video node for the YouTube video described by `video`.

    `video` is a mapping from which the keys 'id', 'title', 'description',
    'webpage_url' and 'subtitles' are read; 'subtitles' must expose
    `.keys()` yielding subtitle language codes.

    A YouTube subtitle file is attached for every language code accepted by
    `languages.getlang` or `languages.getlang_by_alpha2`; for any other code
    a warning is printed and the code is skipped.

    Returns the populated video node.
    """
    youtube_id = video['id']
    title = video['title']
    description = video['description']
    youtube_url = video['webpage_url']
    subtitle_languages = video['subtitles'].keys()

    print(" Fetching video data: %s (%s)" % (title, youtube_url))

    node_title = truncate_metadata(title)
    node_description = truncate_description(description)
    video_node = nodes.VideoNode(
        source_id=youtube_id,
        title=node_title,
        license=LICENSE,
        description=node_description,
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=youtube_id)],
    )

    # Add subtitles in whichever languages are available.
    for lang_code in subtitle_languages:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        is_known = (languages.getlang(lang_code)
                    or languages.getlang_by_alpha2(lang_code))
        if not is_known:
            print("WARNING: Subtitle language %s not found in languages file"
                  % lang_code)
            continue
        subtitle_file = files.YouTubeSubtitleFile(
            youtube_id=youtube_id, language=lang_code)
        video_node.add_file(subtitle_file)

    return video_node
def add_files(node, file_list):
    """Attach a file object to `node` for every entry in `file_list`.

    Each entry is a mapping. Its optional 'path', 'youtube_id', 'web_url'
    and 'encoding' values are passed to `guess_file_type` together with
    `node.kind`, and the guessed type selects which `files.*` class is
    instantiated. A YouTube video entry additionally gets an 'en' YouTube
    subtitle file.

    Raises:
        UnknownFileTypeError: if the guessed type matches none of the
            `FileTypes` members handled here.
        KeyError: if an entry lacks a key that its file type requires
            (for example 'language' for a subtitle file).
    """
    for f in file_list:
        path = f.get('path')
        # NEW: expand content:// --> ./content/ in file paths
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(
                path=abspath,
                language=f.get('language'),
                ffmpeg_settings=f.get('ffmpeg_settings'),
            ))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(
                web_url=f['web_url'],
                high_resolution=f.get('high_resolution'),
            ))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(
                youtube_id=f['youtube_id'],
                high_resolution=f.get('high_resolution'),
            ))
            node.add_file(files.YouTubeSubtitleFile(
                youtube_id=f['youtube_id'], language='en'))
        else:
            # Use the already-fetched (possibly None) path: indexing f['path']
            # here would raise KeyError for an entry without a path and hide
            # the real problem from the caller.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def fetch_video(video):
    """Create a video node for the YouTube video described by `video`.

    Reads the keys 'id', 'title', 'description', 'webpage_url' and
    'subtitles' from `video`. For every subtitle language code for which
    `getlang_patched` returns a truthy value, a
    `LanguagePatchedYouTubeSubtitleFile` is attached to the node; other
    codes are skipped without a message.

    Returns the populated video node.
    """
    youtube_id = video['id']
    title = video['title']
    description = video['description']
    youtube_url = video['webpage_url']
    subtitle_languages = video['subtitles'].keys()

    print(" Fetching video data: %s (%s)" % (title, youtube_url))

    main_file = files.YouTubeVideoFile(youtube_id=youtube_id)
    video_node = nodes.VideoNode(
        source_id=youtube_id,
        title=truncate_metadata(title),
        license=LICENSE,
        description=truncate_description(description),
        derive_thumbnail=True,
        language="en",
        files=[main_file],
    )

    # Add subtitles in whichever languages are available.
    # `filter` is lazy, so each code is checked right before its file is added.
    for youtube_language in filter(getlang_patched, subtitle_languages):
        subtitle_file = LanguagePatchedYouTubeSubtitleFile(
            youtube_id=youtube_id,
            youtube_language=youtube_language,
        )
        video_node.add_file(subtitle_file)

    return video_node
def download_video_topics(topic_node,
                          playlist_item,
                          lang_obj,
                          use_cache=True,
                          to_sheet=False):
    """
    Scrape, collect, and download the videos from playlist.

    Builds a `RefugeeResponsePlaylist` from `playlist_item` and `use_cache`,
    then adds one video node to `topic_node` for every playlist child whose
    id is a key of `VIDEO_DESCRIPTION_MAP`; all other children are excluded.
    `lang_obj.name` is used in the node's source id and `lang_obj.code` as
    the node and file language.

    A failure while creating or attaching a single video node is logged and
    does not stop the remaining videos. `to_sheet` is accepted but not used
    by this function.
    """
    playlist_obj = RefugeeResponsePlaylist(playlist_item, use_cache)
    playlist_info = playlist_obj.get_playlist_info()

    # A playlist without a 'children' value is treated as empty rather than
    # failing with a TypeError when iterating None.
    for video in playlist_info.get('children') or []:
        video_id = video['id']
        if video_id not in VIDEO_DESCRIPTION_MAP:
            # Exclude videos
            continue

        video_description = VIDEO_DESCRIPTION_MAP[video_id]
        video_source_id = 'refugee-response-{0}-{1}'.format(
            lang_obj.name, video_id)
        LOGGER.info("Video Description: '%s'", video_description)

        try:
            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video['title'],
                description=video_description,
                author=REFUGEE_RESPONSE,
                language=lang_obj.code,
                provider=REFUGEE_RESPONSE,
                thumbnail=video['thumbnail'],
                license=licenses.get_license(
                    "CC BY-NC-ND", copyright_holder=REFUGEE_RESPONSE),
                files=[
                    files.YouTubeVideoFile(youtube_id=video_id,
                                           language=lang_obj.code)
                ])
            topic_node.add_child(video_node)
        except Exception as e:
            # Deliberately best-effort: one bad video must not abort the
            # whole playlist.
            LOGGER.error('Error downloading this video: %s', e)
def add_files(node, file_list):
    """Build the file object described by each entry of `file_list` and add it to `node`.

    Every entry is a mapping whose 'file_type' value must be one of
    VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE or
    SUBTITLES_FILE. Within a type, the keys present in the entry
    ('youtube_id', 'web_url', 'encoding') select the concrete `files.*`
    class.

    Raises:
        NotImplementedError: if an entry's 'file_type' is not one of the
            expected types (the offending value is logged first).
        KeyError: if an entry lacks a key that its branch indexes directly
            ('path' for audio and plain video files, 'language' for
            subtitles).
    """
    supported_types = (
        VIDEO_FILE,
        AUDIO_FILE,
        DOCUMENT_FILE,
        HTML5_FILE,
        THUMBNAIL_FILE,
        SUBTITLES_FILE,
    )

    for entry in file_list:
        file_type = entry.get('file_type')
        if file_type not in supported_types:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        # path can be an URL or a local path (or None)
        path = entry.get('path')

        # Each branch only builds the file object; it is attached once below.
        if file_type == VIDEO_FILE:
            # Three kinds of video entries: YouTube id, web URL, plain path.
            if 'youtube_id' in entry:
                new_file = files.YouTubeVideoFile(
                    youtube_id=entry['youtube_id'],
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=entry.get('language'),
                )
            elif 'web_url' in entry:
                new_file = files.WebVideoFile(
                    web_url=entry['web_url'],
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=entry.get('language'),
                )
            else:
                new_file = files.VideoFile(
                    path=entry['path'],
                    language=entry.get('language'),
                    ffmpeg_settings=entry.get('ffmpeg_settings'),
                )
        elif file_type == AUDIO_FILE:
            new_file = files.AudioFile(
                path=entry['path'], language=entry.get('language'))
        elif file_type == DOCUMENT_FILE:
            new_file = files.DocumentFile(
                path=path, language=entry.get('language'))
        elif file_type == HTML5_FILE:
            new_file = files.HTMLZipFile(
                path=path, language=entry.get('language'))
        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in entry:
                new_file = files.Base64ImageFile(encoding=entry['encoding'])
            else:
                new_file = files.ThumbnailFile(
                    path=path, language=entry.get('language'))
        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in entry:
                new_file = files.YouTubeSubtitleFile(
                    youtube_id=entry['youtube_id'],
                    language=entry['language'])
            else:
                new_file = files.SubtitleFile(
                    path=path, language=entry['language'])
        else:
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(entry['path']))

        node.add_file(new_file)
def add_video_nodes_from_playlist(
        self,
        youtube_client,
        playlist_id,
        subtitle_languages=SUBTITLE_LANGUAGES,
        copyright_holder=COPYRIGHT_HOLDER,
        only_creative_commons=ONLY_CREATIVE_COMMONS):
    """Add a video node child to `self` for the videos of one YouTube playlist.

    The playlist is read page by page (at most 50 items per request) through
    `youtube_client`, following 'nextPageToken' until the response carries
    none. A video id already added during this call is skipped.

    Args:
        youtube_client: client object exposing `playlistItems().list(...)`
            and `videos().list(...)`, each returning an object with
            `.execute()`.
        playlist_id: id of the playlist to read.
        subtitle_languages: language codes for which a YouTube subtitle file
            is attached when
            `files.is_youtube_subtitle_file_supported_language` accepts the
            code; other codes are reported with a printed message.
        copyright_holder: passed to `get_license` for every video.
        only_creative_commons: when truthy, videos whose status license is
            not 'creativeCommon' are reported and skipped; otherwise they
            are added with `NON_CREATIVE_COMMONS_LICENSE_DEFAULT`.
    """
    playlist_request_kwargs = {
        'part': 'contentDetails',
        'maxResults': 50,
        'playlistId': playlist_id,
    }

    # Apparently the same video is in one of the playlists twice!
    # This is used to keep track of videos that have already been added.
    videos_added = {}

    while True:
        playlist_info = youtube_client.playlistItems().list(
            **playlist_request_kwargs).execute()
        video_ids = [
            item['contentDetails']['videoId']
            for item in playlist_info['items']
        ]
        videos = youtube_client.videos().list(
            part='status,snippet',
            id=','.join(video_ids)).execute()['items']

        for video in videos:
            video_id = video['id']
            if video_id in videos_added:
                continue

            if (only_creative_commons
                    and video['status']['license'] != 'creativeCommon'):
                print(
                    "The video '{}' is not licensed as Creative Commons... it is licensed as {}"
                    .format(video['snippet']['title'],
                            video['status']['license']))
                continue

            if video['status']['license'] == 'creativeCommon':
                video_license = licenses.CC_BY
            else:
                video_license = NON_CREATIVE_COMMONS_LICENSE_DEFAULT

            video_node = nodes.VideoNode(
                source_id=video_id,
                title=video['snippet']['title'],
                language=CHANNEL_LANGUAGE,
                license=get_license(
                    video_license, copyright_holder=copyright_holder),
                thumbnail=get_largest_thumbnail(
                    video['snippet']['thumbnails']).get('url'),
                files=[
                    files.YouTubeVideoFile(video_id),
                ])

            # Get subtitles for the requested languages.
            for lang_code in subtitle_languages:
                if files.is_youtube_subtitle_file_supported_language(
                        lang_code):
                    video_node.add_file(
                        files.YouTubeSubtitleFile(
                            youtube_id=video_id, language=lang_code))
                else:
                    print('Unsupported subtitle language code:', lang_code)

            self.add_child(video_node)
            videos_added[video_id] = video_node

        # Set up the next page, if there is one; otherwise we are done.
        next_page_token = playlist_info.get('nextPageToken')
        if not next_page_token:
            break
        playlist_request_kwargs['pageToken'] = next_page_token
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    One topic node is created per playlist returned for `YOUTUBE_CHANNEL_ID`
    (a single request with maxResults=50). Each playlist is then read page
    by page, and every video whose status license is 'creativeCommon'
    becomes a video node under that topic, with a YouTube subtitle file for
    each supported code in `SUBTITLE_LANGUAGES`.

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
        The key '--youtube-api-token' is required and is used as the
        developer key of the YouTube Data API client.
    Returns: ChannelNode
    Raises: KeyError if kwargs has no '--youtube-api-token' entry.
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    from apiclient.discovery import build
    # instantiate a YouTube Data API v3 client
    youtube = build('youtube', 'v3',
                    developerKey=kwargs['--youtube-api-token'])
    playlists = youtube.playlists().list(  # list all of the YouTube channel's playlists
        part='snippet',
        channelId=YOUTUBE_CHANNEL_ID,
        maxResults=50
    ).execute()['items']

    # For getting the thumbnail automatically
    # youtube_channel = youtube.channels().list(
    #     id=YOUTUBE_CHANNEL_ID,
    #     part='snippet'
    # ).execute()['items'][0]
    # channel.thumbnail = get_largest_thumbnail(youtube_channel['snippet']['thumbnails']).get('url')

    for playlist in playlists:
        topic = nodes.TopicNode(title=playlist['snippet']['title'],
                                source_id=playlist['id'])
        playlist_request_kwargs = {
            'part': 'contentDetails',
            'maxResults': 50,
            'playlistId': playlist['id'],
        }

        while True:
            playlist_info = youtube.playlistItems().list(
                **playlist_request_kwargs).execute()
            video_ids = [
                item['contentDetails']['videoId']
                for item in playlist_info['items']
            ]
            videos = youtube.videos().list(
                part='status,snippet',
                id=','.join(video_ids)
            ).execute()['items']

            for video in videos:
                if video['status']['license'] != 'creativeCommon':
                    continue

                video_node = nodes.VideoNode(
                    source_id=video['id'],
                    title=video['snippet']['title'],
                    language=CHANNEL_LANGUAGE,
                    license=get_license(licenses.CC_BY,
                                        copyright_holder='Espresso English'),
                    thumbnail=get_largest_thumbnail(
                        video['snippet']['thumbnails']).get('url'),
                    files=[
                        files.YouTubeVideoFile(video['id']),
                    ]
                )
                topic.add_child(video_node)

                # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                for lang_code in SUBTITLE_LANGUAGES:
                    if files.is_youtube_subtitle_file_supported_language(lang_code):
                        video_node.add_file(
                            files.YouTubeSubtitleFile(
                                youtube_id=video['id'],
                                language=lang_code
                            )
                        )
                    else:
                        print('Unsupported subtitle language code:', lang_code)

            # Set up the next page, if there is one; otherwise this playlist
            # is finished.
            next_page_token = playlist_info.get('nextPageToken')
            if not next_page_token:
                break
            playlist_request_kwargs['pageToken'] = next_page_token

        channel.add_child(topic)

    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
def download_content_node(category_node,
                          url,
                          title,
                          thumbnail=None,
                          description=None):
    """Package the page at `url` as an HTML5 app node under `category_node`.

    The page is parsed, its static assets are downloaded into a fresh
    temporary directory, a fixed list of page elements is removed, and the
    result is written there as index.html and zipped with
    `create_predictable_zip`.

    If the page contains an element matching '.embedded-video iframe', its
    metadata is fetched with `ydl.extract_info(..., download=False)` and a
    video node is added to `category_node` before the HTML5 app node.

    Args:
        category_node: node that receives the new child node(s).
        url: page to download; also used as the app node's source id.
        title: title of the app node (passed through `truncate_metadata`).
        thumbnail: optional thumbnail URL, downloaded into the temporary
            directory and referenced by local path.
        description: optional description of the app node.
    """
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc,
                                 destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    selectors_to_remove = (
        '#header',
        '.subMenuBarContainer',
        '.breadbookmarkcontainer',
        '.resourcePageTypeTitle',
        '.sharethis-wrapper',
        '.ccBlock',
        '#block-views-resource-info-block-block-1',
        # NOTE(review): this selector was listed twice in the original; the
        # second call is kept in case remove_node only removes one match.
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block',
        '.productSuggestionContainer',
        'footer',
        # For minilessons
        '.field-name-field-minilesson-downloadables',
        # For writing assessments
        '.assessmentTGLink',
        '.assessmentModelRubrics',
        '.view-display-id-attachment_1',
    )
    for selector in selectors_to_remove:
        remove_node(doc, selector)

    # Write out the HTML source. The encoding is explicit so that non-ASCII
    # page content does not depend on the platform's default encoding.
    index_path = os.path.join(destination, "index.html")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail,
                      destination,
                      request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(
            " ... and with video titled %s from www.youtube.com/watch?v=%s"
            % (video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(video_title),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
    category_node.add_child(app_node)
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    One topic node is created per playlist id in `PLAYLIST_MAP`, and one
    video node per distinct video of that playlist. A thumbnail whose image
    format is not JPG/JPEG/PNG is converted to a 400x225 JPEG stored under
    chefdata/thumbnails and that local file is used instead of the URL.

    Args:
      - args: arguments passed in on the command line
      - kwargs: extra options passed in as key="value" pairs on the command line
        For example, add the command line option lang="fr" and the value
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
        The keys NO_CACHE_KEYNAME, EXTRACT_VIDEO_INFO,
        EXTRACT_VIDEO_PLAYLIST_INFO and DOWNLOAD_TO_CSV are interpreted here.
    Returns: ChannelNode
    Raises: SystemExit(0) after `create_csv()` when the DOWNLOAD_TO_CSV
        option has the value "true".
    """
    # editing metadata
    for key, value in kwargs.items():
        if key == NO_CACHE_KEYNAME:
            self.use_cache = False
            LOGGER.info("use_cache = '%d'", self.use_cache)
        if key == EXTRACT_VIDEO_INFO:
            self.insert_video_info = True
            self.video_list = value.split(",")
        if key == EXTRACT_VIDEO_PLAYLIST_INFO:
            self.insert_video_info = True
            self.to_playlist = value
        if key == DOWNLOAD_TO_CSV and value == "true":
            print('csv = true')
            create_csv()
            # The builtin exit() is only provided by the `site` module;
            # raising SystemExit works in every interpreter configuration.
            raise SystemExit(0)

    channel = self.get_channel(
        *args,
        **kwargs)  # Create ChannelNode from data in self.channel_info

    # Create thumbnails folder in chefdata if not exists
    thumbnails_dir = os.path.join('chefdata', 'thumbnails')
    os.makedirs(thumbnails_dir, exist_ok=True)

    for playlist_id in PLAYLIST_MAP:
        playlist = YouTubePlaylistUtils(id=playlist_id,
                                        cache_dir=YOUTUBE_CACHE_DIR)
        playlist_info = playlist.get_playlist_info(use_proxy=False)

        # Use the playlist description if there is any, else its title.
        playlist_description = (playlist_info["description"]
                                or playlist_info["title"])

        topic_source_id = 'aimhi-child-topic-{0}'.format(
            playlist_info["title"])
        topic_node = nodes.TopicNode(title=playlist_info["title"],
                                     source_id=topic_source_id,
                                     author="AimHi",
                                     provider="AimHi",
                                     description=playlist_description,
                                     language="en")

        # Ids of the videos already added to this topic (duplicate check).
        seen_video_ids = set()

        # insert videos into playlist topic after creation
        for child in playlist_info["children"]:
            if child["id"] in seen_video_ids:
                continue

            video = YouTubeVideoUtils(id=child["id"], cache_dir=False)
            video_details = video.get_video_info(use_proxy=False)
            video_source_id = "AimHi-{0}-{1}".format(
                playlist_info["title"], video_details["id"])

            # Check youtube thumbnail extension as some are not supported formats
            print(video_details["thumbnail"])
            image_response = requests.get("{0}".format(
                video_details["thumbnail"]))
            img = Image.open(BytesIO(image_response.content))
            if img.format not in ['JPG', 'PNG', 'JPEG']:
                # if not in correct format, convert image and download to files folder
                print(video_details["thumbnail"])
                print("{0}'s thumbnail not supported ({1}).".format(
                    video_details["id"], img.format))
                img_file_name = '{}_thumbnail.jpg'.format(
                    video_details["id"])
                thumbnail_link = os.path.join(thumbnails_dir, img_file_name)

                jpg_img = img.convert("RGB")
                # Resize image to thumbnail dimensions. Image.LANCZOS is the
                # same filter as the Image.ANTIALIAS alias, which was
                # removed in Pillow 10.
                jpg_img = jpg_img.resize((400, 225), Image.LANCZOS)
                jpg_img.save(thumbnail_link)
            else:
                thumbnail_link = video_details["thumbnail"]

            print(thumbnail_link)
            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video_details["title"],
                description=video_details["description"],
                author="AimHi",
                language="en",
                provider="AimHi",
                thumbnail=thumbnail_link,
                license=licenses.get_license("CC BY-NC-ND",
                                             copyright_holder="AimHi"),
                files=[
                    files.YouTubeVideoFile(
                        youtube_id=video_details["id"], language="en")
                ])

            # add video to topic
            print(video_details["id"] + " has been added!")
            seen_video_ids.add(video_details["id"])
            topic_node.add_child(video_node)

        # add topic to channel
        channel.add_child(topic_node)

    return channel