def fetch_video(video):
    """Create a video node, with subtitle files, from a video metadata mapping.

    Args:
        video: mapping providing the keys ``id``, ``title``, ``description``,
            ``webpage_url`` and ``subtitles``; the keys of ``subtitles`` are
            treated as subtitle language codes.

    Returns:
        The ``nodes.VideoNode`` holding one YouTube video file plus one
        subtitle file per language code that ``languages`` can resolve.
    """
    video_id = video['id']
    raw_title = video['title']
    raw_description = video['description']
    page_url = video['webpage_url']
    subtitle_codes = video['subtitles'].keys()

    print(" Fetching video data: %s (%s)" % (raw_title, page_url))

    node = nodes.VideoNode(
        source_id=video_id,
        title=truncate_metadata(raw_title),
        license=LICENSE,
        description=truncate_description(raw_description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=video_id)],
    )

    # Attach subtitles for every language code that can be resolved.
    for code in subtitle_codes:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        recognised = (languages.getlang(code)
                      or languages.getlang_by_alpha2(code))
        if not recognised:
            print("WARNING: Subtitle language %s not found in languages file"
                  % code)
            continue
        node.add_file(
            files.YouTubeSubtitleFile(youtube_id=video_id, language=code))

    return node
def add_content_to_tree(self, channel):
    """Populate ``channel`` with class -> subject -> video nodes.

    Reads the ``'English'`` branch of ``self.channel_tree``: each class maps
    subject names to objects whose ``'items'`` list describes the videos.

    Args:
        channel: node that receives one topic per class.
    """
    tree = self.channel_tree
    language_name = 'English'
    language = getlang("en")

    for class_name, subjects in tree[language_name].items():
        class_prefix = "{}-{}".format(language_name, class_name)
        class_topic = nodes.TopicNode(source_id=class_name, title=class_name)

        for subject_name, subject in subjects.items():
            subject_prefix = "{}-{}".format(class_prefix, subject_name)
            subject_topic = nodes.TopicNode(source_id=subject_prefix,
                                            title=subject_name)

            for row in subject['items']:
                row_source_id = "{}-{}".format(subject_prefix,
                                               get_column(row, 'id'))
                row_title = get_column(row, 'name')
                row_description = get_column(row, 'description')
                row_files = [files.VideoFile(path=get_column(row, 'file'))]
                # FIXME: Use the column's license field instead of hardcoding.
                row_license = licenses.get_license(
                    le_licenses.CC_BY,
                    copyright_holder=get_column(row, "copyright"))
                subject_topic.add_child(nodes.VideoNode(
                    source_id=row_source_id,
                    title=row_title,
                    description=row_description,
                    files=row_files,
                    language=language,
                    license=row_license,
                ))

            class_topic.add_child(subject_topic)

        channel.add_child(class_topic)
def fetch_video(video):
    """Create a video node from a video metadata mapping, with patched subtitles.

    Args:
        video: mapping providing the keys ``id``, ``title``, ``description``,
            ``webpage_url`` and ``subtitles``; the keys of ``subtitles`` are
            treated as subtitle language codes.

    Returns:
        The ``nodes.VideoNode`` holding one YouTube video file plus a
        ``LanguagePatchedYouTubeSubtitleFile`` for every language code
        accepted by ``getlang_patched``.
    """
    video_id = video['id']
    raw_title = video['title']
    raw_description = video['description']
    page_url = video['webpage_url']
    subtitle_codes = video['subtitles'].keys()

    print(" Fetching video data: %s (%s)" % (raw_title, page_url))

    node = nodes.VideoNode(
        source_id=video_id,
        title=truncate_metadata(raw_title),
        license=LICENSE,
        description=truncate_description(raw_description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=video_id)],
    )

    # Only languages that getlang_patched can resolve get a subtitle file.
    for code in filter(getlang_patched, subtitle_codes):
        subtitle = LanguagePatchedYouTubeSubtitleFile(
            youtube_id=video_id, youtube_language=code)
        node.add_file(subtitle)

    return node
def scrape_directory(topic, directory, indent=1):
    """Mirror a directory tree as topic, video and PDF-derived nodes.

    Sub-folders become ``TopicNode`` children (processed recursively),
    ``.mp4`` files become ``VideoNode`` children and ``.pdf`` files are
    handed to ``generate_pdf_nodes``. Other files are ignored.

    Args:
        topic: node that receives the children found in ``directory``.
        directory: path of the directory to scan.
        indent: multiplier for the prefix printed before each folder name.
    """
    for current_dir, subfolders, filenames in os.walk(directory):
        # Folders first: each one becomes a sub-topic and is scanned itself.
        for subfolder in subfolders:
            print('{}{}'.format(' ' * indent, subfolder))
            child_topic = nodes.TopicNode(source_id=subfolder, title=subfolder)
            topic.add_child(child_topic)
            scrape_directory(child_topic,
                             os.sep.join([current_dir, subfolder]),
                             indent=indent + 1)

        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            file_path = os.sep.join([current_dir, filename])

            if extension == '.mp4':
                video_node = nodes.VideoNode(
                    source_id=current_dir + filename,
                    title=stem,
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER)
                video_node.add_file(files.VideoFile(file_path))
                topic.add_child(video_node)
                continue

            if extension == '.pdf':
                with PDFParser(file_path) as parser:
                    chapters = parser.get_data_file()
                    generate_pdf_nodes(chapters, topic,
                                       source=os.path.basename(filename))

        # Only the first os.walk() entry (the directory itself) is used;
        # deeper levels are covered by the recursive calls above.
        break
def scrape_collection_files(topic, url):
    """Add the videos and images listed at ``url`` to ``topic``.

    The JSON document at ``url`` must expose a ``data`` list of assets.
    ``mp4`` assets become video nodes, ``png`` assets are collected into a
    single slideshow node, and anything else is logged and skipped.

    Args:
        topic: node that receives the generated children.
        url: address of the JSON asset listing.
    """
    listing = json.loads(downloader.read(url))
    slides = []

    for asset in listing['data']:
        extension = asset['attributes']['extension']

        if extension == 'png':
            slide_url = asset['attributes']['thumbnail_url'].replace(
                'element.png', '*****@*****.**')
            slides.append({
                'url': slide_url,
                'caption': asset['attributes']['name'],
            })
        elif extension == 'mp4':
            storage_url = FILE_STORAGE_URL.format(id=asset['id'])
            payload = json.loads(downloader.read(storage_url))
            video = payload['data'][0]['attributes']
            video_node = nodes.VideoNode(
                source_id=video['url'],
                title=asset['attributes']['name'],
                license=LICENSE,
                files=[
                    files.VideoFile(video['url']),
                    files.ThumbnailFile(video['thumbnail_url']),
                ])
            topic.add_child(video_node)
        else:
            LOGGER.warning('Unable to add {} from {}'.format(
                asset['attributes']['extension'], url))

    # All images found on the page end up in one slideshow node.
    if slides:
        topic.add_child(create_slideshow(slides, url, topic.title, 'English'))
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode

    Healing Classrooms is organized with the following hierarchy:
        Playlist (TopicNode)
        |   Youtube Video (VideoNode)
        |   Youtube Video (VideoNode)
    """
    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    # Download the playlist/video information (metadata only, no media).
    with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
        info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

    # Generate topics based off playlist entries in dict
    for playlist in info_dict['entries']:
        # Get language of playlist (hack): inferred from the playlist title,
        # falling back to French.
        language = "fr"
        if "English" in playlist['title']:
            language = "en"
        elif "Arabic" in playlist['title']:
            language = "ar"

        playlist_topic = nodes.TopicNode(title=playlist['title'],
                                         source_id=playlist['id'],
                                         language=language)
        channel.add_child(playlist_topic)

        # Generate videos based off video entries in dict
        for video in playlist['entries']:
            # Normalise the thumbnail to either a URL or None. The previous
            # `len(x) and x[0]['url']` expression produced the integer 0 for
            # an empty list and raised KeyError when the key was absent.
            thumbnails = video.get('thumbnails') or []
            thumbnail_url = None
            if thumbnails:
                thumbnail_url = thumbnails[0].get('url') or None

            playlist_topic.add_child(nodes.VideoNode(
                title=video['title'],
                source_id=video['id'],
                license=licenses.PublicDomainLicense(),
                description=video['description'],
                # Only derive a thumbnail when none is supplied.
                derive_thumbnail=not thumbnail_url,
                files=[files.WebVideoFile(video['webpage_url'])],
                thumbnail=thumbnail_url,
                author=AUTHOR,
                # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
            ))

    raise_for_invalid_channel(channel)  # Check for errors in channel construction

    return channel
def to_contentnode(self, title, directory=None, *args, **kwargs):
    """Build the content node matching ``self.kind``.

    Args:
        title: title given to the node.
        directory: forwarded to ``self.to_file`` to obtain the file path.
        *args: accepted but not used.
        **kwargs: forwarded to the node constructor.

    Returns:
        An ``HTML5AppNode`` or ``VideoNode``; ``None`` for any other kind.
    """
    local_path = self.to_file(directory=directory)

    if self.kind == content_kinds.HTML5:
        return nodes.HTML5AppNode(
            source_id=self.url,
            title=title,
            files=[files.HTMLZipFile(local_path)],
            **kwargs)

    if self.kind == content_kinds.VIDEO:
        return nodes.VideoNode(
            source_id=self.url,
            title=title,
            files=[files.VideoFile(local_path)],
            **kwargs)

    return None
def scrape_iversity(channel):
    """Scrape the course's lesson-unit page into topics of video nodes.

    Every ``chapter-units-wrapper`` block becomes a ``TopicNode``; each
    lesson link flagged with a ``unit_video`` icon becomes a ``VideoNode``
    carrying the 480-resolution source and its subtitle track.

    Args:
        channel: node that receives one topic per chapter.
    """
    course_url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info(" Scraping Migration Matters at {}".format(course_url))
    page = read_source(course_url)

    for chapter in page.find_all('div', {'class': 'chapter-units-wrapper'}):
        chapter_title = str(
            chapter.find('div', {'class': 'chapter-title'}).string)
        chapter_topic = nodes.TopicNode(
            source_id=chapter_title.strip().replace(" ", "_"),
            title=chapter_title)

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            has_video = lesson.find('i', {'class': 'unit_video'})
            lesson_title = str(
                lesson.find('span', {'class': 'unit-title'}).string).strip()

            if not has_video:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(lesson_title))
                continue

            lesson_source_id = lesson_title.replace(" ", "_")
            lesson_url = "{}{}".format(BASE_URL, lesson.attrs["href"])
            player = read_source(lesson_url).find('video')

            subtitle_src = player.find(
                'track', {'kind': 'subtitles'}).attrs["src"]
            subtitle_file = files.SubtitleFile(
                path=subtitle_src, language=languages.getlang('en').code)

            video_src = player.find('source', {'res': '480'}).attrs["src"]
            video_file = files.VideoFile(
                path=video_src, language=languages.getlang('en').code)

            lesson_node = nodes.VideoNode(
                source_id=lesson_source_id,
                title=lesson_title,
                files=[video_file, subtitle_file],
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            LOGGER.info(" Uploading video - {}".format(lesson_title.strip()))
            chapter_topic.add_child(lesson_node)

        channel.add_child(chapter_topic)
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.

    Only topic and video entries are turned into nodes; entries whose kind
    cannot be guessed, or is anything else, are skipped. Returns `node`.
    """
    for spec in sourcetree:
        try:
            primary = spec['files'][0] if 'files' in spec else {}
            kind = guess_content_kind(
                path=primary.get('path'),
                web_video_data=(primary.get('youtube_id')
                                or primary.get('web_url')),
                questions=spec.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            topic = nodes.TopicNode(
                source_id=spec["id"],
                title=spec["title"],
                author=spec.get("author"),
                description=spec.get("description"),
                thumbnail=spec.get("thumbnail"),
            )
            node.add_child(topic)
            # Descend into the topic's own children.
            _build_tree(topic, spec.get("children", []))
            continue

        if kind != content_kinds.VIDEO:
            # unknown content file format
            continue

        source_id = spec["id"]
        title = spec["title"]
        video_license = get_license(
            spec.get("license"),
            description="Description of license",
            copyright_holder=spec.get('copyright_holder'))
        video = nodes.VideoNode(
            source_id=source_id,
            title=title,
            license=video_license,
            author=spec.get("author"),
            description=spec.get("description"),
            derive_thumbnail=True,  # video-specific data
            thumbnail=spec.get('thumbnail'),
        )
        add_files(video, spec.get("files") or [])
        node.add_child(video)

    return node
def scrape_video_collection(url, topic):
    """ Scrape videos under video collection and add to the topic node
        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to

        Follows the collection's "next page" links recursively. An HTTP
        error while reading is logged and ends the scrape of this page.
    """
    try:
        listing = BeautifulSoup(read(url), 'html5lib')

        for result in listing.find_all('div', {'class': 'search-result'}):
            header = result.find('div',
                                 {'class': 'views-field-field-html-title'})
            LOGGER.info(" {}".format(header.text.strip()))

            # Load the video's own page to discover its media entries.
            description = result.find('div', {'class': 'search-description'})
            video_page = BeautifulSoup(read(header.find('a')['href']),
                                       'html.parser')

            for media_id, media in get_brightcove_mapping(video_page).items():
                candidate = nodes.VideoNode(
                    source_id=media_id,
                    title=header.text.strip().replace("’", "'"),
                    description=(description.text.strip()
                                 if description else ""),
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=media.get('author') or "",
                    files=[
                        files.WebVideoFile(media['url'],
                                           high_resolution=False)
                    ],
                    thumbnail=get_thumbnail_url(result.find('img')['src']),
                )

                # Skip videos the topic already contains.
                existing = None
                for child in topic.children:
                    if child.source_id == candidate.source_id:
                        existing = child
                        break
                if not existing:
                    topic.add_child(candidate)

        # Continue with the next page of results, if there is one.
        following_url = get_next_page_url(listing)
        if following_url:
            scrape_video_collection(following_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))
def download_video_topics(topic_node,
                          playlist_item,
                          lang_obj,
                          use_cache=True,
                          to_sheet=False):
    """
    Scrape, collect, and download the videos from playlist.

    Args:
        topic_node: node that receives one ``VideoNode`` per accepted video.
        playlist_item: playlist descriptor handed to
            ``RefugeeResponsePlaylist``.
        lang_obj: language object; its ``name`` is used in the source id and
            its ``code`` as the node and file language.
        use_cache: forwarded to ``RefugeeResponsePlaylist``.
        to_sheet: accepted for interface compatibility; not used here.

    Videos without an entry in ``VIDEO_DESCRIPTION_MAP`` are excluded.
    Failures while building or attaching a node are logged and do not stop
    the remaining videos from being processed.
    """
    playlist_obj = RefugeeResponsePlaylist(playlist_item, use_cache)
    playlist_info = playlist_obj.get_playlist_info()

    for video in playlist_info.get('children'):
        video_id = video['id']
        video_source_id = 'refugee-response-{0}-{1}'.format(
            lang_obj.name, video_id)

        # Exclude videos that have no curated description.
        if video_id not in VIDEO_DESCRIPTION_MAP:
            continue
        video_description = VIDEO_DESCRIPTION_MAP[video_id]
        LOGGER.info("Video Description: '%s'", video_description)

        try:
            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video['title'],
                description=video_description,
                author=REFUGEE_RESPONSE,
                language=lang_obj.code,
                provider=REFUGEE_RESPONSE,
                thumbnail=video['thumbnail'],
                license=licenses.get_license(
                    "CC BY-NC-ND", copyright_holder=REFUGEE_RESPONSE),
                files=[
                    files.YouTubeVideoFile(youtube_id=video_id,
                                           language=lang_obj.code)
                ])
            topic_node.add_child(video_node)
        except Exception as e:
            # Deliberate best effort: one broken video must not abort the
            # rest of the playlist.
            LOGGER.error('Error downloading this video: %s', e)
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Create `kind` subclass of ContentNode based on required args and optionals.

    Returns a VideoNode, AudioNode or DocumentNode wrapping `filepath`, or
    None when `kind` is none of those three. `optionals` may provide
    "author", "description" and (audio/document only) "thumbnail".
    """

    def metadata(include_thumbnail):
        # Keyword arguments shared by every supported node type.
        fields = dict(
            source_id=source_id,
            title=title,
            license=license,
            author=optionals.get("author", None),
            description=optionals.get("description", None),
        )
        if include_thumbnail:
            fields["thumbnail"] = optionals.get("thumbnail", None)
        return fields

    if kind == content_kinds.VIDEO:
        fields = metadata(include_thumbnail=False)
        return nodes.VideoNode(
            derive_thumbnail=True,  # video-specific data
            files=[files.VideoFile(path=filepath)],
            **fields)

    if kind == content_kinds.AUDIO:
        fields = metadata(include_thumbnail=True)
        return nodes.AudioNode(
            files=[files.AudioFile(path=filepath)],
            **fields)

    if kind == content_kinds.DOCUMENT:
        fields = metadata(include_thumbnail=True)
        return nodes.DocumentNode(
            files=[files.DocumentFile(path=filepath)],
            **fields)

    return None
def scrape_content(endpoint, channel, existingNode=None):
    """Recursively turn a directory listing page into topic and video nodes.

    Each table cell with class ``text-xs-left`` is inspected through its
    ``data-sort-value`` attribute: values ending in ``.mp4`` become video
    nodes, values starting with ``dir`` become topic nodes whose listing is
    scraped in turn, and anything else is logged as unsupported.

    Args:
        endpoint: URL of the listing page to read.
        channel: channel node; receives top-level children.
        existingNode: topic node for the listing being scraped, or ``None``
            for the top-level listing.
    """
    replacements = {" ": "%20", "#": "%23"}
    # Children go under the current topic when recursing and directly under
    # the channel otherwise. Previously only directories honoured this, so
    # an .mp4 in the top-level listing failed on `None.add_child`.
    parent = existingNode if existingNode else channel

    content = read_source(endpoint)
    attributes = content.find("tbody").find_all("td", "text-xs-left")

    for attribute in attributes:
        source_id = attribute.attrs["data-sort-value"]

        # Check if it is mp4 file
        if source_id.endswith(".mp4"):
            video_info = attribute.find("a")
            video_title, _ext = splitext(str(video_info.string))
            # Drop the first character of the href and escape spaces before
            # appending it to BASE_URL.
            filter_video_link = video_info.attrs["href"][1:].replace(
                " ", "%20")
            video_link = BASE_URL + filter_video_link
            video_file = files.VideoFile(path=video_link)
            video_node = nodes.VideoNode(source_id=source_id,
                                         title=video_title,
                                         files=[video_file],
                                         license=CHANNEL_LICENSE)
            parent.add_child(video_node)

        # Check if it is a directory
        elif source_id.startswith("dir"):
            title = str(attribute.find("strong").string)
            topic_node = nodes.TopicNode(source_id=source_id, title=title)
            parent.add_child(topic_node)

            # The sub-directory listing lives at <endpoint><escaped title>/.
            new_end_point = replace_all(title, replacements)
            new_end = endpoint + "{}/".format(new_end_point)
            scrape_content(new_end, channel, topic_node)

        else:
            LOGGER.info(
                "Format of the file is not supported by the sushi chef : {}".
                format(source_id))
def video_node_from_dropbox(self, video_details, link, token):
    """Fetch a Dropbox shared-link video and wrap it in a ``VideoNode``.

    The file is written under ``VIDEO_FOLDER`` using the name reported by
    Dropbox, unless a file already exists at that path.

    Args:
        video_details: mapping; its ``"title"`` becomes the node title.
        link: Dropbox shared link, also used as the node's source id.
        token: access token used to create the Dropbox client.

    Returns:
        The ``nodes.VideoNode`` pointing at the local video file.
    """
    client = dropbox.Dropbox(token)
    metadata, response = client.sharing_get_shared_link_file(url=link)

    # Path of the video file, relative to the current working directory.
    local_path = os.path.relpath(os.path.join(VIDEO_FOLDER, metadata.name))

    if os.path.isfile(local_path):
        LOGGER.info("{} already downloaded. Skipping".format(metadata.name))
    else:
        with open(local_path, 'wb') as out:
            out.write(response.content)

    local_video = files.VideoFile(path=local_path)
    return nodes.VideoNode(
        title=video_details["title"],
        source_id=link,
        license=licenses.CC_BYLicense("TicTacLearn"),
        files=[local_video])
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.

    Handles topic, video, audio, document, exercise and HTML5 entries.
    Entries whose kind cannot be guessed, or is anything else, are skipped.
    Returns `node`.
    """

    def shared_fields(spec):
        # Constructor arguments every node type takes from the source entry.
        return {
            "source_id": spec["id"],
            "title": spec["title"],
            "author": spec.get("author"),
            "description": spec.get("description"),
            "thumbnail": spec.get("thumbnail"),
        }

    for spec in sourcetree:
        try:
            primary = spec['files'][0] if 'files' in spec else {}
            kind = guess_content_kind(
                path=primary.get('path'),
                web_video_data=(primary.get('youtube_id')
                                or primary.get('web_url')),
                questions=spec.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            topic = nodes.TopicNode(**shared_fields(spec))
            node.add_child(topic)
            # Descend into the topic's own children.
            _build_tree(topic, spec.get("children", []))
            continue

        if kind == content_kinds.VIDEO:
            fields = shared_fields(spec)
            child = nodes.VideoNode(
                license=get_license(spec.get("license"),
                                    description="Description of license"),
                derive_thumbnail=True,  # video-specific data
                **fields)
        elif kind == content_kinds.AUDIO:
            child = nodes.AudioNode(license=spec.get("license"),
                                    **shared_fields(spec))
        elif kind == content_kinds.DOCUMENT:
            child = nodes.DocumentNode(license=spec.get("license"),
                                       **shared_fields(spec))
        elif kind == content_kinds.EXERCISE:
            child = nodes.ExerciseNode(
                license=spec.get("license"),
                exercise_data={},  # Just set to default
                **shared_fields(spec))
        elif kind == content_kinds.HTML5:
            child = nodes.HTML5AppNode(license=spec.get("license"),
                                       **shared_fields(spec))
        else:
            # unknown content file format
            continue

        # Common tail: files first, then questions (exercises only), and
        # only then is the finished node attached to its parent.
        add_files(child, spec.get("files") or [])
        if kind == content_kinds.EXERCISE:
            for raw_question in spec.get("questions"):
                child.add_question(create_question(raw_question))
        node.add_child(child)

    return node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Raises NotImplementedError when an entry's 'kind' is not one of the
    expected node types. Returns `parent_node`.
    """
    EXPECTED_NODE_TYPES = [
        TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, DOCUMENT_NODE,
        HTML5_NODE
    ]

    def licensed_fields(src):
        # Constructor arguments shared by every non-topic node type.
        return dict(
            source_id=src["source_id"],
            title=src["title"],
            license=get_license(**src['license']),
            author=src.get("author"),
            description=src.get("description"),
            language=src.get('language', None),
            thumbnail=src.get("thumbnail"),
        )

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            LOGGER.critical('Unexpected node type found: ' + kind)
            raise NotImplementedError(
                'Unexpected node type found in json data.')

        if kind == TOPIC_NODE:
            topic = nodes.TopicNode(
                source_id=source_node.get("source_id", None),
                title=source_node["title"],
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            parent_node.add_child(topic)
            # Descend into the topic's own children.
            build_tree_from_json(topic, source_node.get("children", []))
            continue

        if kind == EXERCISE_NODE:
            # Exercises carry questions instead of files.
            exercise = nodes.ExerciseNode(questions=[],
                                          **licensed_fields(source_node))
            add_questions(exercise, source_node.get("questions") or [])
            parent_node.add_child(exercise)
            continue

        if kind == VIDEO_NODE:
            child_node = nodes.VideoNode(
                derive_thumbnail=source_node.get(
                    'derive_thumbnail', True),  # video-specific option
                **licensed_fields(source_node))
        elif kind == AUDIO_NODE:
            child_node = nodes.AudioNode(**licensed_fields(source_node))
        elif kind == DOCUMENT_NODE:
            child_node = nodes.DocumentNode(**licensed_fields(source_node))
        elif kind == HTML5_NODE:
            child_node = nodes.HTML5AppNode(**licensed_fields(source_node))
        else:
            LOGGER.critical("Encountered an unknown kind: " + str(source_node))
            continue

        add_files(child_node, source_node.get("files") or [])
        parent_node.add_child(child_node)

    return parent_node
def add_video_nodes_from_playlist(
        self,
        youtube_client,
        playlist_id,
        subtitle_languages=SUBTITLE_LANGUAGES,
        copyright_holder=COPYRIGHT_HOLDER,
        only_creative_commons=ONLY_CREATIVE_COMMONS):
    """Add one ``VideoNode`` child to ``self`` per video in a playlist.

    Walks every page of the playlist through ``youtube_client``, looks up
    each page's videos, and adds each video once even if it is listed more
    than once.

    Args:
        youtube_client: API client exposing ``playlistItems()`` and
            ``videos()`` resources.
        playlist_id: id of the playlist to import.
        subtitle_languages: language codes for which subtitle files are
            attached when supported.
        copyright_holder: copyright holder recorded on each video's license.
        only_creative_commons: when true, videos whose license is not
            ``creativeCommon`` are reported and skipped.
    """
    playlist_request_kwargs = {
        'part': 'contentDetails',
        'maxResults': 50,
        'playlistId': playlist_id,
    }

    # Apparently the same video is in one of the playlists twice!
    # This is used to keep track of videos that have already been added.
    added_video_ids = set()

    while True:
        playlist_info = youtube_client.playlistItems().list(
            **playlist_request_kwargs).execute()
        video_ids = [
            item['contentDetails']['videoId']
            for item in playlist_info['items']
        ]
        videos = youtube_client.videos().list(
            part='status,snippet',
            id=','.join(video_ids)).execute()['items']

        for video in videos:
            video_id = video['id']
            if video_id in added_video_ids:
                continue

            youtube_license = video['status']['license']
            is_creative_commons = youtube_license == 'creativeCommon'

            if only_creative_commons and not is_creative_commons:
                print(
                    "The video '{}' is not licensed as Creative Commons... it is licensed as {}"
                    .format(video['snippet']['title'], youtube_license))
                continue

            video_license = (licenses.CC_BY if is_creative_commons
                             else NON_CREATIVE_COMMONS_LICENSE_DEFAULT)
            video_node = nodes.VideoNode(
                source_id=video_id,
                title=video['snippet']['title'],
                language=CHANNEL_LANGUAGE,
                license=get_license(video_license,
                                    copyright_holder=copyright_holder),
                thumbnail=get_largest_thumbnail(
                    video['snippet']['thumbnails']).get('url'),
                files=[
                    files.YouTubeVideoFile(video_id),
                ])

            # Get subtitles for the requested languages.
            for lang_code in subtitle_languages:
                if files.is_youtube_subtitle_file_supported_language(
                        lang_code):
                    video_node.add_file(
                        files.YouTubeSubtitleFile(youtube_id=video_id,
                                                  language=lang_code))
                else:
                    print('Unsupported subtitle language code:', lang_code)

            self.add_child(video_node)
            added_video_ids.add(video_id)

        # Set up the next page, if there is one; otherwise we are done.
        next_page_token = playlist_info.get('nextPageToken')
        if not next_page_token:
            break
        playlist_request_kwargs['pageToken'] = next_page_token
def scrape_content(title, content_url):
    """Scrape one content page into a video node or an image HTML5 app node.

    Example arguments:
        title: Boys' clothing
        content_url: http://www.touchableearth.org/china-culture-boys-clothing/

    Args:
        title: title given to the resulting node.
        content_url: URL of the content page.

    Returns:
        A ``VideoNode`` when the page embeds a video iframe, an
        ``HTML5AppNode`` wrapping the page's image when it has one, or
        ``None`` when the page cannot be loaded, the video cannot be
        resolved, or neither kind of media is present.
    """
    print(" Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print(" *** WARNING: youtube_id not found for content url",
                  content_url)
            print(" Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print(" ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print(" NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        # Prefer the data-guid URL and fall back to src. `.get` is required
        # here: indexing a missing attribute raises KeyError, which made the
        # `or img["src"]` fallback unreachable.
        img_src = img.get("data-guid") or img["src"]
        destination = tempfile.mkdtemp()
        download_file(img_src, destination, request_fn=make_request,
                      filename="image.jpg")

        # Minimal page that displays the downloaded image.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Must contain '--youtube-api-token', the YouTube Data API key.
    Returns: ChannelNode

    Each playlist of the YouTube channel becomes a TopicNode holding one
    VideoNode per Creative Commons licensed video.
    """
    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    from apiclient.discovery import build

    # instantiate a YouTube Data API v3 client
    youtube = build('youtube', 'v3',
                    developerKey=kwargs['--youtube-api-token'])

    # list the YouTube channel's playlists (first page of up to 50 only)
    playlists = youtube.playlists().list(
        part='snippet',
        channelId=YOUTUBE_CHANNEL_ID,
        maxResults=50
    ).execute()['items']

    # For getting the thumbnail automatically
    # youtube_channel = youtube.channels().list(
    #     id=YOUTUBE_CHANNEL_ID,
    #     part='snippet'
    # ).execute()['items'][0]
    # channel.thumbnail = get_largest_thumbnail(youtube_channel['snippet']['thumbnails']).get('url')

    for playlist in playlists:
        topic = nodes.TopicNode(title=playlist['snippet']['title'],
                                source_id=playlist['id'])
        playlist_request_kwargs = {
            'part': 'contentDetails',
            'maxResults': 50,
            'playlistId': playlist['id'],
        }

        while True:
            playlist_info = youtube.playlistItems().list(
                **playlist_request_kwargs).execute()
            video_ids = [
                item['contentDetails']['videoId']
                for item in playlist_info['items']
            ]
            videos = youtube.videos().list(
                part='status,snippet',
                id=','.join(video_ids)
            ).execute()['items']

            for video in videos:
                # Only Creative Commons licensed videos are imported.
                if video['status']['license'] != 'creativeCommon':
                    continue

                video_node = nodes.VideoNode(
                    source_id=video['id'],
                    title=video['snippet']['title'],
                    language=CHANNEL_LANGUAGE,
                    license=get_license(licenses.CC_BY,
                                        copyright_holder='Espresso English'),
                    thumbnail=get_largest_thumbnail(
                        video['snippet']['thumbnails']).get('url'),
                    files=[
                        files.YouTubeVideoFile(video['id']),
                    ]
                )
                topic.add_child(video_node)

                # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                for lang_code in SUBTITLE_LANGUAGES:
                    if files.is_youtube_subtitle_file_supported_language(
                            lang_code):
                        video_node.add_file(
                            files.YouTubeSubtitleFile(
                                youtube_id=video['id'],
                                language=lang_code
                            )
                        )
                    else:
                        print('Unsupported subtitle language code:',
                              lang_code)

            # Set up the next page, if there is one; otherwise we are done.
            next_page_token = playlist_info.get('nextPageToken')
            if not next_page_token:
                break
            playlist_request_kwargs['pageToken'] = next_page_token

        channel.add_child(topic)

    raise_for_invalid_channel(channel)  # Check for errors in channel construction

    return channel
def convert_ka_node_to_ricecooker_node(ka_node):
    """Convert a Khan Academy node, recursively, into a ricecooker node.

    Args:
        ka_node: a ``KhanTopic``, ``KhanExercise``, ``KhanVideo`` or
            ``KhanArticle`` instance.

    Returns:
        A ``TopicNode``, ``ExerciseNode`` or ``VideoNode``; ``None`` when
        the node's slug is in ``SLUG_BLACKLIST``, for articles (not
        implemented yet) and for any other node type.

    Raises:
        Exception: when a video's license has no entry in
            ``LICENSE_MAPPING``.
    """
    if ka_node.slug in SLUG_BLACKLIST:
        return None

    if isinstance(ka_node, KhanTopic):
        topic = nodes.TopicNode(
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400],
        )
        for ka_subtopic in ka_node.children:
            subtopic = convert_ka_node_to_ricecooker_node(ka_subtopic)
            # Blacklisted / unsupported children come back as None.
            if subtopic:
                topic.add_child(subtopic)
        return topic

    elif isinstance(ka_node, KhanExercise):
        exercise = nodes.ExerciseNode(
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400],
            # exercise_data={'mastery_model': node.get('suggested_completion_criteria')},
            license=licenses.SpecialPermissionsLicense(
                copyright_holder="Khan Academy",
                description=
                "Permission granted to distribute through Kolibri for non-commercial use"
            ),  # need to formalize with KA
            # Fixed: previously read the undefined name `node`.
            thumbnail=ka_node.thumbnail,
        )
        for ka_assessment_item in ka_node.get_assessment_items():
            # Fixed: the fields were read from `assessment_item`, the variable
            # being assigned, instead of the loop item.
            assessment_item = PerseusQuestion(
                id=ka_assessment_item.id,
                raw_data=ka_assessment_item.data,
                source_url=ka_assessment_item.source_url,
            )
            exercise.add_question(assessment_item)
        return exercise

    elif isinstance(ka_node, KhanVideo):
        # TODO: Use traditional compression here to avoid breaking existing KA downloads?
        # Prefer the low-resolution mp4 and fall back to the regular one.
        video_files = [
            VideoFile(
                ka_node.download_urls.get("mp4-low",
                                          ka_node.download_urls.get("mp4")))
        ]

        # if the video is in English, include any subtitles available along with it
        if ka_node.lang == "en":
            for lang_code in ka_node.get_subtitle_languages():
                # Fixed: previously read the undefined name `node`.
                # NOTE(review): assumes ka_node.id is the YouTube id expected
                # by YouTubeSubtitleFile; confirm against KhanVideo.
                video_files.append(
                    YouTubeSubtitleFile(ka_node.id, language=lang_code))

        # convert KA's license format into our own license classes
        if ka_node.license in LICENSE_MAPPING:
            video_license = LICENSE_MAPPING[ka_node.license]
        else:
            # license = licenses.CC_BY_NC_SA # or?
            raise Exception("Unknown license on video {}: {}".format(
                ka_node.id, ka_node.license))

        video = nodes.VideoNode(
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400],
            license=video_license,
            # Fixed: previously read the undefined name `node`.
            thumbnail=ka_node.thumbnail,
            files=video_files,
        )
        return video

    elif isinstance(ka_node, KhanArticle):
        # TODO
        return None

    return None
def _build_tree(node, sourcetree):
    """Add the entries of `sourcetree` as children of `node`.

    Works in two passes so that all topics are attached before any video:
    first every entry whose 'type' is not 'file' becomes a TopicNode (and is
    processed recursively), then every entry whose guessed kind is VIDEO
    becomes a VideoNode. Diagnostic prints are emitted along the way.
    Returns `node`.
    """
    # Pass 1: folders become topics.
    for entry in sourcetree:
        print(type(entry))
        if entry.get('type') == 'file':
            file_title = str(entry.get('name'))
            print("title:")
            print(file_title)
            continue

        folder_topic = nodes.TopicNode(
            source_id=str(entry.get('name')),
            title=str(entry.get('name')).replace("_", " "),
        )
        node.add_child(folder_topic)
        _build_tree(folder_topic, entry.get("children", []))

    # Pass 2: video files become video nodes.
    for entry in sourcetree:
        try:
            primary = entry['files'][0] if 'files' in entry else {}
            kind = guess_content_kind(
                path=primary.get('path'),
                web_video_data=(primary.get('youtube_id')
                                or primary.get('web_url')))
        except UnknownContentKindError:
            continue

        print("kind:")
        print(kind)

        if kind != content_kinds.VIDEO:
            # unknown content file format
            continue

        video_node = nodes.VideoNode(
            source_id=str(entry.get('name')).replace(' ', '_'),
            title=str(entry.get('name').replace(".mp4", "")),
            license='All Rights Reserved',
            copyright_holder="Sarva Shiksha Abhiyaan",
        )
        add_files(video_node, entry.get("files") or [])
        node.add_child(video_node)

    return node
def crawl_each_post(post_url):
    """
    Scrape one post page and register its embedded videos.

    The page's ``wpb_wrapper`` div is located, the course is taken as the
    second piece of the ``vc_custom_heading`` text once it is split on
    " OF " / " FROM ", and every ``wpb_video_wrapper`` iframe is downloaded
    with youtube_dl. Each download yields a ``VideoNode`` (video file plus
    English subtitle file) that is appended to ``EPISODE_DICT[course]``.

    When the page has no video wrappers, only an informational message is
    logged.

    Args:
        post_url: URL of the post page to crawl.
    """
    page = requests.get(post_url, headers=HEADERS)
    parsed = BeautifulSoup(page.content, "html.parser")
    wrapper = parsed.find('div', {'class': 'wpb_wrapper'})
    heading = wrapper.find('div', {'class': 'vc_custom_heading'})
    course_name = heading.getText().strip()

    # The course is whatever follows the first " OF " / " FROM ".
    splitter = '|'.join(re.escape(token) for token in (" OF ", " FROM "))
    course = re.split(splitter, course_name)[1]

    video_wrappers = wrapper.find_all('div', {'class': 'wpb_video_wrapper'})
    if not video_wrappers:
        LOGGER.info(
            "Format of the file is not supported by the sushi chef : {}".
            format(course_name))
        return

    for video_wrapper in video_wrappers:
        # Drop the "?feature..." query and keep the trailing path segment.
        embed_url = video_wrapper.find('iframe').attrs["src"]
        video_id = embed_url.split("?feature")[0].split("/")[-1]

        downloader_options = {
            'outtmpl': './downloads/%(id)s.%(ext)s',
            'writeautomaticsub': True,
            'logger': LOGGER
        }
        with youtube_dl.YoutubeDL(downloader_options) as downloader:
            result = downloader.extract_info(
                "http://www.youtube.com/watch?v={}".format(video_id),
                download=True)

        # Playlist-style results carry the video as their first entry.
        video = result['entries'][0] if 'entries' in result else result
        video_title = video["title"]

        english = languages.getlang('en').code
        video_file = files.VideoFile(
            path="{}/{}.mp4".format(DOWNLOAD_DIRECTORY, video_id),
            language=english)
        subtitle_file = files.SubtitleFile(
            path="{}/{}.en.vtt".format(DOWNLOAD_DIRECTORY, video_id),
            language=english)

        video_node = nodes.VideoNode(
            source_id=video_title.strip().replace(" ", "_"),
            title=video_title,
            files=[video_file, subtitle_file],
            license=CHANNEL_LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
        )
        EPISODE_DICT.setdefault(course, []).append(video_node)
        LOGGER.info(" Uploading video - {}".format(video_title.strip()))
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    Every playlist id in ``PLAYLIST_MAP`` becomes a ``TopicNode`` under the
    channel, and every distinct video of that playlist becomes a
    ``VideoNode`` under the topic. Thumbnails whose image format is not
    JPG/JPEG/PNG are converted to a 400x225 JPEG stored under
    ``chefdata/thumbnails``.

    Args:
      - args: arguments passed in on the command line
      - kwargs: extra options passed in as key="value" pairs on the command line
        For example, add the command line option lang="fr" and the value
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode
    Raises:
      SystemExit: after writing the CSV when the ``DOWNLOAD_TO_CSV`` option
        is given with the value "true".
    """
    # Apply the command-line options to this chef instance.
    for key, value in kwargs.items():
        if key == NO_CACHE_KEYNAME:
            self.use_cache = False
            LOGGER.info("use_cache = '%d'", self.use_cache)
        if key == EXTRACT_VIDEO_INFO:
            self.insert_video_info = True
            self.video_list = value.split(",")
        if key == EXTRACT_VIDEO_PLAYLIST_INFO:
            self.insert_video_info = True
            self.to_playlist = value
        if key == DOWNLOAD_TO_CSV and value == "true":
            print('csv = true')
            create_csv()
            # SystemExit directly: the `exit` builtin is only installed by
            # the `site` module and is not guaranteed to exist.
            raise SystemExit(0)

    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    # Converted thumbnails are written here, so the folder must exist.
    thumbnails_dir = os.path.join('chefdata', 'thumbnails')
    os.makedirs(thumbnails_dir, exist_ok=True)

    supported_formats = ('JPG', 'PNG', 'JPEG')

    for playlist_id in PLAYLIST_MAP:
        playlist = YouTubePlaylistUtils(id=playlist_id,
                                        cache_dir=YOUTUBE_CACHE_DIR)
        playlist_info = playlist.get_playlist_info(use_proxy=False)

        # Fall back to the title when the playlist has no description.
        playlist_description = (playlist_info["description"]
                                or playlist_info["title"])

        topic_node = nodes.TopicNode(
            title=playlist_info["title"],
            source_id='aimhi-child-topic-{0}'.format(playlist_info["title"]),
            author="AimHi",
            provider="AimHi",
            description=playlist_description,
            language="en")

        # Ids already added to this playlist's topic (duplicates are skipped).
        seen_video_ids = set()

        for child in playlist_info["children"]:
            if child["id"] in seen_video_ids:
                continue

            video = YouTubeVideoUtils(id=child["id"], cache_dir=False)
            video_details = video.get_video_info(use_proxy=False)
            video_source_id = "AimHi-{0}-{1}".format(
                playlist_info["title"], video_details["id"])

            # Check youtube thumbnail extension as some are not supported
            # formats.
            print(video_details["thumbnail"])
            image_response = requests.get("{0}".format(
                video_details["thumbnail"]))
            img = Image.open(BytesIO(image_response.content))
            if img.format not in supported_formats:
                # Not in a supported format: convert the image and save it
                # in the thumbnails folder.
                print(video_details["thumbnail"])
                print("{0}'s thumbnail not supported ({1}).".format(
                    video_details["id"], img.format))
                img_file_name = '{}_thumbnail.jpg'.format(
                    video_details["id"])
                thumbnail_link = os.path.join(thumbnails_dir, img_file_name)
                jpg_img = img.convert("RGB")
                # Resize to thumbnail dimensions. Image.ANTIALIAS was removed
                # in Pillow 10; LANCZOS is the filter it was an alias for.
                jpg_img = jpg_img.resize((400, 225), Image.LANCZOS)
                jpg_img.save(thumbnail_link)
            else:
                thumbnail_link = video_details["thumbnail"]
            print(thumbnail_link)

            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video_details["title"],
                description=video_details["description"],
                author="AimHi",
                language="en",
                provider="AimHi",
                thumbnail=thumbnail_link,
                license=licenses.get_license("CC BY-NC-ND",
                                             copyright_holder="AimHi"),
                files=[
                    files.YouTubeVideoFile(youtube_id=video_details["id"],
                                           language="en")
                ])

            print(video_details["id"] + " has been added!")
            seen_video_ids.add(video_details["id"])
            topic_node.add_child(video_node)

        # add topic to channel
        channel.add_child(topic_node)

    return channel
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    The entries of the playlist at ``PLAYLISTS_URL`` are fetched with
    youtube_dl (metadata only), ordered by the first number found in their
    title, and added directly to the channel as ``VideoNode`` children.
    Titles without any number are placed after the numbered ones.

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode
    Raises:
      Any exception raised while fetching or building the videos is printed
      (traceback on stdout) and re-raised unchanged.
    """
    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    first_number = re.compile(r"\d+")

    def title_order(entry):
        """Sort key: numbered titles by their first number, others last."""
        match = first_number.search(entry['title'])
        if match is None:
            return (1, 0)
        return (0, int(match.group()))

    # Download the playlist/video information
    try:
        with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
            info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)
            print(info_dict.keys())

            # Generate videos based off video entries in dict
            videos = sorted(info_dict['entries'], key=title_order)
            print([v['title'] for v in videos])

            # NOTE(review): this pause follows the title dump above and looks
            # like a leftover debugging aid -- confirm before removing.
            import time
            time.sleep(15)

            for video in videos:
                thumbnails = video.get('thumbnails') or []
                # None (not 0) when the entry has no thumbnail at all.
                thumbnail_url = thumbnails[0]['url'] if thumbnails else None
                channel.add_child(
                    nodes.VideoNode(
                        title=video['title'],
                        source_id=video['id'],
                        license=licenses.PublicDomainLicense(),
                        description=video['description'],
                        # Only derive a thumbnail when none was provided.
                        derive_thumbnail=not thumbnail_url,
                        files=[files.WebVideoFile(video['webpage_url'])],
                        thumbnail=thumbnail_url,
                        author=AUTHOR,
                        # tags = video['categories'] + video['tags'],  # TODO: uncomment this when added
                    ))
    except Exception:
        import sys
        import traceback
        traceback.print_exc(file=sys.stdout)
        raise

    # Check for errors in channel construction
    raise_for_invalid_channel(channel)
    return channel
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively convert the entries of the list `sourcetree` into nodes and
    attach them as children of `parent_node` (usually a `ChannelNode`).

    Topic entries recurse into their ``children``; exercise entries receive
    their ``questions``; every other supported kind receives its ``files``.

    Args:
        parent_node: node that receives the children via ``add_child``.
        sourcetree: list of dicts, each carrying at least a ``kind`` key.

    Returns:
        The `parent_node` that was passed in.

    Raises:
        NotImplementedError: when an entry's ``kind`` is not a supported one.
    """
    supported_kinds = [TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE,
                       DOCUMENT_NODE, HTML5_NODE, SLIDESHOW_NODE]

    for entry in sourcetree:
        kind = entry['kind']
        if kind not in supported_kinds:
            LOGGER.critical('Unexpected node kind found: ' + kind)
            raise NotImplementedError('Unexpected node kind found in json data.')

        if kind == TOPIC_NODE:
            # Topics have no license and no role (the role is computed
            # dynamically from descendants); their source_id is optional.
            topic = nodes.TopicNode(
                source_id=entry.get('source_id', None),
                title=entry['title'],
                description=entry.get('description'),
                author=entry.get('author'),
                aggregator=entry.get('aggregator'),
                provider=entry.get('provider'),
                language=entry.get('language'),
                thumbnail=entry.get('thumbnail'),
                tags=entry.get('tags'),
            )
            parent_node.add_child(topic)
            build_tree_from_json(topic, entry.get('children', []))
            continue

        # Keyword arguments shared by every content (non-topic) node.
        shared = dict(
            source_id=entry['source_id'],
            title=entry['title'],
            description=entry.get('description'),
            license=get_license(**entry['license']),
            author=entry.get('author'),
            aggregator=entry.get('aggregator'),
            provider=entry.get('provider'),
            role=entry.get('role', roles.LEARNER),
            language=entry.get('language'),
            thumbnail=entry.get('thumbnail'),
            tags=entry.get('tags'),
        )

        if kind == EXERCISE_NODE:
            # Exercises carry questions instead of files.
            exercise = nodes.ExerciseNode(
                exercise_data=entry.get('exercise_data'),
                questions=[],
                **shared
            )
            add_questions(exercise, entry.get('questions') or [])
            parent_node.add_child(exercise)
            continue

        if kind == VIDEO_NODE:
            child = nodes.VideoNode(
                # video-specific option
                derive_thumbnail=entry.get('derive_thumbnail', True),
                **shared
            )
        elif kind == AUDIO_NODE:
            child = nodes.AudioNode(**shared)
        elif kind == DOCUMENT_NODE:
            child = nodes.DocumentNode(**shared)
        elif kind == HTML5_NODE:
            child = nodes.HTML5AppNode(**shared)
        elif kind == SLIDESHOW_NODE:
            child = nodes.SlideshowNode(**shared)
        else:
            LOGGER.critical('Encountered an unknown kind: ' + str(entry))
            continue

        add_files(child, entry.get('files') or [])
        parent_node.add_child(child)

    return parent_node
def download_content_node(category_node, url, title, thumbnail=None,
                          description=None):
    """
    Download the page at `url` as an HTML5 app and add it to `category_node`.

    The page is parsed, its static assets are downloaded into a fresh
    temporary directory, a fixed list of page elements is removed, and the
    result is written to ``index.html`` and zipped into an ``HTML5AppNode``.
    When the page contains an ``.embedded-video iframe``, a ``VideoNode``
    for that video is added to `category_node` before the app node.

    Args:
        category_node: node that receives the new children via ``add_child``.
        url: address of the page; also used as the app node's source_id.
        title: title of the app node (passed through ``truncate_metadata``).
        thumbnail: optional thumbnail URL, downloaded next to the page.
        description: optional description of the app node.
    """
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    selectors_to_remove = (
        '#header',
        '.subMenuBarContainer',
        '.breadbookmarkcontainer',
        '.resourcePageTypeTitle',
        '.sharethis-wrapper',
        '.ccBlock',
        # NOTE(review): this selector is removed twice. Kept as-is because a
        # second call may remove a second matching element -- confirm
        # against remove_node before deduplicating.
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block',
        '.productSuggestionContainer',
        'footer',
        # For minilessons
        '.field-name-field-minilesson-downloadables',
        # For writing assessments
        '.assessmentTGLink',
        '.assessmentModelRubrics',
        '.view-display-id-attachment_1',
    )
    for selector in selectors_to_remove:
        remove_node(doc, selector)

    # Write out the HTML source. The encoding is explicit so the output does
    # not depend on (or fail under) the platform's default locale encoding.
    index_path = os.path.join(destination, "index.html")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail, destination, request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(" ... and with video titled %s from www.youtube.com/watch?v=%s"
              % (video_title, youtube_id))
        category_node.add_child(nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(video_title),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        ))

    zip_path = create_predictable_zip(destination)
    category_node.add_child(nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    ))