def add_content_to_tree(self, channel):
    """Attach the English class/subject/video hierarchy to ``channel``.

    ``self.channel_tree['English']`` is walked as
    ``{class_name: {subject_name: {'items': [...]}}}``. Each class and each
    subject becomes a ``TopicNode`` and each item becomes a ``VideoNode``
    whose fields are read from the item through ``get_column``.

    Source ids are hierarchical ("English-<class>-<subject>-<item id>"),
    except the class topic itself, which uses the bare class name.
    """
    tree = self.channel_tree
    language_name = 'English'
    language = getlang("en")

    def build_video(item, parent_id):
        # One VideoNode per item, holding a single video file.
        return nodes.VideoNode(
            source_id="{}-{}".format(parent_id, get_column(item, 'id')),
            title=get_column(item, 'name'),
            description=get_column(item, 'description'),
            files=[files.VideoFile(path=get_column(item, 'file'))],
            language=language,
            # FIXME: Use the column's license field instead of hardcoding.
            license=licenses.get_license(
                le_licenses.CC_BY,
                copyright_holder=get_column(item, "copyright"),
            ),
            # The item's "thumbnail" column is not passed on yet.
        )

    for class_name in tree[language_name]:
        subjects = tree[language_name][class_name]
        class_prefix = "{}-{}".format(language_name, class_name)
        class_topic = nodes.TopicNode(source_id=class_name, title=class_name)

        for subject_name in subjects:
            subject_id = "{}-{}".format(class_prefix, subject_name)
            subject_topic = nodes.TopicNode(source_id=subject_id,
                                            title=subject_name)
            for item in subjects[subject_name]['items']:
                subject_topic.add_child(build_video(item, subject_id))
            class_topic.add_child(subject_topic)

        channel.add_child(class_topic)
def add_files(node, file_list):
    """Attach one file object to ``node`` for every descriptor in ``file_list``.

    Each descriptor is a dict. Its optional ``path`` is resolved with
    ``get_abspath``; the concrete file class is then chosen from the type
    returned by ``guess_file_type``.

    Raises:
        UnknownFileTypeError: if the guessed type matches none of the
            handled file types.
        KeyError: if a descriptor lacks a key its file type requires
            (``language`` for subtitles, ``encoding``, ``web_url``,
            ``youtube_id``).
    """
    for f in file_list:
        path = f.get('path')
        # NEW: expand content:// --> ./content/ in file paths
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath,
                                          language=f.get('language'),
                                          ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'],
                                             high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'],
                                                 high_resolution=f.get('high_resolution')))
            # A YouTube video always gets an English subtitle file as well.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                                    language='en'))
        else:
            # Use the already-fetched (possibly None) path: indexing
            # f['path'] here would raise KeyError for descriptors without
            # a path and hide the real error.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
def scrape_collection_files(topic, url):
    """Add the assets listed at ``url`` to ``topic``.

    The JSON document at ``url`` is expected to hold the asset list under
    ``data``. ``mp4`` assets become ``VideoNode`` children (video details
    are fetched from ``FILE_STORAGE_URL``); ``png`` assets are collected and
    added as a single slideshow node at the end; any other extension is
    logged and skipped.
    """
    slides = []

    for asset in json.loads(downloader.read(url))['data']:
        extension = asset['attributes']['extension']

        if extension == 'png':
            slides.append({
                'url': asset['attributes']['thumbnail_url'].replace(
                    'element.png', '*****@*****.**'),
                'caption': asset['attributes']['name'],
            })
            continue

        if extension != 'mp4':
            LOGGER.warning('Unable to add {} from {}'.format(
                asset['attributes']['extension'], url))
            continue

        # Video details live in a separate document keyed by the asset id.
        storage_url = FILE_STORAGE_URL.format(id=asset['id'])
        video = json.loads(downloader.read(storage_url))['data'][0]['attributes']
        video_node = nodes.VideoNode(
            source_id=video['url'],
            title=asset['attributes']['name'],
            license=LICENSE,
            files=[
                files.VideoFile(video['url']),
                files.ThumbnailFile(video['thumbnail_url']),
            ],
        )
        topic.add_child(video_node)

    # Add images to slideshow node
    if slides:
        topic.add_child(create_slideshow(slides, url, topic.title, 'English'))
def scrape_directory(topic, directory, indent=1):
    """Mirror ``directory`` under ``topic``, recursing into sub-folders.

    Only the first ``os.walk`` entry (the directory itself) is consumed;
    nested folders are handled by the recursive call. Each folder becomes a
    ``TopicNode``, each ``.mp4`` a ``VideoNode``, and each ``.pdf`` is passed
    through ``PDFParser`` and ``generate_pdf_nodes``. Other files are
    ignored. ``indent`` only controls the prefix of the printed folder name.
    """
    top_level = next(os.walk(directory), None)
    if top_level is None:
        # os.walk produced nothing for this path, so there is nothing to add.
        return
    subdirectory, folders, myfiles = top_level

    # Go through all of the folders under directory
    for folder in folders:
        print('{}{}'.format(' ' * indent, folder))
        subtopic = nodes.TopicNode(source_id=folder, title=folder)
        topic.add_child(subtopic)
        # Go through folders under directory
        child_dir = os.sep.join([subdirectory, folder])
        scrape_directory(subtopic, child_dir, indent=indent + 1)

    for file in myfiles:
        name, ext = os.path.splitext(file)
        if ext == '.mp4':
            video = nodes.VideoNode(
                source_id=subdirectory + file,
                title=name,
                license=LICENSE,
                copyright_holder=COPYRIGHT_HOLDER,
            )
            video.add_file(files.VideoFile(os.sep.join([subdirectory, file])))
            topic.add_child(video)
        elif ext == '.pdf':
            pdf_path = os.path.sep.join([subdirectory, file])
            with PDFParser(pdf_path) as parser:
                chapters = parser.get_data_file()
                generate_pdf_nodes(chapters, topic,
                                   source=os.path.basename(file))
def to_contentnode(self, title, directory=None, *args, **kwargs):
    """Build a content node for this object according to ``self.kind``.

    The file is first produced with ``self.to_file(directory=directory)``.
    HTML5 kinds yield an ``HTML5AppNode`` with an ``HTMLZipFile``; video
    kinds yield a ``VideoNode`` with a ``VideoFile``. Any other kind returns
    ``None``. Extra positional arguments are accepted but unused; keyword
    arguments are forwarded to the node constructor.
    """
    local_path = self.to_file(directory=directory)

    # Generate a node based on the kind attribute
    if self.kind == content_kinds.HTML5:
        node_class, file_class = nodes.HTML5AppNode, files.HTMLZipFile
    elif self.kind == content_kinds.VIDEO:
        node_class, file_class = nodes.VideoNode, files.VideoFile
    else:
        return None

    return node_class(source_id=self.url,
                      title=title,
                      files=[file_class(local_path)],
                      **kwargs)
def scrape_iversity(channel):
    """Scrape the course's lesson-unit page and add its videos to ``channel``.

    Each ``chapter-units-wrapper`` block becomes a ``TopicNode``. Each
    ``unit-wrapper`` link inside it that carries a ``unit_video`` icon
    becomes a ``VideoNode`` with the 480 source and its subtitle track;
    other lessons are only logged.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info(" Scraping Migration Matters at {}".format(url))
    page = read_source(url)

    for chapter in page.find_all('div', {'class': 'chapter-units-wrapper'}):
        title = str(chapter.find('div', {'class': 'chapter-title'}).string)
        topic = nodes.TopicNode(source_id=title.strip().replace(" ", "_"),
                                title=title)

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            has_video = lesson.find('i', {'class': 'unit_video'})
            video_title = str(
                lesson.find('span', {'class': 'unit-title'}).string).strip()

            if not has_video:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(video_title))
                continue

            # The lesson page holds the <video> element with its sources
            # and subtitle track.
            lesson_page = read_source(
                "{}{}".format(BASE_URL, lesson.attrs["href"]))
            video_tag = lesson_page.find('video')

            subtitle_src = video_tag.find(
                'track', {'kind': 'subtitles'}).attrs["src"]
            subtitle_file = files.SubtitleFile(
                path=subtitle_src, language=languages.getlang('en').code)

            video_src = video_tag.find('source', {'res': '480'}).attrs["src"]
            video_file = files.VideoFile(
                path=video_src, language=languages.getlang('en').code)

            video_node = nodes.VideoNode(
                source_id=video_title.replace(" ", "_"),
                title=video_title,
                files=[video_file, subtitle_file],
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            LOGGER.info(" Uploading video - {}".format(video_title))
            topic.add_child(video_node)

        channel.add_child(topic)
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Build the node class matching `kind` (video, audio or document) from the
    required arguments plus the `author`, `description` and `thumbnail`
    entries of `optionals`. Returns None for any other kind.
    """
    if kind == content_kinds.VIDEO:
        node_class, file_class = nodes.VideoNode, files.VideoFile
    elif kind == content_kinds.AUDIO:
        node_class, file_class = nodes.AudioNode, files.AudioFile
    elif kind == content_kinds.DOCUMENT:
        node_class, file_class = nodes.DocumentNode, files.DocumentFile
    else:
        return None

    node_kwargs = {
        "source_id": source_id,
        "title": title,
        "license": license,
        "author": optionals.get("author"),
        "description": optionals.get("description"),
    }
    if node_class is nodes.VideoNode:
        # video-specific data: the thumbnail is derived, not read from optionals
        node_kwargs["derive_thumbnail"] = True
    else:
        node_kwargs["thumbnail"] = optionals.get("thumbnail")
    node_kwargs["files"] = [file_class(path=filepath)]

    return node_class(**node_kwargs)
def scrape_content(endpoint, channel, existingNode=None):
    """Recursively scrape the listing at ``endpoint`` into the channel tree.

    Every ``td.text-xs-left`` cell of the listing table is inspected through
    its ``data-sort-value`` attribute:

    * values ending in ``.mp4`` become ``VideoNode`` children;
    * values starting with ``dir`` become ``TopicNode`` children and are
      scraped recursively;
    * anything else is logged and skipped.

    Args:
        endpoint: URL of the listing page to read.
        channel: root node; receives nodes found at the top level.
        existingNode: topic that receives nodes found at this level. When
            omitted, nodes are attached directly to ``channel``.
    """
    replacements = {" ": "%20", "#": "%23"}
    # Both videos and topics attach to the same parent. Previously only the
    # directory branch fell back to ``channel``, so a top-level .mp4 raised
    # AttributeError on ``None.add_child``.
    parent = existingNode if existingNode is not None else channel

    content = read_source(endpoint)
    attributes = content.find("tbody").find_all("td", "text-xs-left")

    for attribute in attributes:
        source_id = attribute.attrs["data-sort-value"]

        # Check if it is mp4 file
        if source_id.endswith(".mp4"):
            video_info = attribute.find("a")
            video_title, _ext = splitext(str(video_info.string))
            # Drop the first character of the href before appending to BASE_URL.
            filter_video_link = video_info.attrs["href"][1:].replace(
                " ", "%20")
            video_link = BASE_URL + filter_video_link
            video_file = files.VideoFile(path=video_link)
            video_node = nodes.VideoNode(source_id=source_id,
                                         title=video_title,
                                         files=[video_file],
                                         license=CHANNEL_LICENSE)
            parent.add_child(video_node)

        # Check if it is a directory
        elif source_id.startswith("dir"):
            title = str(attribute.find("strong").string)
            topic_node = nodes.TopicNode(source_id=source_id, title=title)
            parent.add_child(topic_node)

            # The sub-listing URL is the escaped directory title appended to
            # the current endpoint.
            new_end_point = replace_all(title, replacements)
            new_end = endpoint + "{}/".format(new_end_point)
            scrape_content(new_end, channel, topic_node)

        else:
            LOGGER.info(
                "Format of the file is not supported by the sushi chef : {}".
                format(source_id))
def video_node_from_dropbox(self, video_details, link, token):
    """Return a ``VideoNode`` for the video behind a Dropbox shared link.

    The shared file is requested with the given ``token``; its content is
    written under ``VIDEO_FOLDER`` unless a file with the same name is
    already there. The node takes its title from ``video_details["title"]``
    and uses ``link`` as its source id.
    """
    client = dropbox.Dropbox(token)
    metadata, response = client.sharing_get_shared_link_file(url=link)

    # get relative path to video file
    local_path = os.path.relpath(os.path.join(VIDEO_FOLDER, metadata.name))

    if os.path.isfile(local_path):
        LOGGER.info("{} already downloaded. Skipping".format(
            metadata.name))
    else:
        with open(local_path, 'wb') as out:
            out.write(response.content)

    video_file = files.VideoFile(path=local_path)
    return nodes.VideoNode(
        title=video_details["title"],
        source_id=link,
        license=licenses.CC_BYLicense("TicTacLearn"),
        files=[video_file],
    )
def add_files(node, file_list):
    """Attach each entry of ``file_list`` to ``node`` as a file object.

    Entries are passed straight through as the ``path`` of the created file,
    so they are expected to be file paths. The type string returned by
    ``parse_file_name`` selects the class: ``mp3`` -> audio, ``pdf`` ->
    document, ``mp4`` -> video. No other file type is handled.

    Raises:
        UnknownFileTypeError: for any other file type.
    """
    for f in file_list:
        _file_name, file_type = parse_file_name(f)
        print(f)
        if file_type == 'mp3':
            node.add_file(files.AudioFile(path=f))
        elif file_type == 'pdf':
            node.add_file(files.DocumentFile(path=f))
        elif file_type == 'mp4':
            node.add_file(files.VideoFile(path=f))
        else:
            # ``f`` is the path itself; indexing it with f['path'] raised
            # TypeError and hid the intended error.
            raise UnknownFileTypeError("Unrecognised file type '{0}'".format(f))
def add_files(node, file_list):
    """Attach thumbnail and video files from ``file_list`` to ``node``.

    Each descriptor is a dict. Its optional ``path`` is resolved with
    ``get_abspath`` and the file type is guessed from ``node.kind`` and that
    path. Only thumbnail and video files are handled.

    Raises:
        UnknownFileTypeError: if the guessed type is neither a thumbnail
            nor a video file.
    """
    for f in file_list:
        path = f.get('path')
        # NEW: expand content:// --> ./content/ in file paths
        abspath = get_abspath(path) if path is not None else None

        print("kind:" + node.kind.upper())
        file_type = guess_file_type(node.kind, filepath=abspath)

        if file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(
                files.VideoFile(path=abspath, language=f.get('language')))
        else:
            # Use the already-fetched (possibly None) path: f['path'] would
            # raise KeyError for descriptors without a path.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def add_files(node, file_list):
    """Attach a file object to ``node`` for every descriptor in ``file_list``.

    Each descriptor is a dict whose ``file_type`` must be one of the expected
    file-type constants. Videos may come from YouTube (``youtube_id``), a web
    URL (``web_url``) or a plain ``path``. Thumbnails may be given as a
    base64 ``encoding`` or a ``path``. Subtitles may come from YouTube or a
    ``path``.

    Raises:
        NotImplementedError: if ``file_type`` is not an expected type.
        KeyError: if a required key is missing (``path`` for plain video and
            audio files, ``language`` for subtitles).
        UnknownFileTypeError: defensive fallback for an expected type that
            has no handler below.
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                # A plain video file needs a path; a missing key raises KeyError.
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'],
                                language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path,
                                   language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path,
                                  language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding'], ))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Reached only if a type is added to EXPECTED_FILE_TYPES without
            # a handler above. Use the safely fetched path so this branch
            # cannot raise KeyError itself.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def crawl_each_post(post_url):
    """Download the YouTube videos embedded in a post and file them by course.

    The course name is the part of the post heading after " OF " or
    " FROM ". Each ``wpb_video_wrapper`` iframe is downloaded with
    youtube-dl (automatic subtitles requested), wrapped in a ``VideoNode``
    and appended to ``EPISODE_DICT[course]``. A post without video wrappers
    is only logged.
    """
    resp = requests.get(post_url, headers=HEADERS)
    soup = BeautifulSoup(resp.content, "html.parser")
    wrapper = soup.find('div', {'class': 'wpb_wrapper'})
    course_name = wrapper.find(
        'div', {'class': 'vc_custom_heading'}).getText().strip()

    delimiters = " OF ", " FROM "
    regex_pattern = '|'.join(re.escape(d) for d in delimiters)
    course = re.split(regex_pattern, course_name)[1]

    video_wrappers = wrapper.find_all('div', {'class': 'wpb_video_wrapper'})
    if not video_wrappers:
        LOGGER.info(
            "Format of the file is not supported by the sushi chef : {}".
            format(course_name))
        return

    for each_wrapper in video_wrappers:
        # The video id is the last path segment of the embed URL, with any
        # "?feature..." suffix cut off.
        embed_src = each_wrapper.find('iframe').attrs["src"]
        video_id = embed_src.split("?feature")[0].split("/")[-1]

        ydl = youtube_dl.YoutubeDL({
            'outtmpl': './downloads/%(id)s.%(ext)s',
            'writeautomaticsub': True,
            'logger': LOGGER
        })
        with ydl:
            result = ydl.extract_info(
                "http://www.youtube.com/watch?v={}".format(video_id),
                download=True)
        video = result['entries'][0] if 'entries' in result else result

        video_title = video["title"]
        video_file = files.VideoFile(
            path="{}/{}.mp4".format(DOWNLOAD_DIRECTORY, video_id),
            language=languages.getlang('en').code)
        video_subtitle = files.SubtitleFile(
            path="{}/{}.en.vtt".format(DOWNLOAD_DIRECTORY, video_id),
            language=languages.getlang('en').code)
        video_node = nodes.VideoNode(
            source_id=video_title.strip().replace(" ", "_"),
            title=video_title,
            files=[video_file, video_subtitle],
            license=CHANNEL_LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
        )

        EPISODE_DICT.setdefault(course, []).append(video_node)
        LOGGER.info(" Uploading video - {}".format(video_title.strip()))