def _inline_svg_styles(raw_svg):
    """Parse SVG markup and copy its <style> rules onto the matching elements."""
    # renderPM doesn't read <style> tags, so add style to individual elements
    svg_contents = BeautifulSoup(raw_svg, 'html.parser')
    svg_contents = BeautifulSoup(
        svg_contents.find('svg').prettify(), 'html.parser')
    style_tag = svg_contents.find('style')
    if style_tag:
        sheet = cssutils.parseString(style_tag.string)
        for rule in sheet:
            class_name = rule.selectorText.lstrip('.')
            inline_style = "".join(
                "{}:{};".format(prop.name, prop.value) for prop in rule.style)
            for tag_name in ('rect', 'path', 'polygon'):
                for el in svg_contents.find_all(tag_name, {'class': class_name}):
                    el['style'] = inline_style

    # Beautifulsoup autocorrects some words to be all lowercase, so undo correction
    autocorrected_fields = ["baseProfile", "viewBox"]
    svg = svg_contents.find('svg')
    for field in autocorrected_fields:
        if svg.get(field.lower()):
            svg[field] = svg[field.lower()]
            del svg[field.lower()]
    return svg_contents


def get_thumbnail(url):
    """Fetch the thumbnail at ``url`` and return it as a local PNG ThumbnailFile.

    PNG sources are written to THUMBNAILS_DIRECTORY unchanged. SVG sources are
    written there after their <style> rules have been inlined, then rendered
    to a PNG next to them with svg2rlg / renderPM.

    Args:
        url: Thumbnail address. Must end with "png" or "svg", unless its file
            name contains "US_history" (special-cased below).

    Returns:
        files.ThumbnailFile pointing at the PNG.

    Raises:
        ValueError: if ``url`` ends with neither "png" nor "svg".
    """
    filename, _ext = os.path.splitext(os.path.basename(url))
    img_path = os.path.sep.join(
        [THUMBNAILS_DIRECTORY, "{}.png".format(filename)])
    svg_path = os.path.sep.join(
        [THUMBNAILS_DIRECTORY, "{}.svg".format(filename)])

    # This thumbnail gets converted with an error, so download it separately for now
    if "US_history" in filename:
        return files.ThumbnailFile(path="US_history.png")

    if url.endswith("png"):
        # Copy pngs to local storage. Download before opening the target so a
        # failed download does not leave an empty file behind.
        png_bytes = downloader.read(url)
        with open(img_path, 'wb') as pngobj:
            pngobj.write(png_bytes)
    elif url.endswith("svg"):
        svg_contents = _inline_svg_styles(downloader.read(url))
        with open(svg_path, 'wb') as svgobj:
            svgobj.write(svg_contents.renderContents())
        # Rasterize only after the SVG file has been closed (and so flushed).
        drawing = svg2rlg(svg_path)
        renderPM.drawToFile(drawing, img_path)
    else:
        # Fail loudly instead of dropping into a debugger and then returning a
        # path that was never written.
        raise ValueError(
            "Unsupported thumbnail format (expected png or svg): {}".format(url))

    return files.ThumbnailFile(path=img_path)
def add_files(node, file_list):
    """Attach a file object to ``node`` for every descriptor in ``file_list``.

    Each descriptor is a dict; the kind of file to create is decided by
    guess_file_type() from the node kind and the descriptor's 'path',
    'youtube_id', 'web_url' and 'encoding' entries.

    Args:
        node: Node receiving the files through node.add_file().
        file_list: Iterable of file descriptor dicts.

    Raises:
        UnknownFileTypeError: if a descriptor's type is not recognized.
        KeyError: if a descriptor lacks a key its type requires (for example
            'language' on a subtitle file).
    """
    for f in file_list:
        path = f.get('path')
        if path is not None:
            abspath = get_abspath(path)  # NEW: expand content:// --> ./content/ in file paths
        else:
            abspath = None

        file_type = guess_file_type(node.kind,
                                    filepath=abspath,
                                    youtube_id=f.get('youtube_id'),
                                    web_url=f.get('web_url'),
                                    encoding=f.get('encoding'))

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath,
                                          language=f.get('language'),
                                          ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'],
                                             high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'],
                                                 high_resolution=f.get('high_resolution')))
            # A YouTube video also gets its English subtitle track attached.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                                    language='en'))
        else:
            # Descriptors without a 'path' can reach this branch, so read the
            # key with .get(): a KeyError here would hide the real error.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(f.get('path')))
def scrape_collection_files(topic, url):
    """Add the assets listed in the JSON document at ``url`` as children of ``topic``.

    Each 'mp4' asset becomes a VideoNode added directly to ``topic``. All 'png'
    assets are collected and, if there are any, added together as one
    slideshow child. Assets with any other extension are logged and skipped.
    """
    slides = []
    for asset in json.loads(downloader.read(url))['data']:
        extension = asset['attributes']['extension']

        if extension == 'png':
            # NOTE(review): the replacement text below looks like a redacted
            # placeholder rather than a real file name - confirm against upstream.
            slides.append({
                'url': asset['attributes']['thumbnail_url'].replace(
                    'element.png', '*****@*****.**'),
                'caption': asset['attributes']['name'],
            })
            continue

        if extension != 'mp4':
            LOGGER.warning('Unable to add {} from {}'.format(
                asset['attributes']['extension'], url))
            continue

        # Video details live in a separate storage record keyed by asset id.
        storage_record = json.loads(
            downloader.read(FILE_STORAGE_URL.format(id=asset['id'])))
        video = storage_record['data'][0]['attributes']
        topic.add_child(
            nodes.VideoNode(
                source_id=video['url'],
                title=asset['attributes']['name'],
                license=LICENSE,
                files=[
                    files.VideoFile(video['url']),
                    files.ThumbnailFile(video['thumbnail_url']),
                ],
            ))

    # Add images to slideshow node
    if slides:
        topic.add_child(create_slideshow(slides, url, topic.title, 'English'))
def get_thumbnail(url):
    """Screenshot the page at ``url`` and return it as a ThumbnailFile.

    Hacky method to get images, but much more lightweight than converting
    svg to png: the URL is opened in a PhantomJS webdriver and a screenshot
    is saved under DOWNLOAD_DIRECTORY as "<url basename without extension>.png".

    Args:
        url: Address of the image or page to capture.

    Returns:
        files.ThumbnailFile pointing at the saved screenshot.
    """
    filename, _ext = os.path.splitext(os.path.basename(url))
    img_path = "{}{}{}.png".format(DOWNLOAD_DIRECTORY, os.path.sep, filename)

    driver = webdriver.PhantomJS()
    try:
        driver.set_script_timeout(30)
        driver.get(url)
        driver.save_screenshot(img_path)
    finally:
        # Always shut the browser down; otherwise each call leaves a
        # PhantomJS process running.
        driver.quit()

    return files.ThumbnailFile(path=img_path)
def add_files(node, file_list):
    """Attach thumbnail / base64 image files described by ``file_list`` to ``node``.

    Args:
        node: Node receiving the files through node.add_file().
        file_list: Iterable of file descriptor dicts; the type is decided by
            guess_file_type() from 'path', 'youtube_id', 'web_url' and
            'encoding'.

    Raises:
        UnknownFileTypeError: if a descriptor is neither a thumbnail nor a
            base64-encoded image.
    """
    for f in file_list:
        file_type = guess_file_type(node.kind,
                                    filepath=f.get('path'),
                                    youtube_id=f.get('youtube_id'),
                                    web_url=f.get('web_url'),
                                    encoding=f.get('encoding'))

        if file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=f['path']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        else:
            # 'path' may be absent on an unrecognized descriptor; .get() keeps
            # a KeyError from hiding the real error.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(f.get('path')))
def create_slideshow(images, source_id, title, language_name):
    """Turn a list of images into a slideshow node or a PDF document node.

    images: list of {url: str, caption: str}; the first image is used as the
        thumbnail, so the list must not be empty.

    When '--slides' is present in sys.argv a SlideshowNode holding one
    SlideImageFile per image is returned. Otherwise the images are combined
    into a single PDF under DOCUMENT_DOWNLOAD_DIR, named after the MD5 of
    ``source_id`` and reused if it already exists, and a DocumentNode is
    returned.
    """

    def load_page(image_url):
        # RGBA images are converted to RGB before being saved into the PDF.
        page = Image.open(BytesIO(downloader.read(image_url)))
        return page.convert('RGB') if page.mode == 'RGBA' else page

    cover = files.ThumbnailFile(images[0]['url'])

    if '--slides' not in sys.argv:
        # Create PDF
        digest = hashlib.md5(source_id.encode('utf-8')).hexdigest()
        pdfpath = '{}{}{}.pdf'.format(DOCUMENT_DOWNLOAD_DIR, os.path.sep, digest)
        if not os.path.exists(pdfpath):
            pages = [load_page(entry['url']) for entry in images]
            first_page, other_pages = pages[0], pages[1:]
            first_page.save(pdfpath, save_all=True, append_images=other_pages)
        return nodes.DocumentNode(
            source_id=source_id,
            title=title,
            license=LICENSE,
            language=LANGUAGE_MAP[language_name],
            files=[cover, files.DocumentFile(pdfpath)],
        )

    slide_files = []
    for entry in images:
        slide_files.append(
            files.SlideImageFile(entry['url'], caption=entry.get('caption', '')))
    return nodes.SlideshowNode(
        source_id=source_id,
        title=title,
        license=LICENSE,
        language=LANGUAGE_MAP[language_name],
        files=[cover] + slide_files,
    )
def add_files(node, file_list):
    """Attach thumbnail and video files described by ``file_list`` to ``node``.

    Args:
        node: Node receiving the files through node.add_file().
        file_list: Iterable of file descriptor dicts; 'path' is resolved with
            get_abspath() and the type is decided by guess_file_type().

    Raises:
        UnknownFileTypeError: if a descriptor is neither a thumbnail nor a
            video file.
    """
    for f in file_list:
        path = f.get('path')
        if path is not None:
            abspath = get_abspath(path)  # NEW: expand content:// --> ./content/ in file paths
        else:
            abspath = None

        # NOTE(review): looks like leftover debug output; kept so that the
        # function's printed output is unchanged.
        print("kind:" + node.kind.upper())
        file_type = guess_file_type(node.kind, filepath=abspath)

        if file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(
                files.VideoFile(path=abspath, language=f.get('language')))
        else:
            # A descriptor without 'path' ends up here; .get() keeps a
            # KeyError from hiding the real error.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(f.get('path')))
def add_files(node, file_list):
    """Attach a file object to ``node`` for every descriptor in ``file_list``.

    Each descriptor is a dict whose 'file_type' entry selects the file class.
    Remaining entries ('path', 'youtube_id', 'web_url', 'encoding',
    'language', ...) are passed to that class.

    Args:
        node: Node receiving the files through node.add_file().
        file_list: Iterable of file descriptor dicts.

    Raises:
        NotImplementedError: if a descriptor's 'file_type' is not one of the
            expected types.
        KeyError: if a descriptor lacks a key its type requires ('path' for
            local video/audio files, 'language' for subtitles).
        UnknownFileTypeError: defensive fallback for an expected type that has
            no handling branch.
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'],
                                language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path,
                                   language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path,
                                  language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding'], ))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Only reachable if a type is added to EXPECTED_FILE_TYPES without
            # a branch above. Read 'path' with .get() so a descriptor without
            # one cannot turn this into a KeyError.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(f.get('path')))