示例#1
0
def get_thumbnail(url):
    """Fetch the thumbnail at ``url`` and return a ``files.ThumbnailFile`` for it.

    PNG sources are copied into ``THUMBNAILS_DIRECTORY`` unchanged. SVG
    sources are rewritten (styles inlined, attribute casing restored), saved
    next to the PNG and then rendered to PNG with ``svg2rlg``/``renderPM``.

    Args:
        url: Location of the thumbnail; the suffix (``png`` or ``svg``)
            selects how it is processed.

    Returns:
        A ``files.ThumbnailFile`` pointing at the local PNG.

    Raises:
        ValueError: If ``url`` ends with neither ``png`` nor ``svg``.
    """
    filename, _ext = os.path.splitext(os.path.basename(url))
    img_path = os.path.sep.join(
        [THUMBNAILS_DIRECTORY, "{}.png".format(filename)])
    svg_path = os.path.sep.join(
        [THUMBNAILS_DIRECTORY, "{}.svg".format(filename)])

    # This thumbnail gets converted with an error, so download it separately for now
    if "US_history" in filename:
        return files.ThumbnailFile(path="US_history.png")

    # Copy pngs to local storage
    if url.endswith("png"):
        with open(img_path, 'wb') as pngobj:
            pngobj.write(downloader.read(url))

    elif url.endswith("svg"):
        # Download and parse before touching the output file so that a failed
        # download or parse does not leave an empty .svg behind.
        svg_contents = BeautifulSoup(downloader.read(url), 'html.parser')
        svg_contents = BeautifulSoup(
            svg_contents.find('svg').prettify(), 'html.parser')

        # renderPM doesn't read <style> tags, so add style to individual elements
        style_tag = svg_contents.find('style')
        if style_tag is not None:
            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                class_name = rule.selectorText.lstrip('.')
                inline_style = "".join(
                    "{}:{};".format(prop.name, prop.value)
                    for prop in rule.style)
                # Each matching element gets this rule's declarations as its
                # whole style attribute (a later matching rule overwrites it).
                for el in svg_contents.find_all(
                        ['rect', 'path', 'polygon'], {'class': class_name}):
                    el['style'] = inline_style

        # Beautifulsoup autocorrects some words to be all lowercase, so undo correction
        autocorrected_fields = ["baseProfile", "viewBox"]
        svg = svg_contents.find('svg')
        for field in autocorrected_fields:
            lowered = field.lower()
            if svg.get(lowered):
                svg[field] = svg[lowered]
                del svg[lowered]

        with open(svg_path, 'wb') as svgobj:
            svgobj.write(svg_contents.renderContents())
        drawing = svg2rlg(svg_path)
        renderPM.drawToFile(drawing, img_path)

    else:
        # Previously this dropped into pdb and then returned a path to a file
        # that was never written; fail loudly instead.
        raise ValueError(
            "Unsupported thumbnail format (expected png or svg): {}".format(url))

    return files.ThumbnailFile(path=img_path)
示例#2
0
def add_files(node, file_list):
    """Attach a file object to ``node`` for every entry in ``file_list``.

    Each entry is a dict. Its ``path``, ``youtube_id``, ``web_url`` and
    ``encoding`` values (any may be absent) are passed to ``guess_file_type``
    together with ``node.kind``, and the resulting type selects which
    ``files.*`` class is built and added via ``node.add_file``.

    Args:
        node: Object exposing ``kind`` and ``add_file``.
        file_list: Iterable of dicts describing the files.

    Raises:
        UnknownFileTypeError: If ``guess_file_type`` returns a type this
            function does not handle.
        KeyError: If an entry lacks a key its file type requires
            (``language`` for subtitles, ``encoding``, ``web_url`` or
            ``youtube_id`` for the corresponding types).
    """
    for f in file_list:

        path = f.get('path')
        # expand  content://  -->  ./content/  in file paths
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(node.kind, filepath=abspath, youtube_id=f.get('youtube_id'), web_url=f.get('web_url'), encoding=f.get('encoding'))

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            # A YouTube video also gets an English subtitle file attached.
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Use the already-fetched (possibly None) path: indexing f['path']
            # here raised KeyError for entries without a path and hid the
            # real error.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
示例#3
0
def scrape_collection_files(topic, url):
    """Read the asset listing at ``url`` and add the resulting nodes to ``topic``.

    The JSON payload's ``data`` list is walked once:

    * ``png`` assets are collected as ``{'url', 'caption'}`` dicts and, if
      any were found, turned into a single child via ``create_slideshow``.
    * ``mp4`` assets trigger a second request (``FILE_STORAGE_URL`` formatted
      with the asset id) and are added as ``nodes.VideoNode`` children.
    * any other extension is only logged as a warning.

    Args:
        topic: Node exposing ``add_child`` and ``title``.
        url: Address of the JSON asset listing.
    """
    slides = []
    for asset in json.loads(downloader.read(url))['data']:
        extension = asset['attributes']['extension']

        if extension == 'png':
            # NOTE(review): the replacement text below looks like it was
            # masked by an e-mail scrubber -- confirm the intended file name.
            image_url = asset['attributes']['thumbnail_url'].replace(
                'element.png', '*****@*****.**')
            slides.append({
                'url': image_url,
                'caption': asset['attributes']['name'],
            })
            continue

        if extension == 'mp4':
            raw_details = downloader.read(
                FILE_STORAGE_URL.format(id=asset['id']))
            video = json.loads(raw_details)['data'][0]['attributes']
            video_node = nodes.VideoNode(
                source_id=video['url'],
                title=asset['attributes']['name'],
                license=LICENSE,
                files=[
                    files.VideoFile(video['url']),
                    files.ThumbnailFile(video['thumbnail_url']),
                ],
            )
            topic.add_child(video_node)
            continue

        LOGGER.warning('Unable to add {} from {}'.format(
            asset['attributes']['extension'], url))

    # Add images to slideshow node
    if slides:
        topic.add_child(create_slideshow(slides, url, topic.title, 'English'))
def get_thumbnail(url):
    """Screenshot the page at ``url`` and return it as a ``files.ThumbnailFile``.

    The page is opened in a PhantomJS WebDriver and the screenshot is saved
    as ``<DOWNLOAD_DIRECTORY>/<basename of url without extension>.png``.

    Args:
        url: Address of the resource to capture.

    Returns:
        A ``files.ThumbnailFile`` pointing at the saved PNG.
    """
    # Hacky method to get images, but much more lightweight than converting svg to png
    filename, _ext = os.path.splitext(os.path.basename(url))
    img_path = "{}{}{}.png".format(DOWNLOAD_DIRECTORY, os.path.sep, filename)
    driver = webdriver.PhantomJS()
    try:
        driver.set_script_timeout(30)
        driver.get(url)
        driver.save_screenshot(img_path)
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running.
        driver.quit()
    return files.ThumbnailFile(path=img_path)
示例#5
0
def add_files(node, file_list):
    """Attach thumbnail or base64 image files from ``file_list`` to ``node``.

    Each entry is a dict whose ``path``, ``youtube_id``, ``web_url`` and
    ``encoding`` values (any may be absent) are passed to ``guess_file_type``
    along with ``node.kind``. Only the thumbnail and base64 types are
    handled here.

    Args:
        node: Object exposing ``kind`` and ``add_file``.
        file_list: Iterable of dicts describing the files.

    Raises:
        UnknownFileTypeError: If the guessed type is neither a thumbnail nor
            a base64 image.
    """
    for f in file_list:
        file_type = guess_file_type(node.kind, filepath=f.get('path'), youtube_id=f.get('youtube_id'), web_url=f.get('web_url'), encoding=f.get('encoding'))

        if file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=f['path']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        else:
            # .get() keeps the intended error visible for entries that have
            # no 'path' key (f['path'] would raise KeyError instead).
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(f.get('path')))
示例#6
0
def create_slideshow(images, source_id, title, language_name):
    """Build a node presenting ``images`` as a slideshow or as a PDF document.

    When ``--slides`` is present in ``sys.argv`` a ``nodes.SlideshowNode`` is
    returned with one ``files.SlideImageFile`` per image. Otherwise the
    images are downloaded and combined into a multi-page PDF (skipped when
    the PDF already exists on disk) and a ``nodes.DocumentNode`` is returned.
    In both cases the first image is used as the thumbnail.

    Args:
        images: Non-empty list of dicts with a ``url`` key and an optional
            ``caption`` key.
        source_id: Source identifier for the node; its MD5 hex digest is
            used as the PDF file name.
        title: Title given to the node.
        language_name: Key looked up in ``LANGUAGE_MAP``.

    Returns:
        A ``nodes.SlideshowNode`` or a ``nodes.DocumentNode``.
    """

    cover = files.ThumbnailFile(images[0]['url'])

    if '--slides' in sys.argv:
        slide_files = []
        for entry in images:
            slide_files.append(
                files.SlideImageFile(entry['url'],
                                     caption=entry.get('caption', '')))
        return nodes.SlideshowNode(
            source_id=source_id,
            title=title,
            license=LICENSE,
            language=LANGUAGE_MAP[language_name],
            files=[cover] + slide_files,
        )

    # Create PDF
    digest = hashlib.md5(source_id.encode('utf-8')).hexdigest()
    pdfpath = '{}{}{}.pdf'.format(DOCUMENT_DOWNLOAD_DIR, os.path.sep, digest)

    if not os.path.exists(pdfpath):

        def _load_page(entry):
            # RGBA images are converted to RGB before being written to the PDF.
            page = Image.open(BytesIO(downloader.read(entry['url'])))
            return page.convert('RGB') if page.mode == 'RGBA' else page

        pages = [_load_page(entry) for entry in images]
        first_page, remaining_pages = pages[0], pages[1:]
        first_page.save(pdfpath, save_all=True, append_images=remaining_pages)

    return nodes.DocumentNode(
        source_id=source_id,
        title=title,
        license=LICENSE,
        language=LANGUAGE_MAP[language_name],
        files=[cover, files.DocumentFile(pdfpath)],
    )
def add_files(node, file_list):
    """Attach thumbnail and video files from ``file_list`` to ``node``.

    Each entry is a dict with an optional ``path``; the path is expanded with
    ``get_abspath`` and passed to ``guess_file_type`` along with
    ``node.kind``. Only thumbnail and video types are handled here.

    Args:
        node: Object exposing ``kind`` and ``add_file``.
        file_list: Iterable of dicts describing the files.

    Raises:
        UnknownFileTypeError: If the guessed type is neither a thumbnail nor
            a video.
    """
    for f in file_list:
        path = f.get('path')
        # expand  content://  -->  ./content/  in file paths
        abspath = get_abspath(path) if path is not None else None

        print("kind:" + node.kind.upper())

        file_type = guess_file_type(node.kind, filepath=abspath)

        if file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))

        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(
                files.VideoFile(path=abspath, language=f.get('language')))

        else:
            # Report the already-fetched (possibly None) path; f['path']
            # raised KeyError for entries without a path and hid this error.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
示例#8
0
def add_files(node, file_list):
    """Attach a file object to ``node`` for every entry in ``file_list``.

    Each entry is a dict whose ``file_type`` must be one of the video, audio,
    document, HTML5, thumbnail or subtitles constants. Further keys select
    the concrete ``files.*`` class:

    * video: ``youtube_id`` -> YouTube video, else ``web_url`` -> web video,
      else a local/remote ``path``;
    * thumbnail: ``encoding`` -> base64 image, else ``path``;
    * subtitles: ``youtube_id`` -> YouTube subtitles, else ``path``.

    Args:
        node: Object exposing ``add_file``.
        file_list: Iterable of dicts describing the files.

    Raises:
        NotImplementedError: If an entry's ``file_type`` is not one of the
            expected constants (the value is logged at critical level first).
        KeyError: If an entry lacks a key its branch indexes directly
            (``path`` for plain video/audio, ``language`` for subtitles).
    """
    supported_types = (
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE,
    )

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in supported_types:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        # path can be an URL or a local path (or None)
        path = f.get('path')
        language = f.get('language', None)

        # Build the file object for this entry; it is added to the node once
        # at the bottom of the loop.
        if file_type == VIDEO_FILE:
            if 'youtube_id' in f or 'web_url' in f:
                # YouTube and web videos share the same download options.
                remote_options = {
                    'download_settings': f.get('download_settings', None),
                    'high_resolution': f.get('high_resolution', True),
                    'maxheight': f.get('maxheight', None),
                    'language': language,
                }
                if 'youtube_id' in f:
                    new_file = files.YouTubeVideoFile(
                        youtube_id=f['youtube_id'], **remote_options)
                else:
                    new_file = files.WebVideoFile(
                        web_url=f['web_url'], **remote_options)
            else:
                new_file = files.VideoFile(
                    path=f['path'],
                    language=language,
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )

        elif file_type == AUDIO_FILE:
            new_file = files.AudioFile(path=f['path'], language=language)

        elif file_type == DOCUMENT_FILE:
            new_file = files.DocumentFile(path=path, language=language)

        elif file_type == HTML5_FILE:
            new_file = files.HTMLZipFile(path=path, language=language)

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                new_file = files.Base64ImageFile(encoding=f['encoding'])
            else:
                new_file = files.ThumbnailFile(path=path, language=language)

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                new_file = files.YouTubeSubtitleFile(
                    youtube_id=f['youtube_id'], language=f['language'])
            else:
                new_file = files.SubtitleFile(
                    path=path, language=f['language'])

        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                f['path']))

        node.add_file(new_file)