Exemplo n.º 1
0
def add_files(node, file_list):
    """Create a file object for each entry of ``file_list`` and add it to ``node``.

    Args:
        node: object exposing ``kind`` and ``add_file()``.
        file_list: iterable of dicts describing files. Keys read here are
            ``path``, ``youtube_id``, ``web_url``, ``encoding``, ``language``,
            ``ffmpeg_settings`` and ``high_resolution``.

    Raises:
        UnknownFileTypeError: if ``guess_file_type`` returns a type that none
            of the branches below handles.
        KeyError: if an entry lacks a key that is mandatory for its type
            (``language`` for subtitles, ``encoding`` for base64 images,
            ``web_url`` / ``youtube_id`` for web and YouTube videos).
    """
    for f in file_list:

        path = f.get('path')
        # Per the original note: get_abspath expands  content://  -->  ./content/
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            # The YouTube subtitle language is hard-coded to English here.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Use .get(): an unrecognized entry may have no 'path' key at all,
            # and a KeyError here would mask the real problem.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(f.get('path')))
def fetch_video(video):
    """Build a ``nodes.VideoNode`` for one video, including its subtitle files.

    Args:
        video: dict with the keys ``id``, ``title``, ``description`` and
            ``webpage_url``; ``subtitles`` (a mapping keyed by language code)
            is optional.

    Returns:
        The video node, holding one ``YouTubeVideoFile`` plus one
        ``YouTubeSubtitleFile`` per subtitle language found in the languages
        lookup.
    """
    youtube_id = video['id']
    title = video['title']
    description = video['description']
    youtube_url = video['webpage_url']
    # 'subtitles' may be absent or None; treat both as "no subtitles"
    # instead of failing on .keys().
    subtitle_languages = video.get('subtitles') or {}

    print("    Fetching video data: %s (%s)" % (title, youtube_url))

    video_node = nodes.VideoNode(
        source_id=youtube_id,
        title=truncate_metadata(title),
        license=LICENSE,
        description=truncate_description(description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=youtube_id)],
    )

    # Add subtitles in whichever languages are available.
    for language in subtitle_languages:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        known_language = (languages.getlang(language)
                          or languages.getlang_by_alpha2(language))
        if known_language:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))
        else:
            print("WARNING: Subtitle language %s not found in languages file" %
                  language)

    return video_node
Exemplo n.º 3
0
def add_files(node, file_list):
    """Turn each file descriptor in ``file_list`` into a file object on ``node``.

    Every descriptor is a dict whose ``file_type`` must be one of the known
    type constants; anything else is logged at critical level and rejected
    with ``NotImplementedError``. The remaining keys (``path``,
    ``youtube_id``, ``web_url``, ``encoding``, ``language``, ...) select and
    configure the concrete file class.
    """
    supported_types = (
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE,
    )

    for entry in file_list:
        kind = entry.get('file_type')
        if kind not in supported_types:
            LOGGER.critical(kind)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        # May be a URL, a local path, or None.
        location = entry.get('path')
        lang = entry.get('language')

        if kind == VIDEO_FILE:
            # Three flavours of video: YouTube, generic web video, local file.
            if 'youtube_id' in entry:
                new_file = files.YouTubeVideoFile(
                    youtube_id=entry['youtube_id'],
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=lang,
                )
            elif 'web_url' in entry:
                new_file = files.WebVideoFile(
                    web_url=entry['web_url'],
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=lang,
                )
            else:
                new_file = files.VideoFile(
                    path=entry['path'],
                    language=lang,
                    ffmpeg_settings=entry.get('ffmpeg_settings'),
                )
        elif kind == AUDIO_FILE:
            new_file = files.AudioFile(path=entry['path'], language=lang)
        elif kind == DOCUMENT_FILE:
            new_file = files.DocumentFile(path=location, language=lang)
        elif kind == HTML5_FILE:
            new_file = files.HTMLZipFile(path=location, language=lang)
        elif kind == THUMBNAIL_FILE:
            # Thumbnails come either inline (base64) or as a path.
            if 'encoding' in entry:
                new_file = files.Base64ImageFile(encoding=entry['encoding'])
            else:
                new_file = files.ThumbnailFile(path=location, language=lang)
        elif kind == SUBTITLES_FILE:
            # Subtitles require an explicit language in both flavours.
            if 'youtube_id' in entry:
                new_file = files.YouTubeSubtitleFile(
                    youtube_id=entry['youtube_id'],
                    language=entry['language'])
            else:
                new_file = files.SubtitleFile(path=location,
                                              language=entry['language'])
        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                entry['path']))

        node.add_file(new_file)
    def add_video_nodes_from_playlist(
            self,
            youtube_client,
            playlist_id,
            subtitle_languages=SUBTITLE_LANGUAGES,
            copyright_holder=COPYRIGHT_HOLDER,
            only_creative_commons=ONLY_CREATIVE_COMMONS):
        """Add one child VideoNode per distinct video of a YouTube playlist.

        Walks every page of the playlist, looks up ``status`` and ``snippet``
        for the videos on each page, and calls ``self.add_child`` with a
        ``nodes.VideoNode`` for each video not seen before.

        Args:
            youtube_client: client object exposing ``playlistItems().list()``
                and ``videos().list()`` whose results have ``.execute()``.
            playlist_id: id of the playlist to walk.
            subtitle_languages: language codes to request subtitles for;
                unsupported codes are reported and skipped.
            copyright_holder: passed through to ``get_license``.
            only_creative_commons: when truthy, videos whose license is not
                ``'creativeCommon'`` are reported and skipped; otherwise they
                are added with ``NON_CREATIVE_COMMONS_LICENSE_DEFAULT``.
        """
        playlist_request_kwargs = {
            'part': 'contentDetails',
            'maxResults': 50,
            'playlistId': playlist_id,
        }

        # Apparently the same video is in one of the playlists twice!
        # Track the ids already added so each video yields a single node.
        videos_added = set()
        while True:
            playlist_info = youtube_client.playlistItems().list(
                **playlist_request_kwargs).execute()

            video_ids = [
                item['contentDetails']['videoId']
                for item in playlist_info['items']
            ]
            videos = youtube_client.videos().list(
                part='status,snippet',
                id=','.join(video_ids)).execute()['items']

            for video in videos:
                video_id = video['id']
                if video_id in videos_added:
                    continue

                youtube_license = video['status']['license']
                is_creative_commons = youtube_license == 'creativeCommon'
                if only_creative_commons and not is_creative_commons:
                    print(
                        "The video '{}' is not licensed as Creative Commons... it is licensed as {}"
                        .format(video['snippet']['title'], youtube_license))
                    continue

                video_license = (licenses.CC_BY if is_creative_commons
                                 else NON_CREATIVE_COMMONS_LICENSE_DEFAULT)
                video_node = nodes.VideoNode(
                    source_id=video_id,
                    title=video['snippet']['title'],
                    language=CHANNEL_LANGUAGE,
                    license=get_license(
                        video_license,
                        copyright_holder=copyright_holder),
                    thumbnail=get_largest_thumbnail(
                        video['snippet']['thumbnails']).get('url'),
                    files=[
                        files.YouTubeVideoFile(video_id),
                    ])

                # Get subtitles for the requested languages.
                for lang_code in subtitle_languages:
                    if files.is_youtube_subtitle_file_supported_language(
                            lang_code):
                        video_node.add_file(
                            files.YouTubeSubtitleFile(
                                youtube_id=video_id,
                                language=lang_code))
                    else:
                        print('Unsupported subtitle language code:',
                              lang_code)

                self.add_child(video_node)
                videos_added.add(video_id)

            # Set up the next page, if there is one.
            next_page_token = playlist_info.get('nextPageToken')
            if not next_page_token:
                break
            playlist_request_kwargs['pageToken'] = next_page_token
def scrape_content(title, content_url):
    """Scrape one content page into a video node or a single-image HTML5 app node.

    Example arguments:
        title: Boys' clothing
        content_url: http://www.touchableearth.org/china-culture-boys-clothing/

    Returns:
        - a ``nodes.VideoNode`` when the page embeds a video iframe whose
          info can be extracted (subtitle files are attached for every
          language reported),
        - a ``nodes.HTML5AppNode`` wrapping the page's image when there is no
          video iframe but an image is present,
        - ``None`` when the page could not be fetched, the YouTube id cannot
          be determined, the video info extraction fails, or the page has
          neither a video nor an image.
    """
    print("    Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        # Reuse the element selected above rather than querying again.
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print("    *** WARNING: youtube_id not found for content url",
                  content_url)
            print("    Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print("      ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print("        NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        # Subscripting a missing attribute raises KeyError, so the "or"
        # fallback needs .get() to ever reach "src".
        img_src = img.get("data-guid") or img["src"]
        destination = tempfile.mkdtemp()
        download_file(img_src,
                      destination,
                      request_fn=make_request,
                      filename="image.jpg")

        # Minimal page that just displays the downloaded image.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
Exemplo n.º 6
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and builds the topic tree: one TopicNode per
        YouTube playlist, holding a VideoNode for each Creative Commons video.

        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
            The key '--youtube-api-token' is required here and is used as the
            YouTube Data API developer key.
        Returns: ChannelNode
        Raises: KeyError if kwargs has no '--youtube-api-token' entry.
        """
        channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

        from apiclient.discovery import build
        # instantiate a YouTube Data API v3 client
        youtube = build('youtube', 'v3', developerKey=kwargs['--youtube-api-token'])
        # list the YouTube channel's playlists (a single request, up to maxResults)
        playlists = youtube.playlists().list(
            part='snippet',
            channelId=YOUTUBE_CHANNEL_ID,
            maxResults=50
        ).execute()['items']

        for playlist in playlists:
            topic = nodes.TopicNode(title=playlist['snippet']['title'], source_id=playlist['id'])
            playlist_request_kwargs = {
                'part': 'contentDetails',
                'maxResults': 50,
                'playlistId': playlist['id'],
            }

            while True:
                playlist_info = youtube.playlistItems().list(**playlist_request_kwargs).execute()

                video_ids = [item['contentDetails']['videoId'] for item in playlist_info['items']]
                videos = youtube.videos().list(
                    part='status,snippet',
                    id=','.join(video_ids)
                ).execute()['items']

                for video in videos:
                    # Only Creative Commons videos are added; others are skipped.
                    if video['status']['license'] != 'creativeCommon':
                        continue

                    video_node = nodes.VideoNode(
                        source_id=video['id'],
                        title=video['snippet']['title'],
                        language=CHANNEL_LANGUAGE,
                        license=get_license(licenses.CC_BY, copyright_holder='Espresso English'),
                        thumbnail=get_largest_thumbnail(video['snippet']['thumbnails']).get('url'),
                        files=[
                            files.YouTubeVideoFile(video['id']),
                        ]
                    )

                    topic.add_child(video_node)

                    # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                    for lang_code in SUBTITLE_LANGUAGES:
                        if files.is_youtube_subtitle_file_supported_language(lang_code):
                            video_node.add_file(
                                files.YouTubeSubtitleFile(
                                    youtube_id=video['id'],
                                    language=lang_code
                                )
                            )
                        else:
                            print('Unsupported subtitle language code:', lang_code)

                # set up the next page, if there is one
                next_page_token = playlist_info.get('nextPageToken')
                if not next_page_token:
                    break
                playlist_request_kwargs['pageToken'] = next_page_token

            channel.add_child(topic)

        raise_for_invalid_channel(channel)  # Check for errors in channel construction

        return channel