Пример #1
0
    def download_thumbnails(self, video_ids):
        """
        Download the "high" thumbnail of each YouTube video and archive them.

        The IDs are turned into request-sized ID strings by
        `get_yt_compatible_ids`. For each ID string the `snippet` part is
        requested from the YouTube API and every returned thumbnail is saved
        in the dataset's staging area as `<video id>.<url extension>`. The
        staging area is then passed to `write_archive_and_finish`.

        Each batch gets its own budget of `self.max_retries` attempts; a batch
        that still fails is skipped (its thumbnails will not be in the
        archive) and the next batch is attempted.

        :param video_ids list, list of YouTube video IDs
        :raises ProcessorInterruptedException: if `self.interrupted` is set
        when a new batch is about to be requested
        """

        # prepare staging area
        results_path = self.dataset.get_staging_area()

        # Client used to request the video data from the YouTube API
        youtube = build(config.YOUTUBE_API_SERVICE_NAME,
                        config.YOUTUBE_API_VERSION,
                        developerKey=config.YOUTUBE_DEVELOPER_KEY)

        ids_list = get_yt_compatible_ids(video_ids)

        # Number of IDs per request; matches `maxResults` below.
        # NOTE(review): assumes get_yt_compatible_ids batches per 50 - confirm
        batch_size = 50

        for i, ids_string in enumerate(ids_list):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while downloading thumbnails from YouTube")

            # Reset per batch, so that failures in an earlier batch do not
            # eat into (or exhaust) the retry budget of later batches
            response = None
            retries = 0

            while retries < self.max_retries:
                try:
                    response = youtube.videos().list(
                        part="snippet",
                        id=ids_string,
                        maxResults=batch_size).execute()
                    break
                except Exception as error:
                    # Broad on purpose: any failure of the request is
                    # treated as transient and retried after a pause
                    self.dataset.update_status("Encountered exception " +
                                               str(error) +
                                               ".\nSleeping for " +
                                               str(self.sleep_time))
                    retries += 1
                    time.sleep(
                        self.sleep_time)  # Wait a bit before trying again

            if response is None:
                # All attempts failed: skip this batch, its thumbnails will
                # not be in the final archive
                self.dataset.update_status("Error during YouTube API request")
            else:
                # A response without results may lack the "items" key
                for metadata in response.get("items", []):

                    # Get the URL of the thumbnail
                    thumb_url = metadata["snippet"]["thumbnails"]["high"][
                        "url"]
                    # Save as <video id>.<extension taken from the URL>
                    save_path = results_path.joinpath(
                        metadata["id"] + "." + str(thumb_url.split('.')[-1]))
                    # Download the image
                    urllib.request.urlretrieve(thumb_url, save_path)

            # Count the batch that was just handled, capped at the total
            processed = min((i + 1) * batch_size, len(video_ids))
            self.dataset.update_status("Downloaded thumbnails for " +
                                       str(processed) + "/" +
                                       str(len(video_ids)))

        # create zip of archive and delete temporary files and folder
        self.dataset.update_status("Compressing results into archive")

        self.write_archive_and_finish(results_path)
Пример #2
0
    def request_youtube_api(self, ids, custom_key=None, object_type="video"):
        """
        Use the YouTube API to fetch metadata from videos or channels.

        The IDs are turned into request-sized ID strings by
        `get_yt_compatible_ids`; every ID string is requested with up to
        `self.max_retries` attempts, sleeping `self.sleep_time` between
        attempts. As soon as a batch fails all of its attempts, the results
        collected so far are returned.

        Side effects: sets `self.invalid_api_key` to True when building the
        API client is rejected with HTTP 400, and keeps
        `self.api_limit_reached` in sync with the last request (True after an
        HTTP 403, False after a successful request).

        :param ids, list:			A list of valid YouTube IDs
        :param custom_key, str:		A custom API key which can be provided by the user.
        :param object_type, str:	The type of object to query. Currently only `video` or `channel`.

        :return dict, mapping each returned item's ID to a dict with its
        metadata. For an unsupported `object_type` an error message (str) is
        returned instead.
        """

        ids_list = get_yt_compatible_ids(ids)

        if object_type != "video" and object_type != "channel":
            return "No valid YouTube object type (currently only 'channel' and 'video' are supported)"

        # Metadata dicts for all items, keyed by item ID
        results = {}

        # Nothing to request, so there is no need to set up a client
        if not ids_list:
            return results

        # Use standard key or custom key
        api_key = custom_key if custom_key else config.YOUTUBE_DEVELOPER_KEY

        # Number of IDs per request; matches `maxResults` below.
        # NOTE(review): assumes get_yt_compatible_ids batches per 50 - confirm
        batch_size = 50

        # Build the API client once, instead of once per batch
        youtube = None
        build_attempts = 0
        while youtube is None and build_attempts < self.max_retries:
            try:
                youtube = build(config.YOUTUBE_API_SERVICE_NAME,
                                config.YOUTUBE_API_VERSION,
                                developerKey=api_key)
            except HttpError as e:
                # Catch invalid API keys
                if e.resp.status == 400:  # "Bad Request"
                    self.invalid_api_key = True
                    return results
                build_attempts += 1
                time.sleep(self.sleep_time)  # Wait a bit before trying again
            except Exception:
                # Google API's also throws other weird errors that might be
                # resolved by retrying, like SSLEOFError
                build_attempts += 1
                time.sleep(self.sleep_time)  # Wait a bit before trying again

        if youtube is None:
            self.dataset.update_status(
                "Could not initialise the YouTube API client")
            return results

        for i, ids_string in enumerate(ids_list):

            # Stays None unless one of the attempts below succeeds, so a
            # failed batch can never reuse the response of an earlier batch
            response = None
            retries = 0

            while retries < self.max_retries:
                try:
                    if object_type == "video":
                        response = youtube.videos().list(
                            part='snippet,contentDetails,statistics',
                            id=ids_string,
                            maxResults=batch_size).execute()
                    else:
                        response = youtube.channels().list(
                            part=
                            "snippet,topicDetails,statistics,brandingSettings",
                            id=ids_string,
                            maxResults=batch_size).execute()

                    self.api_limit_reached = False
                    break

                # Check rate limits
                except HttpError as httperror:
                    status_code = httperror.resp.status

                    if status_code == 403:  # "Forbidden", what Google returns with rate limits
                        self.api_limit_reached = True
                        self.dataset.update_status(
                            "API quota limit might be reached (HTTP" +
                            str(status_code) + "), sleeping for " +
                            str(self.sleep_time))
                    else:
                        self.dataset.update_status(
                            "API error encoutered (HTTP" + str(status_code) +
                            "), sleeping for " + str(self.sleep_time))

                # Google API's also throws other weird errors that might be resolved by retrying, like SSLEOFError
                except Exception:
                    self.dataset.update_status(
                        "Error encoutered, sleeping for " +
                        str(self.sleep_time))

                # Only reached after a failed attempt (success breaks above)
                retries += 1
                time.sleep(self.sleep_time)  # Wait a bit before trying again

            # Stop and return what was collected if every attempt failed
            if response is None:
                # getattr: the flag is only assigned in the retry loop above
                if getattr(self, "api_limit_reached", False):
                    self.dataset.update_status(
                        "Daily YouTube API requests exceeded.")

                return results

            # Sometimes there's no results,
            # and the response won't have an "items" key.
            for metadata in response.get("items", []):
                result = {}

                # This will become the key
                result_id = metadata["id"]

                # Requested parts can be absent from an item; fall back to
                # empty dicts so the affected fields simply become None
                snippet = metadata.get("snippet", {})
                statistics = metadata.get("statistics", {})

                if object_type == "video":

                    # Results as dict entries
                    result["type"] = "video"

                    result["upload_time"] = snippet.get("publishedAt")
                    result["channel_id"] = snippet.get("channelId")
                    result["channel_title"] = snippet.get("channelTitle")
                    # The snippet of a video carries no "videoId"; the ID
                    # is the item's own "id"
                    result["video_id"] = snippet.get("videoId", result_id)
                    result["video_title"] = snippet.get("title")
                    result["video_duration"] = metadata.get(
                        "contentDetails", {}).get("duration")
                    result["video_view_count"] = statistics.get("viewCount")
                    result["video_comment_count"] = statistics.get(
                        "commentCount")
                    result["video_likes_count"] = statistics.get("likeCount")
                    result["video_dislikes_count"] = statistics.get(
                        "dislikeCount")
                    result["video_topic_ids"] = metadata.get("topicDetails")
                    result["video_category_id"] = snippet.get("categoryId")
                    result["video_tags"] = snippet.get("tags")

                else:

                    # Results as dict entries
                    result["type"] = "channel"
                    # The snippet of a channel carries no "channelId"; the
                    # ID is the item's own "id"
                    result["channel_id"] = snippet.get("channelId", result_id)
                    result["channel_title"] = snippet.get("title")
                    result["channel_description"] = snippet.get("description")
                    result["channel_default_language"] = snippet.get(
                        "defaultLanguage")
                    result["channel_country"] = snippet.get("country")
                    result["channel_viewcount"] = statistics.get("viewCount")
                    result["channel_commentcount"] = statistics.get(
                        "commentCount")
                    result["channel_subscribercount"] = statistics.get(
                        "subscriberCount")
                    result["channel_videocount"] = statistics.get(
                        "videoCount")
                    # Topic details are not always part of the response; the
                    # two topic keys are only set when they are
                    if "topicDetails" in metadata:
                        result["channel_topic_ids"] = metadata[
                            "topicDetails"].get("topicIds")
                        result["channel_topic_categories"] = metadata[
                            "topicDetails"].get("topicCategories")
                    result["channel_branding_keywords"] = metadata.get(
                        "brandingSettings", {}).get("channel",
                                                    {}).get("keywords")

                results[result_id] = result

            # Update status per batch: count the batch that was just
            # handled, capped at the total number of IDs
            processed = min((i + 1) * batch_size, len(ids))
            self.dataset.update_status("Got metadata from " + str(processed) +
                                       "/" + str(len(ids)) + " " +
                                       object_type + " YouTube URLs")

        return results