def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
    """Extracts videos from a raw json page

    :param str raw_json: Input json extracted from the page or the last
        server response
    :rtype: Tuple[List[str], Optional[str]]
    :returns: Tuple containing a list of up to 100 video watch ids and
        a continuation token, if more videos are available
    """
    initial_data = json.loads(raw_json)
    try:
        # json tree structure when the json was extracted from the
        # page html
        tabs = initial_data["contents"][
            "twoColumnBrowseResultsRenderer"]["tabs"]
        section = tabs[1]["tabRenderer"]["content"][
            "sectionListRenderer"]["contents"][0]
        videos = section["itemSectionRenderer"]["contents"][0][
            "gridRenderer"]["items"]
    except (KeyError, IndexError, TypeError):
        try:
            # json tree structure of a continuation response that still
            # wraps the payload in a list with a "response" key
            videos = initial_data[1]['response'][
                'onResponseReceivedActions'][0][
                'appendContinuationItemsAction']['continuationItems']
        except (KeyError, IndexError, TypeError):
            try:
                # newer continuation responses: no list wrapper and no
                # "response" key
                videos = initial_data['onResponseReceivedActions'][0][
                    'appendContinuationItemsAction']['continuationItems']
            except (KeyError, IndexError, TypeError) as p:
                logger.info(p)
                return [], None

    try:
        # the trailing item carries the continuation token, if any
        continuation = videos[-1]['continuationItemRenderer'][
            'continuationEndpoint']['continuationCommand']['token']
        videos = videos[:-1]
    except (KeyError, IndexError):
        # if there is an error, no continuation is available
        continuation = None

    # only extract the video ids from the video data, then
    # remove duplicates
    watch_ids = [
        f"/watch?v={entry['gridVideoRenderer']['videoId']}"
        for entry in videos
    ]
    return uniqueify(watch_ids), continuation
def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
    """Extracts videos from a raw json page

    :param str raw_json: Input json extracted from the page or the last
        server response
    :rtype: Tuple[List[str], Optional[str]]
    :returns: Tuple containing a list of up to 100 video watch ids and
        a continuation token, if more videos are available
    """
    initial_data = json.loads(raw_json)
    try:
        # this is the json tree structure, if the json was extracted
        # from html
        important_content = initial_data["contents"][
            "twoColumnBrowseResultsRenderer"]["tabs"][0]["tabRenderer"][
            "content"]["sectionListRenderer"]["contents"][0][
            "itemSectionRenderer"]["contents"][0][
            "playlistVideoListRenderer"]
    except (KeyError, IndexError, TypeError):
        try:
            # this is the json tree structure, if the json was directly
            # sent by the server in a continuation response
            important_content = initial_data[1]["response"][
                "continuationContents"]["playlistVideoListContinuation"]
        except (KeyError, IndexError, TypeError) as p:
            # report through the module logger (consistent with the
            # other extraction helpers) instead of printing to stdout
            logger.info(p)
            return [], None
    videos = important_content["contents"]
    try:
        continuation = important_content["continuations"][0][
            "nextContinuationData"]["continuation"]
    except (KeyError, IndexError):
        # if there is an error, no continuation is available
        continuation = None

    # only extract the video ids from the video data, then
    # remove duplicates
    watch_ids = [
        f"/watch?v={v['playlistVideoRenderer']['videoId']}"
        for v in videos
    ]
    return uniqueify(watch_ids), continuation
def get_url_list_from_file(file, retry):
    """Read download items from *file*, resolve them to video urls,
    persist the deduplicated url list back to the file and return it.

    :param file: path of the list file; falls back to ``defaultIni``
        when falsy
    :param retry: retry count forwarded to ``get_url_by_item``
    :returns: deduplicated list of resolved urls
    """
    file = file or defaultIni
    downloads = []
    if file and os.path.exists(file):
        # load the stored items, one per line
        with open(file, "r") as fp:
            downloads = [line.strip('\n') for line in fp]
        downloads = uniqueify(downloads)
        # rewrite the file with duplicates removed
        with open(file, "w") as f:
            f.writelines(f"{url}\n" for url in downloads)
    urls = []
    for item in downloads:
        urls += get_url_by_item(item, retry)
    urls = uniqueify(urls)
    # persist the resolved urls back to the same file
    with open(file, "w") as f:
        f.writelines(f"{url}\n" for url in urls)
    return urls
def _extract_videos(raw_json: str) -> Tuple[
        List[Tuple[str, str]], Optional[str]]:
    """Extracts videos (watch endpoint and title) from a raw json page

    :param str raw_json: Input json extracted from the page or the last
        server response
    :returns: Tuple[List[Tuple[endpoint, title]], Continuation[Optional]]
    """
    initial_data = json.loads(raw_json)
    try:
        # json tree structure if the json was extracted from html
        important_content = initial_data["contents"][
            "twoColumnBrowseResultsRenderer"]["tabs"][0]["tabRenderer"][
            "content"]["sectionListRenderer"]["contents"][0][
            "itemSectionRenderer"]["contents"][0][
            "playlistVideoListRenderer"]
    except (KeyError, IndexError, TypeError):
        try:
            # json tree structure of a continuation response sent
            # directly by the server
            important_content = initial_data[1]["response"][
                "continuationContents"]["playlistVideoListContinuation"]
        except (KeyError, IndexError, TypeError) as p:
            # report through the module logger (consistent with the
            # other extraction helpers) instead of printing to stdout
            logger.info(p)
            return [], None
    videos = important_content["contents"]
    try:
        continuation = important_content["continuations"][0][
            "nextContinuationData"]["continuation"]
    except (KeyError, IndexError):
        # no continuation is available
        continuation = None

    # build (endpoint, title) pairs, then remove duplicates
    pairs = [
        (
            f"/watch?v={v['playlistVideoRenderer']['videoId']}",
            # title may lack "simpleText"; fall back to an empty string
            v["playlistVideoRenderer"]["title"].get("simpleText", ""),
        )
        for v in videos
    ]
    return uniqueify(pairs), continuation
def get_correct_videos_from_playlist(url, retry):
    """Fetch a playlist repeatedly until both its videos and its title
    are available, or the retry budget is exhausted.

    :param url: playlist url passed to ``get_videos_from_playlist``
    :param retry: base retry count; the loop gives up after
        ``retry + 100`` attempts
    :returns: deduplicated list of videos from the playlist
    """
    videos = []
    title = None
    attempt = 1
    # use `is None` for the None check (identity, not equality)
    while not videos or title is None:
        videos, title = get_videos_from_playlist(url)
        logger.debug(
            f"{attempt} retry in get_correct_videos_from_playlist()")
        attempt += 1
        if attempt > retry + 100:
            # give up rather than loop forever on a broken playlist
            break
    logger.info(f'Playlist = {url}')
    logger.info(f'Title = {title}')
    logger.info(f'{len(videos)} Videos found from playlist')
    return uniqueify(videos)
def get_videos_from_channel(url):
    """Scrape the /videos page of a channel and return the watch urls.

    :param url: channel url containing a 24-character channel id, or
        the bare channel id itself
    :returns: deduplicated list of absolute watch urls
    """
    try:
        channel_id: str = regex_search(
            r"(?:channel|\/)([0-9A-Za-z_-]{24}).*", url, group=1)
    except IndexError:
        # assume that url is just the id
        channel_id = url
    channel_url = f"https://www.youtube.com/channel/{channel_id}/videos"
    html = request.get(channel_url)
    video_regex = re.compile(r"href=\"(/watch\?v=[\w-]*)")
    # removed the dead `videos = list()` init (it was immediately
    # overwritten); return the absolute urls directly
    videos = uniqueify(video_regex.findall(html))
    return [f"https://www.youtube.com{video_id}" for video_id in videos]
def test_uniqueify():
    """uniqueify drops duplicates while keeping first-seen order."""
    with_duplicates = [1, 2, 3, 3, 4, 5]
    assert uniqueify(with_duplicates) == [1, 2, 3, 4, 5]
def _extract_videos(self, html: str) -> List[str]:
    """Return the deduplicated video links matched in *html*."""
    matches = self._video_regex.findall(html)
    return uniqueify(matches)
def _extract_videos_old(self, html: str) -> List[Tuple[str, str]]:
    """Return deduplicated (url, title) pairs matched in *html*.

    :param html: page html to scan with ``self._video_regex_2``
    :returns: list of (absolute video url, title) tuples
    """
    matches = self._video_regex_2.findall(html)
    # each match is (video id fragment, title); a comprehension
    # replaces the manual append loop
    pairs = [(self._video_url(m[0]), m[1]) for m in matches]
    return uniqueify(pairs)