def get_playlist(self):
    """
    Parses the playlist page source and returns [playlist_name, songs], where
    songs is a list of dictionaries with 'title', 'artist', 'album', and
    'time' (in seconds) fields.
    """
    # Retrieve the HTML source if it has not been retrieved already
    if not self.html_src:
        source = self.retrieve_html_source()
    else:
        source = self.html_src

    # Split to find the playlist name
    name_source = source.split(r'<h1 class="main">')[1]
    name_source = name_source.split('</span>')[0]
    playlist_name = re.findall(r'\">(.*)</a>', name_source)[0]

    # Remove everything before the playlist section
    songs_source = source.split("<tbody data-bind=\"foreach: tracks\"")[1]

    # Divide up into songs
    songs = songs_source.split("</tr>")

    # Create a list of dictionaries of all the songs
    songs_dict = []
    for song in songs:
        try:
            title = re.findall(r'<td.*>(.*)</div>', song, re.S)[0]
            artist = re.findall(r'spotify:artist:.*>(.*)</a>', song)[0]
            album = re.findall(r'spotify:album.*>(.*)</a>', song)[0]
            song_time = re.findall(r'tl-time\">([\w|:]*)</td>', song, re.S)[0]

            # Strip edit/version/remaster suffixes from the title so later
            # searches match the original recording
            title = re.sub(r" - \w* Edit", "", title, flags=re.IGNORECASE)
            title = re.sub(r" -.*Version.*", "", title, flags=re.IGNORECASE)
            title = re.sub(r" -.*Remaster(ed)?.*", "", title, flags=re.IGNORECASE)
            title = re.sub(r" \(Remaster(ed)?\) *", "", title, flags=re.IGNORECASE)
            title = re.sub(r" -.*Anniversary Mix.*", "", title, flags=re.IGNORECASE)

            song_dict = {
                'title': Util.html_to_ascii(title),
                'artist': Util.html_to_ascii(artist),
                'album': Util.html_to_ascii(album),
                'time': Util.time_in_seconds(song_time),
            }
            songs_dict.append(song_dict)
        except IndexError:
            # Skip table rows that are not song entries
            pass

    return [playlist_name, songs_dict]
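# A minimal, standalone sketch of the title-cleanup step above. The helper
# name and the sample title are illustrative only, not taken from the
# project or from a real playlist.
import re

def _strip_title_suffixes(title):
    """Remove edit/version/remaster suffixes so search queries stay clean."""
    patterns = [
        r" - \w* Edit",
        r" -.*Version.*",
        r" -.*Remaster(ed)?.*",
        r" \(Remaster(ed)?\) *",
        r" -.*Anniversary Mix.*",
    ]
    for pattern in patterns:
        title = re.sub(pattern, "", title, flags=re.IGNORECASE)
    return title

# Example: _strip_title_suffixes("Heroes - 1999 Remastered Version") -> "Heroes"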
def _get_search_info(self, song_search_url):
    """
    Downloads the page source of the song_search_url, and returns a list of
    dictionaries containing the information for each search result. The
    dictionaries contain 'title', 'url', and 'time' (in seconds) fields.

    :param song_search_url: The url of a search for a song
    :return: A list of dictionaries, each containing the 'title', 'url', and
             'time' (in seconds) info of each search result
    """
    with urllib.request.urlopen(song_search_url) as response:
        html = response.read()

    # Decode the HTML source from bytes to a string
    search_source = html.decode("UTF-8", "ignore")

    # Parse the source for video info
    search_info = []

    # Isolate the list of results in the source
    results_source = re.split(
        r"<ol id=\"item-section-.*?\" class=\"item-section\">",
        search_source)[1]
    results_source = re.split(
        r"<div class=\"branded-page-box search-pager.*\">",
        results_source, 1)[0]

    # Split by result in the list; because the pattern has a capture group,
    # the split alternates between the type of entry (video, playlist,
    # channel) and the entry's source
    results_source = re.split(
        r"<li><div class=\"yt-lockup yt-lockup-tile yt-lockup-(.*?) vve-check clearfix.*?\"",
        results_source)[1:]

    index = 0
    while len(search_info) < self.MAX_NUM_SEARCH_RESULTS and index < len(results_source) - 1:
        source_type = results_source[index]
        source = results_source[index + 1]

        # Only videos have a watch URL and a duration
        if source_type == "video":
            video_url = re.findall(r"href=\"/watch\?v=(.*?)\"", source)[0]
            video_url = self.SONG_URL_RESULT_ROOT + video_url

            video_title = re.findall(r"title=\"(.*?)\"", source)[2]
            video_title = Util.html_to_ascii(video_title)

            # Convert the "minutes:seconds" duration into seconds
            video_time = re.findall(r"Duration: (\d+:\d+)", source)[0]
            video_time = re.split(r":", video_time)
            video_time = int(video_time[0]) * 60 + int(video_time[1])

            search_info.append({
                "url": video_url,
                "title": video_title,
                "time": video_time
            })

        index += 2

    return search_info
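# A standalone sketch of the interleaved re.split technique used above: when
# the split pattern contains a capture group, re.split returns the captured
# text (here, the result type) followed by the chunk that comes after it.
# The sample HTML below is a made-up stand-in, not real YouTube markup.
import re

sample = (
    '<li><div class="yt-lockup yt-lockup-tile yt-lockup-video vve-check clearfix"'
    ' ...>first result</div>'
    '<li><div class="yt-lockup yt-lockup-tile yt-lockup-playlist vve-check clearfix"'
    ' ...>second result</div>'
)
pieces = re.split(
    r"<li><div class=\"yt-lockup yt-lockup-tile yt-lockup-(.*?) vve-check clearfix.*?\"",
    sample)[1:]

# pieces alternates [type, chunk, type, chunk, ...]
for result_type, chunk in zip(pieces[0::2], pieces[1::2]):
    print(result_type, "->", chunk)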
def _get_search_info(self, song_search_url):
    """
    Downloads the page source of the song_search_url, and returns a list of
    dictionaries containing the information for each search result. The
    dictionaries contain 'title', 'url', and 'time' (in seconds) fields.

    :param song_search_url: The url of a search for a song
    :return: A list of dictionaries, each containing the 'title', 'url', and
             'time' (in seconds) info of each search result
    """
    with urllib.request.urlopen(song_search_url) as response:
        html = response.read()

    # Decode the HTML source from bytes to a string
    search_source = html.decode("UTF-8", "ignore")

    # Parse the source for track info
    search_info = []

    # Isolate the list of results in the source
    results_source = re.split(r"<div class=\"searchResultGroupHeading\">",
                              search_source)[1]
    results_source = re.split(r"</ul>", results_source, 1)[0]

    # Split by search result
    results_source = re.split(r"<div class=\"searchItem\">", results_source)[1:]

    # This parsing theoretically works, but urllib cannot retrieve the full
    # SoundCloud page source because the request is rejected as coming from
    # an unsupported browser
    index = 0
    while len(search_info) < self.MAX_NUM_SEARCH_RESULTS and index < len(results_source):
        source = results_source[index]

        artist = re.findall(
            r"<span class=\"soundTitle_+usernameText\">(.*)</span>", source)[0]
        title = re.findall(r"<span class=\"\">(.*)</span>", source)[0]
        url = re.findall(
            r"<a class=\"soundTitle_+title sc-link-dark\" href=\"(.*)\">",
            source)[0]

        title = Util.html_to_ascii(artist + " " + title)
        url = self.SONG_URL_RESULT_ROOT + url

        search_info.append({"url": url, "title": title, "time": None})
        index += 1

    return search_info
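# A possible workaround for the urllib limitation noted above: send a
# browser-like User-Agent header via urllib.request.Request. This is only a
# sketch, under the assumption that SoundCloud serves fuller markup to
# recognized browsers; the helper name is hypothetical and this is not
# verified against the live site.
import urllib.request

def _fetch_with_user_agent(url):
    """Fetch a page while presenting a desktop-browser User-Agent header."""
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(request) as response:
        return response.read().decode("UTF-8", "ignore")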