예제 #1
0
    def get_playlist(self):
        """
        Parses the playlist page source into the playlist name and its songs.

        Uses self.html_src when it is already set; otherwise the source is obtained from
        self.retrieve_html_source().

        Release annotations are stripped from each song title, matched case-insensitively:
        " - <word> Edit", " - ... Version ...", " - ... Remaster(ed) ...", " (Remaster(ed))" and
        " - ... Anniversary Mix ...".

        :return: A two-element list [playlist_name, songs], where songs is a list of dictionaries with
                 'title', 'artist', 'album' and 'time' fields ('time' is the result of Util.time_in_seconds)
        :raises IndexError: if the playlist heading or the track table marker is not found in the source
        """
        # Retrieve HTML source if it has not been retrieved already
        source = self.html_src if self.html_src else self.retrieve_html_source()

        # Narrow the source down to the heading, then pull the name out of its link
        name_source = source.split(r'<h1 class="main">')[1]
        name_source = name_source.split('</span>')[0]
        playlist_name = re.findall(r'\">(.*)</a>', name_source)[0]

        # Remove everything before the playlist section
        songs_source = source.split("<tbody data-bind=\"foreach: tracks\"")[1]

        # Annotations removed from titles, applied in this order
        title_noise_patterns = (
            r" - \w* Edit",
            r" -.*Version.*",
            r" -.*Remaster(ed)?.*",
            r" \(Remaster(ed)?\) *",
            r" -.*Anniversary Mix.*",
        )

        # Create an array of dictionaries of all the songs, one table row per song
        songs_dict = []
        for song in songs_source.split("</tr>"):
            try:
                title = re.findall(r'<td.*>(.*)<\/div>', song, re.S)[0]
                artist = re.findall(r'spotify:artist:.*>(.*)<\/a>', song)[0]
                album = re.findall(r'spotify:album.*>(.*)<\/a>', song)[0]
                song_time = re.findall(r'tl-time\">([\w|:]*)<\/td>', song,
                                       re.S)[0]

                for pattern in title_noise_patterns:
                    # flags must be passed by keyword: the fourth positional
                    # argument of re.sub is `count`, not `flags`
                    title = re.sub(pattern, "", title, flags=re.IGNORECASE)

                songs_dict.append({
                    'title': Util.html_to_ascii(title),
                    'artist': Util.html_to_ascii(artist),
                    'album': Util.html_to_ascii(album),
                    'time': Util.time_in_seconds(song_time),
                })
            except IndexError:
                # Best effort: a segment missing any expected field (for
                # example the markup after the last row) is not a song; skip it
                continue
        return [playlist_name, songs_dict]
예제 #2
0
    def _get_search_info(self, song_search_url):
        """
        Downloads the page source of the song_search_url, and returns a list of dictionaries containing
        the information for each video search result. The dictionaries contain 'title', 'url', and 'time'
        (in seconds) fields. Results whose type is not "video" are skipped, and at most
        self.MAX_NUM_SEARCH_RESULTS entries are returned.

        :param song_search_url: The url of a search for a song
        :return: A list of dictionaries, each containing the 'title', 'url', and 'time' (in seconds) info of each search result
        :raises IndexError: if the result list markup, or a video's url, title or duration, is not found in the source
        """
        with urllib.request.urlopen(song_search_url) as response:
            html = response.read()

        # decodes html source from binary bytes to string
        search_source = html.decode("UTF-8", "ignore")

        # Isolate the list of results in the source
        results_source = re.split(
            r"<ol id=\"item-section-.*?\" class=\"item-section\">",
            search_source)[1]
        results_source = re.split(
            r"<div class=\"branded-page-box search-pager.*\">", results_source,
            maxsplit=1)[0]

        # Split by entry. Because the pattern has a capture group, re.split
        # interleaves the captured entry type with the entry's source, so after
        # dropping the leading chunk the list alternates: type, source, type, ...
        results_source = re.split(
            r"<li><div class=\"yt-lockup yt-lockup-tile yt-lockup-(.*?) vve-check clearfix.*?\"",
            results_source)[1:]

        # parse source for vid info
        search_info = []
        for source_type, source in zip(results_source[::2],
                                       results_source[1::2]):
            if len(search_info) >= self.MAX_NUM_SEARCH_RESULTS:
                break
            if source_type != "video":
                continue

            video_url = re.findall(r"href=\"\/watch\?v=(.*?)\"", source)[0]
            video_url = self.SONG_URL_RESULT_ROOT + video_url
            # The third title attribute in the entry is used as the video title
            video_title = re.findall(r"title=\"(.*?)\"", source)[2]
            video_title = Util.html_to_ascii(video_title)

            # Capture every colon-separated field so H:MM:SS durations are not
            # truncated to their first two fields, then fold them base-60
            duration = re.findall(r"Duration: (\d+(?::\d+)+)", source)[0]
            video_time = 0
            for field in duration.split(":"):
                video_time = video_time * 60 + int(field)

            search_info.append({
                "url": video_url,
                "title": video_title,
                "time": video_time
            })

        return search_info
    def _get_search_info(self, song_search_url):
        """
        Fetches the search page at song_search_url and extracts one dictionary per search result,
        up to self.MAX_NUM_SEARCH_RESULTS of them.

        Each dictionary has a 'url' (self.SONG_URL_RESULT_ROOT plus the result's link), a 'title'
        (the artist and track title joined by a space, passed through Util.html_to_ascii) and a
        'time' that is always None.

        :param song_search_url: The url of a search for a song
        :return: A list of dictionaries, each containing the 'title', 'url', and 'time' info of each search result
        """
        with urllib.request.urlopen(song_search_url) as response:
            raw_page = response.read()

        # Bytes -> text, dropping anything that is not valid UTF-8
        page_text = raw_page.decode("UTF-8", "ignore")

        # Keep only the markup between the results heading and the first
        # closing </ul> that follows it
        after_heading = re.split(r"<div class=\"searchResultGroupHeading\">",
                                 page_text)[1]
        result_list = re.split(r"</ul>", after_heading, maxsplit=1)[0]

        # One chunk per search result; the text before the first item is dropped
        items = re.split(r"<div class=\"searchItem\">", result_list)[1:]

        # NOTE: the parsing below matches the expected markup, but urllib does
        # not receive the full Soundcloud page source (the site seems to treat
        # it as an invalid browser or something similar)

        limit = self.MAX_NUM_SEARCH_RESULTS
        found = []
        for item in items:
            if len(found) >= limit:
                break

            artist = re.findall(
                r"<span class=\"soundTitle_+usernameText\">(.*)</span>",
                item)[0]
            track = re.findall(r"<span class=\"\">(.*)</span>", item)[0]
            link = re.findall(
                r"<a class=\"soundTitle_+title sc-link-dark\" href=\"(.*)\">",
                item)[0]

            display_title = Util.html_to_ascii(artist + " " + track)
            found.append({
                "url": self.SONG_URL_RESULT_ROOT + link,
                "title": display_title,
                "time": None
            })

        return found