Пример #1
0
    def scrape_lyrics_from_url(self, url):
        """Scrape lyrics from a URL.

        The page is fetched, ``<br>`` tags are turned into newlines, the
        ``<head>``, ``<script>`` and comment nodes are removed, and every
        remaining tag is renamed to ``<p>``.  The longest text node found
        inside any ``<p>`` is then taken to be the lyrics.

        :param url: address of the page to scrape.
        :returns: the longest text fragment, unescaped and stripped of
            leading/trailing newlines, carriage returns, colons and spaces,
            or None if nothing was fetched or no text could be found.
        """
        from bs4 import BeautifulSoup, Comment

        html = fetch_url(url)
        if not html:
            # Nothing was fetched, so there is nothing to parse.
            return None
        soup = BeautifulSoup(html)

        # Keep line breaks as real newlines in the extracted text.
        for tag in soup.find_all('br'):
            tag.replace_with('\n')

        # Remove non relevant html parts
        for node in soup(['head', 'script']):
            node.extract()
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        try:
            for tag in soup.find_all(True):
                tag.name = 'p'  # keep tag contents
        except Exception as e:
            # Best effort: carry on with whatever tags were already renamed.
            log.debug('Error %s when replacing containing marker by p marker',
                      e,
                      exc_info=True)

        # Make better soup from current soup! The previous unclosed <p> sections
        # are now closed.  Use str() rather than prettify() as it's more
        # conservative concerning EOL
        soup = BeautifulSoup(str(soup))

        # In case lyrics are nested in no markup but <body>
        # Insert the whole body in a <p>
        body_tag = soup.find('body')
        if body_tag:
            wrapper = soup.new_tag("p")
            body_tag.parent.insert(0, wrapper)
            wrapper.insert(0, body_tag)

        text_nodes = []
        for tag in soup.find_all('p'):
            # Re-parse each <p> section on its own and collect all its text.
            text_nodes += BeautifulSoup(str(tag)).find_all(text=True)

        if not text_nodes:
            return None

        # Lyrics are expected to be the longest paragraph.  max() returns the
        # first of several equally long candidates, exactly as taking the
        # head of a stable descending sort would.
        longest = max(text_nodes, key=len)
        return unescape(longest.strip("\n\r: "))
Пример #2
0
    def fetch_lyrics(self, artist, title):
        """Fetch lyrics from LyricsWiki.

        :param artist: artist name, encoded with ``self._lw_encode`` before
            being placed in the URL.
        :param title: song title, encoded the same way.
        :returns: the text extracted from the page's ``lyricbox`` div, or
            None when the request raises HTTPError, the page is empty, no
            text is extracted, or the text contains the "not licensed"
            notice.
        """
        # The docstring must be the first statement; placed after the print
        # it was just a discarded string expression.
        print('using lyricswiki')
        url = LYRICSWIKI_URL_PATTERN % (self._lw_encode(artist), self._lw_encode(title))
        try:
            html = fetch_url(url)
        except HTTPError:
            return
        if not html:
            return

        lyrics = extract_text(html, "<div class='lyricbox'>")
        # Pages for unlicensed songs carry a notice instead of the lyrics.
        if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
            return lyrics
Пример #3
0
    def scrape_lyrics_from_url(self, url):
        """Scrape lyrics from a URL.

        The page is fetched, ``<br>`` tags are turned into newlines, the
        ``<head>``, ``<script>`` and comment nodes are removed, and every
        remaining tag is renamed to ``<p>``.  The longest text node found
        inside any ``<p>`` is then taken to be the lyrics.

        :param url: address of the page to scrape.
        :returns: the longest text fragment, unescaped and stripped of
            leading/trailing newlines, carriage returns, colons and spaces,
            or None if nothing was fetched or no text could be found.
        """
        from bs4 import BeautifulSoup, Comment

        html = fetch_url(url)
        if not html:
            # Nothing was fetched, so there is nothing to parse.
            return None
        soup = BeautifulSoup(html)

        # Keep line breaks as real newlines in the extracted text.
        for tag in soup.find_all('br'):
            tag.replace_with('\n')

        # Remove non relevant html parts
        for node in soup(['head', 'script']):
            node.extract()
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        try:
            for tag in soup.find_all(True):
                tag.name = 'p'  # keep tag contents
        except Exception as e:
            # Best effort: carry on with whatever tags were already renamed.
            log.debug('Error %s when replacing containing marker by p marker',
                      e,
                      exc_info=True)

        # Make better soup from current soup! The previous unclosed <p> sections
        # are now closed.  Use str() rather than prettify() as it's more
        # conservative concerning EOL
        soup = BeautifulSoup(str(soup))

        # In case lyrics are nested in no markup but <body>
        # Insert the whole body in a <p>
        body_tag = soup.find('body')
        if body_tag:
            wrapper = soup.new_tag("p")
            body_tag.parent.insert(0, wrapper)
            wrapper.insert(0, body_tag)

        text_nodes = []
        for tag in soup.find_all('p'):
            # Re-parse each <p> section on its own and collect all its text.
            text_nodes += BeautifulSoup(str(tag)).find_all(text=True)

        if not text_nodes:
            return None

        # Lyrics are expected to be the longest paragraph.  max() returns the
        # first of several equally long candidates, exactly as taking the
        # head of a stable descending sort would.
        longest = max(text_nodes, key=len)
        return unescape(longest.strip("\n\r: "))
Пример #4
0
    def fetch_lyrics(self, artist, title):
        """Fetch lyrics from LyricsWiki.

        :param artist: artist name, encoded with ``self._lw_encode`` before
            being placed in the URL.
        :param title: song title, encoded the same way.
        :returns: the text extracted from the page's ``lyricbox`` div, or
            None when the request raises HTTPError, the page is empty, no
            text is extracted, or the text contains the "not licensed"
            notice.
        """
        # The docstring must be the first statement; placed after the print
        # it was just a discarded string expression.
        print('using lyricswiki')
        url = LYRICSWIKI_URL_PATTERN % (self._lw_encode(artist),
                                        self._lw_encode(title))
        try:
            html = fetch_url(url)
        except HTTPError:
            return
        if not html:
            return

        lyrics = extract_text(html, "<div class='lyricbox'>")
        # Pages for unlicensed songs carry a notice instead of the lyrics.
        if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
            return lyrics
Пример #5
0
    def fetch_lyrics(self, artist, title):
        """Look up the lyrics of a song on Lyrics.com.

        Returns the extracted text with everything from the
        "Lyrics powered by" footer onwards removed, or None when the page
        is empty, no text is extracted from the ``lyric_space`` div, or the
        text contains one of the NOT_FOUND markers.
        """
        print('using lyricscom')
        encoded_title = self._lc_encode(title)
        encoded_artist = self._lc_encode(artist)
        page = fetch_url(URL_PATTERN % (encoded_title, encoded_artist))
        if not page:
            return

        text = extract_text(page, '<div id="lyric_space">')
        if not text or any(marker in text for marker in NOT_FOUND):
            return

        # Splitting on a separator always yields at least one piece, so the
        # head can be returned directly.
        return text.split('\n---\nLyrics powered by', 1)[0]
Пример #6
0
    def fetch_lyrics(self, artist, title):
        """Retrieve a song's lyrics from Lyrics.com.

        The result is the text taken from the page's ``lyric_space`` div,
        cut off before the "Lyrics powered by" footer.  None is returned
        when the page is empty, nothing is extracted, or any NOT_FOUND
        marker occurs in the text.
        """
        print('using lyricscom')
        query = (self._lc_encode(title), self._lc_encode(artist))
        response = fetch_url(URL_PATTERN % query)
        if not response:
            return None

        body = extract_text(response, '<div id="lyric_space">')
        if not body:
            return None
        if any(needle in body for needle in NOT_FOUND):
            return None

        # Keep only what precedes the attribution footer (the whole text
        # when the footer is absent).
        before_footer, _, _ = body.partition('\n---\nLyrics powered by')
        return before_footer