def scrape_lyrics_from_url(self, url):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.
    """
    # Tag was imported but never used; import only what the body needs.
    from bs4 import BeautifulSoup, Comment

    html = fetch_url(url)
    soup = BeautifulSoup(html)

    # Preserve line breaks before the markup is flattened away.
    for tag in soup.findAll('br'):
        tag.replaceWith('\n')

    # Remove non-relevant html parts. Plain loops instead of throwaway
    # list comprehensions: these calls are executed for their side
    # effects only.
    for part in soup(['head', 'script']):
        part.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    try:
        for tag in soup.findAll(True):
            tag.name = 'p'  # keep tag contents
    except Exception as e:
        log.debug('Error %s when replacing containing marker by p marker'
                  % e, exc_info=True)

    # Make better soup from current soup! The previous unclosed <p>
    # sections are now closed. Use str() rather than prettify() as it's
    # more conservative concerning EOL.
    soup = BeautifulSoup(str(soup))

    # In case lyrics are nested in no markup but <body>:
    # insert the whole body in a <p>.
    bodyTag = soup.find('body')
    if bodyTag:
        pTag = soup.new_tag("p")
        bodyTag.parent.insert(0, pTag)
        pTag.insert(0, bodyTag)

    tagTokens = []
    for tag in soup.findAll('p'):
        soup2 = BeautifulSoup(str(tag))
        # Extract all text of <p> section.
        tagTokens += soup2.findAll(text=True)

    if tagTokens:
        # Lyrics are expected to be the longest paragraph.
        # (The dead `soup = BeautifulSoup(tagTokens[0])` re-parse that
        # used to sit here was never read and has been removed.)
        tagTokens = sorted(tagTokens, key=len, reverse=True)
        return unescape(tagTokens[0].strip("\n\r: "))
def fetch_lyrics(self, artist, title):
    """Fetch lyrics from LyricsWiki.

    Return the lyrics text, or None when the page cannot be fetched,
    is empty, or carries the "not licensed" placeholder.
    """
    # NOTE: the original had a debug print() *before* the docstring,
    # which demoted the docstring to a dead string expression. The
    # print is removed and the docstring restored to first position.
    url = LYRICSWIKI_URL_PATTERN % (self._lw_encode(artist),
                                    self._lw_encode(title))
    try:
        html = fetch_url(url)
    except HTTPError:
        # Page missing or server error: no lyrics for this song.
        return
    if not html:
        return

    lyrics = extract_text(html, "<div class='lyricbox'>")
    # LyricsWiki serves a placeholder page for unlicensed songs;
    # treat it as "no lyrics found".
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics
def scrape_lyrics_from_url(self, url):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.
    """
    # Tag was imported but never used; import only what the body needs.
    from bs4 import BeautifulSoup, Comment

    html = fetch_url(url)
    soup = BeautifulSoup(html)

    # Preserve line breaks before the markup is flattened away.
    for tag in soup.findAll('br'):
        tag.replaceWith('\n')

    # Remove non-relevant html parts. Plain loops instead of throwaway
    # list comprehensions: these calls are executed for their side
    # effects only.
    for part in soup(['head', 'script']):
        part.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    try:
        for tag in soup.findAll(True):
            tag.name = 'p'  # keep tag contents
    except Exception as e:
        log.debug('Error %s when replacing containing marker by p marker'
                  % e, exc_info=True)

    # Make better soup from current soup! The previous unclosed <p>
    # sections are now closed. Use str() rather than prettify() as it's
    # more conservative concerning EOL.
    soup = BeautifulSoup(str(soup))

    # In case lyrics are nested in no markup but <body>:
    # insert the whole body in a <p>.
    bodyTag = soup.find('body')
    if bodyTag:
        pTag = soup.new_tag("p")
        bodyTag.parent.insert(0, pTag)
        pTag.insert(0, bodyTag)

    tagTokens = []
    for tag in soup.findAll('p'):
        soup2 = BeautifulSoup(str(tag))
        # Extract all text of <p> section.
        tagTokens += soup2.findAll(text=True)

    if tagTokens:
        # Lyrics are expected to be the longest paragraph.
        # (The dead `soup = BeautifulSoup(tagTokens[0])` re-parse that
        # used to sit here was never read and has been removed.)
        tagTokens = sorted(tagTokens, key=len, reverse=True)
        return unescape(tagTokens[0].strip("\n\r: "))
def fetch_lyrics(self, artist, title):
    """Fetch lyrics from Lyrics.com.

    Return the lyrics text, or None when the page cannot be fetched,
    has no lyric section, or contains a known "not found" marker.
    """
    # Removed leftover debug print('using lyricscom').
    url = URL_PATTERN % (self._lc_encode(title), self._lc_encode(artist))
    html = fetch_url(url)
    if not html:
        return

    lyrics = extract_text(html, '<div id="lyric_space">')
    if not lyrics:
        return
    for not_found_str in NOT_FOUND:
        if not_found_str in lyrics:
            return

    # str.split always returns at least one element, so the old
    # `if parts:` guard was always true: just take the text before the
    # "powered by" footer (or the whole text when no footer is present).
    return lyrics.split('\n---\nLyrics powered by', 1)[0]