def extractFromHTML(self, html):
    """Pull the lyrics out of a page via its ``lyrics_text`` element.

    :param html: markup of the lyrics page, handed to BeautifulSoup's
        ``html.parser`` backend.
    :return: the element's text passed through ``nutils.encode``, or
        ``None`` when no element with ``id="lyrics_text"`` is found.
    """
    self.log.info('parsing html')
    container = BeautifulSoup(html, 'html.parser').find(id='lyrics_text')

    if container:
        lyrics = nutils.encode(container.get_text())
        self.log.info('lyrics extracted')
        self.log.debug('{}'.format(lyrics))
        return lyrics

    # No lyrics container on the page: report it and signal failure.
    self.log.warning('unable to extract')
    return None
def extractFromHTML(self, html):
    """Extract the lyrics that follow the ``lyricsh`` header element.

    The lyrics are taken from the first ``<div>`` without a ``class``
    attribute that appears after the element whose class is ``lyricsh``.

    :param html: markup of the lyrics page, handed to BeautifulSoup's
        ``html.parser`` backend.
    :return: the div's text passed through ``nutils.encode``, or ``None``
        when either the header or the following class-less div is missing.
    """
    self.log.info('parsing html')
    soup = BeautifulSoup(html, 'html.parser')

    # ``find`` returns None when the header is absent; calling
    # ``find_next`` on that would raise AttributeError instead of reaching
    # the "unable to extract" path below, so look the header up first.
    header = soup.find(class_='lyricsh')
    div = None
    if header is not None:
        div = header.find_next('div', class_=None)

    if not div:
        self.log.warning('unable to extract')
        return None

    lyrics = nutils.encode(div.get_text())
    self.log.info('lyrics extracted')
    self.log.debug('{}'.format(lyrics))
    return lyrics
def extractFromHTML(self, html):
    """Extract the lyrics from the ``lyrics-body-text`` container.

    The text of every ``<p>`` below the container is encoded with
    ``nutils.encode`` and the pieces are joined with a single space. The
    paragraph whose id is ``mid-song-discussion`` is left out.

    :param html: markup of the lyrics page, handed to BeautifulSoup's
        ``html.parser`` backend.
    :return: the joined bytes, or ``None`` when no element with
        ``id="lyrics-body-text"`` is found.
    """
    self.log.info('parsing html')
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find(id='lyrics-body-text')
    if not div:
        self.log.warning('unable to extract')
        return None

    # Filter the excluded paragraph out rather than substituting b'' for
    # it: an empty placeholder still takes part in the join and leaves a
    # doubled (or leading/trailing) separator where the paragraph stood.
    lyrics = b' '.join(
        nutils.encode(p.get_text())
        for p in div.find_all('p')
        if p.get('id') != 'mid-song-discussion'
    )
    self.log.info('lyrics extracted')
    self.log.debug('{}'.format(lyrics))
    return lyrics
def _extractSongTitle(self, a_elem):
    """Derive the song title from a link element's text.

    The element text is encoded with ``nutils.encode``, ``b' Lyrics'`` is
    removed via ``nutils.rreplace``, and surrounding whitespace is trimmed.

    :param a_elem: element exposing ``get_text()`` and ``prettify()``
        (presumably a BeautifulSoup ``<a>`` tag -- confirm with callers).
    :return: the cleaned title as bytes.
    """
    text = nutils.encode(a_elem.get_text())
    # The strip set used to be b'\n\r\s\t'. ``\s`` is not a bytes escape,
    # so it meant the two bytes ``\`` and ``s``: titles lost leading and
    # trailing "s" characters and spaces were never trimmed. Spell the
    # intended whitespace out explicitly.
    title = nutils.rreplace(text, b' Lyrics', b'').strip(b' \n\r\t')
    self.log.debug('song title {} extracted from {:s}'.format(
        nutils.decode(title), a_elem.prettify()))
    return title
def _extractArtistName(self, a_elem):
    """Derive the artist name from a link element's text.

    The element text is encoded with ``nutils.encode``, ``b' Lyrics'`` is
    removed via ``nutils.rreplace``, and surrounding whitespace is trimmed.

    :param a_elem: element exposing ``get_text()`` and ``prettify()``
        (presumably a BeautifulSoup ``<a>`` tag -- confirm with callers).
    :return: the cleaned artist name as bytes.
    """
    text = nutils.encode(a_elem.get_text())
    # The strip set used to be b'\n\r\s\t'. ``\s`` is not a bytes escape,
    # so it meant the two bytes ``\`` and ``s``: names lost leading and
    # trailing "s" characters and spaces were never trimmed. Spell the
    # intended whitespace out explicitly.
    name = nutils.rreplace(text, b' Lyrics', b'').strip(b' \n\r\t')
    self.log.debug('artist name {} extracted from {:s}'.format(
        nutils.decode(name), a_elem.prettify()))
    return name