示例#1
0
 def extractFromHTML(self, html):
     """Pull the lyrics out of a page's HTML markup.

     The markup is parsed with BeautifulSoup's 'html.parser' backend and the
     element whose id is 'lyrics_text' is looked up; its text is passed
     through ``nutils.encode`` and returned.

     :param html: markup to parse
     :return: the encoded lyrics, or None when no 'lyrics_text' element
         is found
     """
     self.log.info('parsing html')
     container = BeautifulSoup(html, 'html.parser').find(id='lyrics_text')
     if container:
         extracted = nutils.encode(container.get_text())
         self.log.info('lyrics extracted')
         self.log.debug('{}'.format(extracted))
         return extracted
     # Nothing matched the expected id: report it and signal failure.
     self.log.warning('unable to extract')
     return None
示例#2
0
 def extractFromHTML(self, html):
     """Pull the lyrics out of a page's HTML markup.

     Locates the first element with class 'lyricsh' and takes the next
     ``<div>`` without a class attribute that follows it; that div's text is
     passed through ``nutils.encode`` and returned.

     :param html: markup to parse
     :return: the encoded lyrics, or None when either the 'lyricsh' element
         or the following class-less div cannot be found
     """
     self.log.info('parsing html')
     soup = BeautifulSoup(html, 'html.parser')
     header = soup.find(class_='lyricsh')
     # find() yields None when nothing matches; calling find_next() on that
     # would raise AttributeError instead of reaching the warning below.
     div = None
     if header is not None:
         div = header.find_next('div', class_=None)
     if not div:
         self.log.warning('unable to extract')
         return None
     lyrics = nutils.encode(div.get_text())
     self.log.info('lyrics extracted')
     self.log.debug('{}'.format(lyrics))
     return lyrics
示例#3
0
 def extractFromHTML(self, html):
     """Pull the lyrics out of a page's HTML markup.

     Finds the element with id 'lyrics-body-text', encodes the text of each
     ``<p>`` beneath it with ``nutils.encode`` and joins the pieces with a
     single space. A paragraph whose id is 'mid-song-discussion' contributes
     an empty piece instead of its text.

     :param html: markup to parse
     :return: the joined lyrics as bytes, or None when the
         'lyrics-body-text' element is not found
     """
     self.log.info('parsing html')
     body = BeautifulSoup(html, 'html.parser').find(id='lyrics-body-text')
     if not body:
         self.log.warning('unable to extract')
         return None
     pieces = []
     for para in body.findChildren('p'):
         if para.get('id') == 'mid-song-discussion':
             # Keep an empty placeholder so the joined spacing is the same
             # as if the paragraph had been blanked out in place.
             pieces.append(b'')
         else:
             pieces.append(nutils.encode(para.get_text()))
     lyrics = b' '.join(pieces)
     self.log.info('lyrics extracted')
     self.log.debug('{}'.format(lyrics))
     return lyrics
示例#4
0
 def _extractSongTitle(self, a_elem):
     """Derive a song title from an element's text.

     The element text is encoded with ``nutils.encode``, the ' Lyrics'
     marker is removed through ``nutils.rreplace`` (presumably the
     right-most occurrence -- confirm against nutils) and surrounding
     whitespace is trimmed.

     :param a_elem: element exposing ``get_text()`` and ``prettify()``
         (presumably a BeautifulSoup tag for a link -- confirm with caller)
     :return: the trimmed title as bytes
     """
     text = nutils.encode(a_elem.get_text())
     # Argument-less bytes.strip() trims ASCII whitespace. The previous
     # b'\n\r\s\t' set was wrong: '\s' is not an escape sequence in a bytes
     # literal, so it stripped backslashes and the letter 's' from both
     # ends (b'Songs' -> b'Song') while leaving spaces in place.
     title = nutils.rreplace(text, b' Lyrics', b'').strip()
     self.log.debug('song title {} extracted from {:s}'.format(
         nutils.decode(title), a_elem.prettify()))
     return title
示例#5
0
 def _extractArtistName(self, a_elem):
     """Derive an artist name from an element's text.

     The element text is encoded with ``nutils.encode``, the ' Lyrics'
     marker is removed through ``nutils.rreplace`` (presumably the
     right-most occurrence -- confirm against nutils) and surrounding
     whitespace is trimmed.

     :param a_elem: element exposing ``get_text()`` and ``prettify()``
         (presumably a BeautifulSoup tag for a link -- confirm with caller)
     :return: the trimmed artist name as bytes
     """
     text = nutils.encode(a_elem.get_text())
     # Argument-less bytes.strip() trims ASCII whitespace. The previous
     # b'\n\r\s\t' set was wrong: '\s' is not an escape sequence in a bytes
     # literal, so it stripped backslashes and the letter 's' from both
     # ends (b'Genesis' -> b'Genesi') while leaving spaces in place.
     name = nutils.rreplace(text, b' Lyrics', b'').strip()
     self.log.debug('artist name {} extracted from {:s}'.format(
         nutils.decode(name), a_elem.prettify()))
     return name