Example No. 1
 def _parseSongsTable(self, artist, table):
     self.log.info('parsing songs table')
     output_rows = []
     for row in table.tbody.findAll('tr'):
        # the first <td> without a class attribute holds the lyrics link
        song_a = row.find('td', class_=None).find_next('a', href=True)
         title = self._extractSongTitle(song_a)
         lyrics_url = song_a['href']
         output_rows.append({
             'trackid': self.currid + 1,
             'url': lyrics_url,
             'artist': nutils.decode(artist),
             'title': nutils.decode(title)
         })
         self.currid += 1
         self.log.info('new lyrics URL crawled - {:s}'.format(lyrics_url))
     self._batchWrite(output_rows)
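
`_batchWrite` is not shown in these examples. A minimal sketch of what it might do, assuming it appends the buffered rows to a TSV file held in a `self.fout` attribute (both the attribute and the field order are assumptions, mirroring the writer in Example No. 3):

 def _batchWrite(self, rows):
     # Hypothetical sketch: append the crawled rows to a TSV file.
     # The field names mirror the dicts built in _parseSongsTable.
     import csv
     with open(self.fout, 'a', encoding='utf8', newline='') as tsvout:
         writer = csv.DictWriter(tsvout,
                                 delimiter='\t',
                                 fieldnames=['trackid', 'url',
                                             'artist', 'title'])
         writer.writerows(rows)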
Example No. 2
 def _parseArtistsTable(self, table):
     self.log.info('parsing artists table')
     for row in table.tbody.findAll('tr'):
         artist_a = row.find('td').find_next('a', href=True)
         artist = self._extractArtistName(artist_a)
         songs_pattern = self._extractArtistSongsPagePattern(artist_a)
         page = 1
         url, response = self._requestSongsPage(songs_pattern, page)
        # the site redirects past the last page, so a response whose
        # geturl() no longer matches the request URL ends the pagination
        while response and url == response.geturl():
             html = response.read()
             soup = BeautifulSoup(html, 'html.parser')
             songs_table = soup.find('table', class_='songs-table compact')
             if songs_table:
                 self._parseSongsTable(artist, songs_table)
             else:
                 self.log.warning(
                     'cannot crawl from {:s} - skipping'.format(url))
             page += 1
             if page > self.max_depth:
                 self.log.warning('reached max depth - skipping')
                 break
             url, response = self._requestSongsPage(songs_pattern, page)
         if response:
             self.log.info('no more songs for artist {:s}'.format(
                 nutils.decode(artist)))
         else:
             self.log.warning('cannot open URL {:s} - skipping'.format(url))
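
`_requestSongsPage` is also left out. A hedged sketch, assuming the songs pattern is a format string with a page-number placeholder and that pages are fetched with `urllib` (so `response.geturl()` reflects any redirect, which is what the loop above checks):

 def _requestSongsPage(self, pattern, page):
     # Hypothetical sketch: build the paginated URL and open it.
     # Returning (url, response) matches how _parseArtistsTable
     # consumes the pair; response is None when the request fails.
     from urllib.error import URLError
     from urllib.request import urlopen
     url = pattern.format(page)
     try:
         response = urlopen(url, timeout=10)
     except URLError:
         response = None
     return url, response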
Example No. 3
 def work(self):
     with open(self.fout, 'w', encoding='utf8') as tsvout:
         writer = csv.DictWriter(tsvout,
                                 delimiter='\t',
                                 fieldnames=self.tsv_headers)
         writer.writeheader()
         tot = len(self.tracks)
         for i, track in enumerate(self.tracks, start=1):
             self.log.info('track {:d}/{:d} - {}'.format(i, tot, track))
             trackid = track['trackid']
             url = track['url']
             extractor = self._selectExtractor(url)
             if not extractor:
                 self.log.warning(
                     'no extractor suitable for {:s} - skipping'.format(
                         url))
                 continue
             lyrics = self._extract(url, extractor)
             if lyrics:
                # decode, transliterate to plain ASCII, then collapse
                # the lyrics onto one lowercased line for the TSV row
                lyrics = nutils.inline(unidecode(nutils.decode(lyrics)),
                                       lower=True)
                 self.log.debug('lyrics normalized - {}'.format(lyrics))
                 self.log.info('writing data to output file')
                 writer.writerow({'trackid': trackid, 'lyrics': lyrics})
             else:
                 self.log.warning(
                     'cannot extract from {} - skipping'.format(url))
         self.log.info('worker {} finished'.format(self.wid))
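
`nutils.inline` belongs to an external helper module and its implementation is not shown; judging from the call above, it flattens the lyrics onto a single line. A sketch of one plausible implementation (an assumption, not the module's actual code):

 import re

 def inline(text, lower=False):
     # Hypothetical sketch: collapse every run of whitespace,
     # newlines included, into a single space so the lyrics fit
     # on one TSV row, optionally lowercasing the result.
     text = re.sub(r'\s+', ' ', text).strip()
     return text.lower() if lower else text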
Example No. 4
 def _extractSongTitle(self, a_elem):
     text = nutils.encode(a_elem.get_text())
     # drop the trailing ' Lyrics' suffix, then strip surrounding whitespace
     title = nutils.rreplace(text, b' Lyrics', b'').strip()
     self.log.debug('song title {} extracted from {:s}'.format(
         nutils.decode(title), a_elem.prettify()))
     return title
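
`nutils.rreplace` is not a standard-library function; the call above suggests a replace that works from the right, so only the trailing ` Lyrics` suffix is removed. A hedged sketch of such a helper (an assumption about the module's behavior):

 def rreplace(data, old, new, count=1):
     # Hypothetical sketch: like bytes.replace/str.replace, but
     # replacing from the right, so a title that itself contains
     # ' Lyrics' only loses the trailing occurrence.
     return new.join(data.rsplit(old, count))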
Example No. 5
 def _extractArtistName(self, a_elem):
     text = nutils.encode(a_elem.get_text())
     # drop the trailing ' Lyrics' suffix, then strip surrounding whitespace
     name = nutils.rreplace(text, b' Lyrics', b'').strip()
     self.log.debug('artist name {} extracted from {:s}'.format(
         nutils.decode(name), a_elem.prettify()))
     return name
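
Since `_extractArtistName` duplicates `_extractSongTitle` except for the log message, the shared logic could be factored into one helper. A sketch of such a refactor (the helper name `_extractLabel` is an invention, not part of the original code):

 def _extractLabel(self, a_elem, kind):
     # Hypothetical refactor: one helper for both song titles and
     # artist names, parameterized by the label used in the log.
     text = nutils.encode(a_elem.get_text())
     label = nutils.rreplace(text, b' Lyrics', b'').strip()
     self.log.debug('{} {} extracted from {:s}'.format(
         kind, nutils.decode(label), a_elem.prettify()))
     return label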