def _process_lyrics_table(self, lang_row, lyrics):
    """
    Collect stanzas from the cells of a lyrics table row and append them to ``lyrics``.

    The first ``td`` of ``lang_row`` is skipped; the next two cells are handled as the
    ``'Korean'`` and ``'English'`` columns, in that order.  Each cell's markup is patched so
    that its content is wrapped in ``<p>`` elements, re-parsed, and every resulting paragraph
    is stored as one stanza followed by a ``'<br/>'`` marker.

    :param lang_row: Parsed row element; only its ``find_all('td')`` results are used
    :param lyrics: Dict holding ``'Korean'`` and ``'English'`` lists, which are extended in place
    """
    cells = lang_row.find_all('td')[1:]
    for lang, td in zip(('Korean', 'English'), cells):
        raw_html = str(td)
        # Inject an opening <p> after the first 4 characters
        # (presumably a bare "<td>" opening tag - TODO confirm cells never carry attributes).
        with_leading_p = raw_html[:4] + '<p>' + raw_html[4:]
        # Any <p> that is not directly preceded by </p> or <td> gets a closing </p> put in front of it.
        repaired_html = re.sub('(?<!</p>|<td>)<p>', '</p><p>', with_leading_p)
        fixed_td = soupify(repaired_html)
        log.log(5, 'Fixed td:\n{}\n\n'.format(fixed_td))

        for paragraph in fixed_td.find_all('p'):
            stanza = []
            for text_line in paragraph.get_text().replace('<br/>', '\n').splitlines():
                if not text_line:
                    continue
                if text_line.startswith('<span'):
                    # The line is literal span markup; keep only the text inside the span.
                    text_line = soupify(text_line).find('span').get_text()
                stanza.append(text_line)

            log.log(9, '{}: found stanza with {} lines'.format(lang, len(stanza)))
            stanza.append('<br/>')
            lyrics[lang].extend(stanza)
def get_english_translation(self, song):
    """
    Fetch the page for ``song`` and extract its translated lines plus an "artist - title" string.

    :param song: Value passed unchanged to ``self.get_page`` to retrieve the page
    :return: Tuple of ``(lines, full_title)``; ``lines`` holds the text lines of every
      ``div.par`` found inside ``div.ltf``, with a ``'<br/>'`` entry after each stanza
    """
    page = soupify(self.get_page(song), 'lxml')

    artist_text = page.find('li', class_='song-node-info-artist').text
    artist = artist_text.replace('Artist:', '').strip()
    song_title = page.find('h2', class_='title-h2').text
    full_title = '{} - {}'.format(artist, song_title)

    lines = []
    translation = page.find('div', class_='ltf')
    for paragraph in translation.find_all('div', class_='par'):
        stanza_lines = paragraph.get_text().splitlines()
        log.log(19, '{}: found stanza with {} lines'.format('English', len(stanza_lines)))
        # '<br/>' marks the end of each stanza in the flat list.
        lines += [*stanza_lines, '<br/>']

    return lines, full_title
def get_lyrics(self, song, title=None, *, kor_endpoint=None, eng_endpoint=None):
    """
    Fetch the page for ``song`` and collect the lyrics that follow its section headings.

    Every ``h2`` inside ``div.td-post-content`` whose text ends with ``Hangul`` starts a Korean
    section, and one ending with ``English Translation`` starts an English section; other
    headings are ignored.  The ``p`` siblings that directly follow a section heading are read
    as stanzas, each terminated by a ``'<br/>'`` entry.

    :param song: Value passed unchanged to ``self.get_page`` to retrieve the page
    :param title: Title to report; when None, it is taken from the text preceding ``Hangul``
      in the Korean section heading
    :param kor_endpoint: Accepted for signature compatibility; not used by this implementation
    :param eng_endpoint: Accepted for signature compatibility; not used by this implementation
    :return: Dict with ``'Korean'`` and ``'English'`` line lists and a ``'title'`` entry
    """
    lyrics = {'Korean': [], 'English': [], 'title': title}
    html = soupify(self.get_page(song), 'lxml')
    content = html.find('div', class_='td-post-content')
    for h2 in content.find_all('h2'):
        heading = h2.text
        if heading.endswith('Hangul'):
            lang = 'Korean'
            if title is None:
                # Raw string: '\s' in a normal string literal is an invalid escape sequence.
                m = re.match(r'^(.*?)\s+Hangul$', heading)
                # A heading may end with 'Hangul' without matching the pattern (e.g. no
                # whitespace before it); fall back to the full heading instead of failing.
                lyrics['title'] = m.group(1) if m else heading
        elif heading.endswith('English Translation'):
            lang = 'English'
        else:
            continue

        log.debug('Found {} section'.format(lang))

        ele = h2.next_sibling
        # next_sibling is None once the last sibling has been passed, so check for that before
        # reading .name; a name of None denotes a text node between the paragraphs.
        while ele is not None and ele.name in (None, 'p'):
            log.log(9, 'Processing element: <{0}>{1}</{0}>'.format(ele.name, ele))
            if ele.name == 'p':
                lines = [line for line in ele.text.replace('<br/>', '\n').splitlines() if line]
                log.log(19, '{}: found stanza with {} lines'.format(lang, len(lines)))
                lines.append('<br/>')
                lyrics[lang].extend(lines)
            ele = ele.next_sibling
    return lyrics
def get_index_results(self, query):
    """
    Build a flat list of tracks for every album linked from the index page for ``query``.

    Each album link (href matching ``/album/.*``) is followed, and every track link on the
    album page (href matching ``/lyrics/...`` and not ending in ``/edit``) yields one result.

    :param query: Value passed unchanged to ``self.get_index``
    :return: List of dicts with the keys ``'Album'``, ``'Song'`` and ``'Link'``
    """
    index_soup = self.get_index(query)

    album_href = re.compile('/album/.*')
    track_href = re.compile('/lyrics/.*(?<!/edit)$')
    title_class = re.compile('.*title$')

    results = []
    for album_a in index_soup.find_all('a', href=album_href):
        # The year is read from the element that follows the link's parent.
        year = album_a.parent.next_sibling.get_text()
        album = '[{}] {}'.format(year, album_a.get_text())
        album_page = soupify(self.get_page(album_a.get('href')))

        for track_a in album_page.find_all('a', href=track_href):
            track_link = track_a.get('href')
            track_name = track_a.find('h2', class_=title_class).get_text()
            results.append({'Album': album, 'Song': track_name, 'Link': track_link})

    return results
def get_lyrics(self, song, title=None, *, kor_endpoint=None,
               eng_endpoint=None) -> Dict[str, Union[str, List[str]]]:
    """
    Fetch the page for ``song`` and extract its lyrics from either a table or a non-table layout.

    When the page has a ``th`` with the text ``Romanization``, the row located two siblings
    after that header's parent is handed to ``self._process_lyrics_table``; otherwise the
    whole page is handed to ``self._process_lyrics_nontable``.

    :param song: Value passed unchanged to ``self.get_page`` to retrieve the page
    :param title: Title to report; when falsy, the text of ``h1.entry-title`` is used
    :param kor_endpoint: Accepted for signature compatibility; not used by this implementation
    :param eng_endpoint: Accepted for signature compatibility; not used by this implementation
    :return: Dict with ``'Korean'`` and ``'English'`` line lists and a ``'title'`` entry
    """
    log.debug(f'Getting lyrics for {song=!r}')
    html = soupify(self.get_page(song), 'lxml')

    if not title:
        title = html.find('h1', class_='entry-title').get_text()
    lyrics = {'Korean': [], 'English': [], 'title': title}

    romanization_header = html.find('th', text='Romanization')
    try:
        # AttributeError here means the header (or the expected sibling chain) is absent.
        lang_row = romanization_header.parent.next_sibling.next_sibling
    except AttributeError:
        self._process_lyrics_nontable(html, lyrics)
    else:
        self._process_lyrics_table(lang_row, lyrics)

    return lyrics
def get_lyrics(self, song, title=None, *, kor_endpoint=None,
               eng_endpoint=None) -> Dict[str, Union[str, List[str]]]:
    """
    Fetch the page for ``song`` and collect the paired lyric lines from its lyrics container.

    Each ``div.mxm-translatable-line-readonly`` row is scanned with ``lyric_part_match``; the
    first matching part of a row is stored as Korean and the second as English.  A part with
    no text is stored as ``'<br/>'``.

    :param song: Song identifier; passed through ``self._normalize_title`` before the page is fetched
    :param title: Title to report; when falsy, ``"<artist> - <track>"`` from the page header is used
    :param kor_endpoint: Accepted for signature compatibility; not used by this implementation
    :param eng_endpoint: Accepted for signature compatibility; not used by this implementation
    :return: Dict with ``'Korean'`` and ``'English'`` line lists and a ``'title'`` entry
    """
    page = soupify(self.get_page(self._normalize_title(song)), 'lxml')

    header = page.find('div', class_='mxm-track-title')
    # The track name is the last child node of the <h1>.
    track_title = list(header.find('h1').children)[-1]
    track_artist = header.find('h2').get_text()
    if not title:
        title = '{} - {}'.format(track_artist, track_title)

    lyrics = {'Korean': [], 'English': [], 'title': title}
    lang_by_position = {0: 'Korean', 1: 'English'}

    lyrics_container = page.find('div', class_='mxm-track-lyrics-container')
    for row in lyrics_container.find_all('div', class_='mxm-translatable-line-readonly'):
        parts = row.find_all(lyric_part_match)
        for position, part in enumerate(parts):
            # TODO: Need to add newlines between stanzas
            lang = lang_by_position[position]
            lyrics[lang].append(part.get_text() or '<br/>')

        # A row with only a Korean part leaves English one entry short; pad it to stay aligned.
        if len(parts) == 1 and len(lyrics['Korean']) != len(lyrics['English']):
            lyrics['English'].append('<br/>')

    return lyrics
def get_index(self, *args, **kwargs):
    """
    Call ``self._index`` with the given arguments and return its result parsed by ``soupify``.

    :param args: Positional arguments forwarded unchanged to ``self._index``
    :param kwargs: Keyword arguments forwarded unchanged to ``self._index``
    :return: The value produced by ``soupify`` for the index response
    """
    index_response = self._index(*args, **kwargs)
    return soupify(index_response)
def search(self, *args, **kwargs):
    """
    Call ``self._search`` with the given arguments and return its result parsed by ``soupify``.

    :param args: Positional arguments forwarded unchanged to ``self._search``
    :param kwargs: Keyword arguments forwarded unchanged to ``self._search``
    :return: The value produced by ``soupify`` for the search response
    """
    search_response = self._search(*args, **kwargs)
    return soupify(search_response)