def parse_genre(url):
    """Parse a genre page into ``(name, engname, intro)``.

    The heading text comes from the ``h3.bigtext`` element and is split
    into two parts by ``left_partition``.  The introduction is built from
    the direct children of the ``div`` with class ``content fold``: text
    nodes are stripped and kept, every element node becomes ``<br/>``,
    and the pieces are joined with newlines.
    """
    page = fetch_soup(url)
    heading = phyhtml.tag_of_class(page, 'h3', 'bigtext').text.strip()
    name, engname = left_partition(heading)

    content = phyhtml.tag_of_class(page, 'div', 'content fold')
    # Children without a tag name are plain text nodes; anything else is
    # markup and is flattened to a line break.
    pieces = [
        '<br/>' if node.name is not None else node.strip()
        for node in content
    ]
    return name, engname, '\n'.join(pieces)
def search_artist(key, func, page=None, size=30, filter=None):
    """Search artists by ``key`` and hand the results to ``func``.

    With ``page`` given, only that result page is processed and the value
    of ``fetch_artists`` is returned.  Without it, the total hit count is
    read from the first results page and every page from 1 to
    ``ceil(total / size)`` is processed in turn; that path returns None.

    ``filter`` is passed through to ``fetch_artists`` unchanged.
    """
    # NOTE(review): this artist search formats SEARCH_SONG_URL* constants;
    # confirm against their definitions that this is intended.
    if page is None:
        soup = fetch_soup(SEARCH_SONG_URL.format(key))
        counter = phyhtml.tag_of_class(soup, 'p', 'seek_counts ok')
        total = int(counter.next.next.text)
        last_page = int(math.ceil(total / size))
        for number in range(1, last_page + 1):
            search_artist(key, func, number, size, filter)
        return None

    paged_url = SEARCH_SONG_URL_PAGED.format(page, key)
    return fetch_artists(paged_url, func, filter)
def search_album(key, func, page=None, size=30, filter=None):
    """Search albums by ``key`` and hand the results to ``func``.

    With ``page`` given, only that result page is processed through
    ``fetch_albums``.  Without it, the total hit count is read from the
    first results page and every page from 1 to ``ceil(total / size)`` is
    processed in turn.

    ``func`` and ``filter`` are passed through to ``fetch_albums``
    unchanged.  Always returns None.
    """
    if page is not None:
        print("fetch album in page:", page)
        fetch_albums(SEARCH_ALBUM_URL_PAGED.format(page, key), func, filter)
        # A single-page request must stop here.  Falling through would
        # reload the first results page and recurse over every page again
        # from inside each per-page call, which never terminates.
        return

    soup = fetch_soup(SEARCH_ALBUM_URL.format(key))
    total = int(phyhtml.tag_of_class(soup, 'p', 'seek_counts').next.next.text)
    count = int(math.ceil(total / size))
    print("found", total, 'albums in', count, 'pages')
    for page in range(1, count + 1):
        search_album(key, func, page, size, filter)
def parse_artist(url):
    """Parse an artist page.

    Returns ``(name, alias, cover, location, genres, intro)``:

    * ``name`` / ``alias`` come from the children of the first ``h1``
      (text node -> name, nested element -> alias with quotes stripped).
    * ``cover`` is the ``href`` of the ``a#cover_lightbox`` link.
    * ``location`` and ``genres`` come from the label/value cell pairs of
      the ``div#artist_info`` table; ``genres`` is a list of
      ``(left_partition(text), href)`` tuples.
    * ``intro`` is None unless the table links to a profile page, in
      which case that page is loaded and its text is collected.
    """
    page = fetch_soup(url)

    name = None
    alias = None
    for node in page.find('h1'):
        if node.name is not None:
            alias = node.text.strip().strip('“”"')
        else:
            name = node.strip()

    location = None
    profile = None
    genres = []
    # The info table alternates label cells and value cells; ``pending``
    # holds the label whose value cell is expected next.
    pending = None
    for cell in page.find('div', dict(id='artist_info')).find_all('td'):
        if pending is None:
            pending = cell.text.strip()
            continue
        if pending == '地区:':  # region
            location = right_partition(cell.text.strip())
        elif pending == '风格:':  # genres
            genres.extend(
                (left_partition(link.text.strip()),
                 escape_href(link['href'], page))
                for link in cell.find_all('a')
            )
        elif pending == '档案:':  # profile
            more = phyhtml.tag_of_class(cell, 'a', 'more')
            if more:
                profile = escape_href(more['href'], page)
        pending = None

    cover = page.find('a', {'id': 'cover_lightbox'})['href']

    intro = None
    if profile:
        detail = fetch_soup(profile)
        block = phyhtml.tag_of_class(detail, 'div', 'profile')
        if block:
            # Preferred layout: keep each paragraph's markup verbatim.
            pieces = [str(paragraph) for paragraph in block.find_all('p')]
        else:
            # Fallback layout: plain text nodes separated by <br> tags.
            pieces = []
            for node in detail.find('div', {'id': 'artist-record'}):
                if node.name is None:
                    pieces.append(node.strip())
                elif node.name == 'br':
                    pieces.append('<br/>')
        intro = ''.join(pieces)

    return name, alias, cover, location, genres, intro
def parse_song(url):
    """Parse a song page.

    Returns ``(title, alias, album, artists, lyricist, composer,
    arranger, lyric)``:

    * ``title`` / ``alias`` come from the children of the first ``h1``
      (text node -> title, nested element -> alias with quotes stripped).
    * ``album`` is ``(text, href_or_None)``; ``artists`` is a list of
      ``(name, href)`` tuples; the remaining credits are plain text.
      All of these are read from the label/value cell pairs inside the
      ``album_relation`` block of ``div#main``.
    * ``lyric`` concatenates the children of the ``lrc_main`` block, with
      every element node replaced by ``<br/>``.
    """
    page = fetch_soup(url)

    title = None
    alias = None
    for node in page.find('h1'):
        if node.name is not None:
            alias = node.text.strip().strip('“”"')
        else:
            title = node.strip()

    main = page.find('div', {'id': 'main'})

    album = None
    lyricist = None
    composer = None
    arranger = None
    artists = []
    # Cells alternate between a label and its value; ``pending`` holds the
    # label whose value cell is expected next.
    pending = None
    relation = phyhtml.tag_of_class(main, 'div', 'album_relation')
    for cell in relation.find_all('td'):
        text = cell.text.strip()
        if pending is None:
            pending = text
            continue
        if pending == '所属专辑:':  # album
            link = cell.find('a')
            href = escape_href(link['href'], page) if link else None
            album = (text, href)
        elif pending == '作词:':  # lyricist
            lyricist = text
        elif pending == '演唱者:':  # performers
            for link in cell.find_all('a'):
                singer = link.text.strip()
                if singer:
                    artists.append((singer, escape_href(link['href'], page)))
        elif pending == '作曲:':  # composer
            composer = text
        elif pending == '编曲:':  # arranger
            arranger = text
        pending = None

    # Text nodes are kept (stripped); any element becomes a line break.
    pieces = [
        '<br/>' if node.name is not None else node.strip()
        for node in phyhtml.tag_of_class(main, 'div', 'lrc_main')
    ]
    lyric = ''.join(pieces)

    return title, alias, album, artists, lyricist, composer, arranger, lyric
def do_filter(li):
    """Tell whether the ``a.singer`` link inside ``li`` names ``artist``.

    ``artist`` is not a parameter; it is resolved from an outer scope that
    is not part of this block.  Returns None (falsy) when ``li`` has no
    such link, otherwise the result of comparing the link text with
    ``artist``.
    """
    singer_link = phyhtml.tag_of_class(li, 'a', 'singer')
    if not singer_link:
        return None
    return singer_link.text == artist
def fetch_albums(url, func, filter=None):
    """Parse every album listed on the page at ``url`` and pass it to ``func``.

    Each ``li`` inside the ``albumBlock_list`` block is one candidate.  If
    ``filter`` is truthy it is called with the ``li`` element and
    candidates for which it returns a falsy value are skipped.  For the
    rest, the ``href`` of the node following the ``p.cover`` element is
    parsed with ``parse_album`` and the result is given to ``func``.
    """
    page = fetch_soup(url)
    listing = phyhtml.tag_of_class(page, 'div', 'albumBlock_list')
    for item in listing.find_all('li'):
        if filter and not filter(item):
            continue
        cover = phyhtml.tag_of_class(item, 'p', 'cover')
        album_url = cover.next['href']
        func(parse_album(album_url))