Example #1
def crawl_lyrics(singer):
    """逐页抓取歌词"""
    html = crawl_html(BASE_URL, params={'key': singer}, timeout=10)
    page_num = int(html.select('a.page-navigator-number')[-1].text)
    print('Singer %s has %d pages' % (singer, page_num))

    SINGER_DIR = os.path.join(BASE_DIR, singer)
    if not os.path.exists(SINGER_DIR):
        os.makedirs(SINGER_DIR)

    cnt = 1
    for i in range(page_num):
        params = {'key': singer, 'start': i * 20}
        content_list = crawl_html(BASE_URL, params=params,
                                  timeout=10).select('li.bb')
        for content in content_list:
            try:
                title = content.find(
                    'span', class_='song-title').find('a').text.strip()
                author = content.find('span',
                                      class_='author_list')['title'].strip()
                if singer not in author:
                    continue
                lyric = content.find(
                    'div', class_='lrc-content').find('p').text.strip()
                filename = title + ' - ' + author + ' - ' + '{0:0>4}'.format(
                    cnt) + '.txt'
                filename = filename.replace('/', '-')
                with openf(os.path.join(SINGER_DIR, filename), 'w') as f:
                    f.write(lyric)
                print(filename)
                cnt += 1
            except (AttributeError, TypeError):
                # skip entries missing the title/author/lyric markup
                pass
        print('Finished page %d' % (i + 1))
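
The snippets in these examples all call two helpers, crawl_html and openf, that are defined elsewhere in the project. Below is a minimal sketch of what they might look like, assuming requests and BeautifulSoup; the parameter names (params, timeout, headers, return_format) are taken from the call sites, everything else is an assumption, not the project's actual implementation.

import requests
from bs4 import BeautifulSoup


def crawl_html(url, params=None, headers=None, timeout=10,
               return_format='html'):
    # Fetch a URL; return parsed HTML by default, or decoded JSON.
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    if return_format == 'json':
        return resp.json()
    return BeautifulSoup(resp.text, 'html.parser')


def openf(path, mode='r'):
    # open() pinned to UTF-8 so lyrics/scripts round-trip cleanly (assumed).
    return open(path, mode, encoding='utf-8')
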
Example #2
def find_artist_ids():
    """Can only fetch the IDs of the top-100 artists."""
    url = 'http://music.163.com/api/artist/top?limit=100&offset=0'
    html = crawl_html(url, return_format='json', headers=headers)
    artists = html['artists']
    with openf(BASE_DIR + 'artists.txt', 'w') as fa:
        for artist in artists:
            artist_name = artist['name'].strip().replace(" ", "_")
            fa.write(artist_name + ' ' + str(artist['id']) + '\n')
Example #3
def crawl_lyrics(art_id):
    """Crawl all the lyrics of one artist."""
    html = crawl_html(START_URL.format(art_id), headers=headers)  # fetch the artist's album list first

    artist = html.find('h2', id='artist-name').text.replace(' ', '_').strip()
    artist_dir = BASE_DIR + artist
    if not os.path.exists(artist_dir):  # artist directory
        os.makedirs(artist_dir)
    print("歌手名:", artist)

    albums = html.find('ul', class_='m-cvrlst').find_all('a',
                                                         class_='msk')  # album list
    for album in albums:
        html = crawl_html(BASE_URL + album.get('href'),
                          headers=headers)  # then fetch the album's song list

        album_title = html.find('h2',
                                class_='f-ff2').text.replace(' ', '_').replace(
                                    '/', '_').strip()  # '/' would break the directory path
        album_dir = os.path.join(artist_dir, album_title)
        if not os.path.exists(album_dir):  # album directory
            os.mkdir(album_dir)
        print("  " + artist + "---" + album_title)

        links = html.find('ul', class_='f-hide').find_all('a')  # song list
        for link in links:
            song_name = link.text.replace(' ', '_').replace('/', '_').strip()
            song_id = link.get('href').split('=')[1]
            try:
                lyric_json = crawl_html(SONG_URL.format(song_id),
                                        return_format='json',
                                        headers=headers)  # fetch the lyric JSON
                lyric_text = lyric_json['lrc']['lyric']
                with openf(os.path.join(album_dir, song_name + '.txt'),
                           'w') as f:
                    f.write(lyric_text)
                print("    " + song_name + ", URL: " +
                      SONG_URL.format(song_id))
            except (KeyError, TypeError):
                # no 'lrc' field in the response means the song has no lyric
                print("    " + song_name + ": no lyric, URL: " +
                      SONG_URL.format(song_id))
        print()
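
The two functions above fit together: find_artist_ids writes "name id" lines (names with spaces replaced by '_'), and crawl_lyrics takes one of those IDs. A minimal driver sketch, assuming BASE_DIR, headers, START_URL, and SONG_URL are configured elsewhere in the module:

def main():
    # Dump the top-100 artists to artists.txt, then crawl each one.
    find_artist_ids()
    with openf(BASE_DIR + 'artists.txt', 'r') as fa:
        for line in fa:
            name, art_id = line.split()  # names contain no spaces, see above
            crawl_lyrics(art_id)


if __name__ == '__main__':
    main()
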
Example #4
def crawl_himym():
    all_links = get_all_links(9)  # 9 pages for HIMYM TV series
    print('Total links:', len(all_links))

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for link in all_links:
        all_ps, page_head = crawl_content(link)
        with openf(os.path.join(SCRIPT_DIR, page_head + '.txt'), 'w') as f:
            f.write(all_ps + '\n')
        print('Finished: ' + page_head)
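
crawl_himym relies on two helpers that are not shown: get_all_links(num_pages) collects per-episode links from a paginated index, and crawl_content(link) returns the transcript text plus a page heading used as the filename. A rough sketch under assumed page structure; the INDEX_URL pattern and the h1/a.episode selectors are placeholders, not the real site's layout:

INDEX_URL = 'https://example.com/himym/page/{}'  # placeholder pattern


def get_all_links(num_pages):
    # Collect per-episode links from each paginated index page (sketch).
    links = []
    for page in range(1, num_pages + 1):
        html = crawl_html(INDEX_URL.format(page))
        links.extend(a.get('href') for a in html.select('a.episode'))
    return links


def crawl_content(link):
    # Return (transcript text, page heading) for one episode page (sketch).
    html = crawl_html(link)
    page_head = html.find('h1').text.strip()
    all_ps = '\n'.join(p.text.strip() for p in html.find_all('p'))
    return all_ps, page_head
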
Example #5
def crawl_tbbt():
    all_links = get_all_links()
    print('Total links:', len(all_links))

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for link in all_links:
        all_ps, page_head = crawl_content(link)
        page_head = page_head.replace('/', '_')
        with openf(os.path.join(SCRIPT_DIR, page_head + '.txt'), 'w') as f:
            f.write(all_ps + '\n')
        print('Finished: ' + page_head)
Example #6
def crawl_imsdb():
    start_url = 'http://www.imsdb.com/all%20scripts/'
    paragraphs = crawl_html(start_url).find_all('p')

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for p in paragraphs:
        if p.a is None:  # some paragraphs carry no script link
            continue
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        if not script:
            continue
        # str.strip('.html') would trim characters, not the '.html' suffix
        if title.endswith('.html'):
            title = title[:-len('.html')]
        cur_filename = os.path.join(SCRIPT_DIR, title + '.txt')
        with openf(cur_filename, 'w') as f:
            f.write(script)
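
get_script is not shown either. A sketch of one plausible shape, reusing the crawl_html helper from earlier; the /scripts/ link pattern and the <pre> container are assumptions about IMSDb's markup, not verified against the live site:

def get_script(relative_link):
    # Resolve a movie's detail page to its raw script page and pull the
    # text out of the <pre> block (assumed markup).
    title = relative_link.split('/')[-1]
    try:
        detail = crawl_html('http://www.imsdb.com' + relative_link)
        script_href = detail.find(
            'a', href=lambda h: h and h.startswith('/scripts/'))['href']
        script_page = crawl_html('http://www.imsdb.com' + script_href)
        return title, script_page.find('pre').text
    except Exception:
        return title, None
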