Example #1
def crawl_lyrics(singer):
    """逐页抓取歌词"""
    html = crawl_html(BASE_URL, params={'key': singer}, timeout=10)
    page_num = int(html.select('a.page-navigator-number')[-1].text)
    print('Singer %s has %d pages' % (singer, page_num))

    SINGER_DIR = os.path.join(BASE_DIR, singer)
    if not os.path.exists(SINGER_DIR):
        os.makedirs(SINGER_DIR)

    cnt = 1
    for i in range(page_num):
        params = {'key': singer, 'start': i * 20}
        content_list = crawl_html(BASE_URL, params=params,
                                  timeout=10).select('li.bb')
        for content in content_list:
            try:
                title = content.find(
                    'span', class_='song-title').find('a').text.strip()
                author = content.find('span',
                                      class_='author_list')['title'].strip()
                if singer not in author:
                    continue
                lyric = content.find(
                    'div', class_='lrc-content').find('p').text.strip()
                filename = title + ' - ' + author + ' - ' + '{0:0>4}'.format(
                    cnt) + '.txt'
                filename = filename.replace('/', '-')
                with openf(os.path.join(SINGER_DIR, filename), 'w') as f:
                    f.write(lyric)
                print(filename)
                cnt += 1
            except (AttributeError, TypeError):
                # skip entries missing the title/author/lyric markup
                pass
        print('Finished page %d' % (i + 1))
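
The snippets in these examples all call two helpers, crawl_html and openf, that are defined elsewhere in the project. Below is a minimal sketch of what they might look like, assuming requests and BeautifulSoup; the parameter names (params, timeout, headers, return_format) are taken from the call sites, everything else is an assumption, not the project's actual implementation.

import requests
from bs4 import BeautifulSoup


def crawl_html(url, params=None, headers=None, timeout=10,
               return_format='html'):
    # Fetch a URL; return parsed HTML by default, or decoded JSON.
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    if return_format == 'json':
        return resp.json()
    return BeautifulSoup(resp.text, 'html.parser')


def openf(path, mode='r'):
    # open() pinned to UTF-8 so lyrics/scripts round-trip cleanly (assumed).
    return open(path, mode, encoding='utf-8')
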
Example #2
def find_artist_ids():
    """Can only fetch the IDs of the top-100 artists."""
    url = 'http://music.163.com/api/artist/top?limit=100&offset=0'
    html = crawl_html(url, return_format='json', headers=headers)
    artists = html['artists']
    with openf(BASE_DIR + 'artists.txt', 'w') as fa:
        for artist in artists:
            artist_name = artist['name'].strip().replace(" ", "_")
            fa.write(artist_name + ' ' + str(artist['id']) + '\n')
Example #3
def crawl_lyrics(art_id):
    """Crawl all the lyrics of one artist."""
    html = crawl_html(START_URL.format(art_id), headers=headers)  # fetch the artist's album list first

    artist = html.find('h2', id='artist-name').text.replace(' ', '_').strip()
    artist_dir = BASE_DIR + artist
    if not os.path.exists(artist_dir):  # artist directory
        os.makedirs(artist_dir)
    print("歌手名:", artist)

    albums = html.find('ul', class_='m-cvrlst').find_all('a',
                                                         class_='msk')  # album list
    for album in albums:
        html = crawl_html(BASE_URL + album.get('href'),
                          headers=headers)  # then fetch the album's song list

        album_title = html.find('h2',
                                class_='f-ff2').text.replace(' ', '_').replace(
                                    '/', '_').strip()  # '/' would break the directory path
        album_dir = os.path.join(artist_dir, album_title)
        if not os.path.exists(album_dir):  # album directory
            os.mkdir(album_dir)
        print("  " + artist + "---" + album_title)

        links = html.find('ul', class_='f-hide').find_all('a')  # song list
        for link in links:
            song_name = link.text.replace(' ', '_').replace('/', '_').strip()
            song_id = link.get('href').split('=')[1]
            try:
                lyric_json = crawl_html(SONG_URL.format(song_id),
                                        return_format='json',
                                        headers=headers)  # fetch the lyric JSON
                lyric_text = lyric_json['lrc']['lyric']
                with openf(os.path.join(album_dir, song_name + '.txt'),
                           'w') as f:
                    f.write(lyric_text)
                print("    " + song_name + ", URL: " +
                      SONG_URL.format(song_id))
            except (KeyError, TypeError):
                # no 'lrc' field in the response means the song has no lyric
                print("    " + song_name + ": no lyric, URL: " +
                      SONG_URL.format(song_id))
        print()
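
The two functions above fit together: find_artist_ids writes "name id" lines (names with spaces replaced by '_'), and crawl_lyrics takes one of those IDs. A minimal driver sketch, assuming BASE_DIR, headers, START_URL, and SONG_URL are configured elsewhere in the module:

def main():
    # Dump the top-100 artists to artists.txt, then crawl each one.
    find_artist_ids()
    with openf(BASE_DIR + 'artists.txt', 'r') as fa:
        for line in fa:
            name, art_id = line.split()  # names contain no spaces, see above
            crawl_lyrics(art_id)


if __name__ == '__main__':
    main()
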
Example #4
def crawl_himym():
    all_links = get_all_links(9)  # 9 pages for HIMYM TV series
    print('Total links:', len(all_links))

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for link in all_links:
        all_ps, page_head = crawl_content(link)
        with openf(os.path.join(SCRIPT_DIR, page_head + '.txt'), 'w') as f:
            f.write(all_ps + '\n')
        print('Finished: ' + page_head)
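
crawl_himym relies on two helpers that are not shown: get_all_links(num_pages) collects per-episode links from a paginated index, and crawl_content(link) returns the transcript text plus a page heading used as the filename. A rough sketch under assumed page structure; the INDEX_URL pattern and the h1/a.episode selectors are placeholders, not the real site's layout:

INDEX_URL = 'https://example.com/himym/page/{}'  # placeholder pattern


def get_all_links(num_pages):
    # Collect per-episode links from each paginated index page (sketch).
    links = []
    for page in range(1, num_pages + 1):
        html = crawl_html(INDEX_URL.format(page))
        links.extend(a.get('href') for a in html.select('a.episode'))
    return links


def crawl_content(link):
    # Return (transcript text, page heading) for one episode page (sketch).
    html = crawl_html(link)
    page_head = html.find('h1').text.strip()
    all_ps = '\n'.join(p.text.strip() for p in html.find_all('p'))
    return all_ps, page_head
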
Example #5
def crawl_tbbt():
    all_links = get_all_links()
    print('Total links:', len(all_links))

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for link in all_links:
        all_ps, page_head = crawl_content(link)
        page_head = page_head.replace('/', '_')
        with openf(os.path.join(SCRIPT_DIR, page_head + '.txt'), 'w') as f:
            f.write(all_ps + '\n')
        print('Finished: ' + page_head)
Example #6
def crawl_imsdb():
    start_url = 'http://www.imsdb.com/all%20scripts/'
    paragraphs = crawl_html(start_url).find_all('p')

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for p in paragraphs:
        if p.a is None:  # some paragraphs carry no script link
            continue
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        if not script:
            continue
        # str.strip('.html') would trim characters, not the '.html' suffix
        if title.endswith('.html'):
            title = title[:-len('.html')]
        cur_filename = os.path.join(SCRIPT_DIR, title + '.txt')
        with openf(cur_filename, 'w') as f:
            f.write(script)
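
get_script is not shown either. A sketch of one plausible shape, reusing the crawl_html helper from earlier; the /scripts/ link pattern and the <pre> container are assumptions about IMSDb's markup, not verified against the live site:

def get_script(relative_link):
    # Resolve a movie's detail page to its raw script page and pull the
    # text out of the <pre> block (assumed markup).
    title = relative_link.split('/')[-1]
    try:
        detail = crawl_html('http://www.imsdb.com' + relative_link)
        script_href = detail.find(
            'a', href=lambda h: h and h.startswith('/scripts/'))['href']
        script_page = crawl_html('http://www.imsdb.com' + script_href)
        return title, script_page.find('pre').text
    except Exception:
        return title, None
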