Python get_urls 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: lib.geturls

메소드/함수: get_urls

hotexamples.com에서의 예제들: 6

Python get_urls - 6개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 lib.geturls.get_urls에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

def from_file():
    urls = get_urls()
    print u'获取到如下链接列表'
    #print urls
    config.TOTAL_COUNT = len(urls)
    print u'共有', config.TOTAL_COUNT, u'个链接'
    count = int(get_count())
    print u'上次爬取到第', int(count) + 1, u'个链接, 继续爬取'
    print u'输入 1 继续爬取,输入 2 重新爬取:'
    num = raw_input()
    if num == '2':
        count = 0
        print u'开始重新爬取'
    if count < config.TOTAL_COUNT:
        for count in range(count, config.TOTAL_COUNT):
            write_count(count, config.COUNT_TXT)
            url = urls[count]
            print u'正在爬取第', count + 1, u'个网页, 共', config.TOTAL_COUNT, u'个'
            config.NOW_COUNT = count
            scrap(url)
            count = count + 1
            print u'当前已完成采集', config.NOW_COUNT + 1, u'个, 共', config.TOTAL_COUNT, u'个'
        print u'采集结束,完成了', len(urls), u'个链接的采集'
    else:
        print u'链接上次已经全部爬取完毕'

예제 #2

파일 보기

파일: main.py 프로젝트: walleleung/TaobaoComments

def from_file():
    urls = get_urls()
    print u'获取到如下链接列表'
    print urls
    config.TOTAL_COUNT = len(urls)
    print u'共有', config.TOTAL_COUNT, u'个链接'
    count = int(get_count())
    print u'上次爬取到第', int(count) + 1, u'个链接, 继续爬取'
    print u'输入 1 继续爬取,输入 2 重新爬取:'
    num = raw_input()
    if num == '2':
        count = 0
        print u'开始重新爬取'
    if count < config.TOTAL_COUNT:
        for count in range(count, config.TOTAL_COUNT):
            write_count(count, config.COUNT_TXT)
            url = urls[count]
            print u'正在爬取第', count+1, u'个网页, 共', config.TOTAL_COUNT, u'个'
            config.NOW_COUNT = count
            scrap(url)
            count = count + 1
            print u'当前已完成采集', config.NOW_COUNT + 1, u'个, 共', config.TOTAL_COUNT, u'个'
        print u'采集结束,完成了', len(urls), u'个链接的采集'
    else:
        print u'链接上次已经全部爬取完毕'

예제 #3

파일 보기

def main():
    urls = get_urls()
    print u'获取到如下链接列表'
    # print urls
    config.TOTAL_COUNT = len(urls)
    print u'共有', config.TOTAL_COUNT, u'个链接'
    count = int(get_count())
    if count < config.TOTAL_COUNT:
        for count in range(count, config.TOTAL_COUNT):
            try:
                write_count(count, config.COUNT_TXT)
                url = urls[count]
                print u'正在爬取第', count + 1, u'个网页, 共', config.TOTAL_COUNT, u'个'
                config.NOW_COUNT = count
                html = crawl(url)
                fileName = 'file/full_page/page.' + str(count)
                print u'写入临时文件'
                write_to_txt(html, fileName, url)
                print u'当前已完成采集', config.NOW_COUNT + 1, u'个, 共', config.TOTAL_COUNT, u'个'
                js = commands.getstatusoutput(
                    'grep "<script>(function(w, d)" ' + fileName)
                if len(js) < 20:
                    js = commands.getstatusoutput('grep "sellerId" ' +
                                                  fileName)
                # commands.getstatusoutput('rm -f ' + fileName)

                count = count + 1
                # 获取销售id, 商品id
                (sellerId, itemId) = get_itemId_sellerId(js)
                # 获取评论并写入文件
                get_comments_by_sellerId_itemId(sellerId, itemId)
            except Exception as e:
                count = count + 1
                print u'程序异常，跳过url: ' + url
                print e

        print u'采集结束,完成了', len(urls), u'个链接的采集'
    else:
        print u'链接上次已经全部爬取完毕'

예제 #4

파일 보기

파일: test_main.py 프로젝트: fangjq/webserver

def test_trans_rate():
    urls = get_urls()
    for url in urls:
        html = crawl(url)
        json_res = get_transaction_rate(BeautifulSoup(html))
        print json_res

예제 #5

파일 보기

파일: test_main.py 프로젝트: fangjq/webserver

def test_avg_price():
    urls = get_urls()
    for url in urls:
        html = crawl(url)
        json_res = get_avg_price(BeautifulSoup(html))
        print json_res

예제 #6

파일 보기

파일: test_main.py 프로젝트: fangjq/webserver

def batch_test():
    urls = get_urls()
    for url in urls:
        json_res = get_crawled_result(url, True)
        # json_res = get_comments(url)
        print json_res