Example #1
def from_file():
    urls = get_urls()
    print u'Fetched the following list of URLs'
    #print urls
    config.TOTAL_COUNT = len(urls)
    print u'There are', config.TOTAL_COUNT, u'URLs in total'
    count = int(get_count())
    print u'Last run stopped at URL', int(count) + 1, u'- resuming'
    print u'Enter 1 to resume crawling, 2 to start over:'
    num = raw_input()
    if num == '2':
        count = 0
        print u'Starting over'
    if count < config.TOTAL_COUNT:
        for count in range(count, config.TOTAL_COUNT):
            write_count(count, config.COUNT_TXT)
            url = urls[count]
            print u'Crawling page', count + 1, u'of', config.TOTAL_COUNT
            config.NOW_COUNT = count
            scrap(url)
            count = count + 1
            print u'Collected', config.NOW_COUNT + 1, u'of', config.TOTAL_COUNT, u'pages so far'
        print u'Crawl finished,', len(urls), u'URLs collected'
    else:
        print u'All URLs were already crawled in the previous run'
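get_count and write_count are not defined in these examples; they only persist the index of the last processed URL so an interrupted crawl can resume. A minimal sketch of what they might look like, assuming a plain text file holds the counter (the default path and the zero fallback are guesses, not the project's config.COUNT_TXT logic):

def get_count(path='file/count.txt'):
    # hypothetical helper: read the index saved by a previous run, default to 0
    try:
        with open(path) as f:
            return int(f.read().strip() or 0)
    except IOError:
        return 0

def write_count(count, path='file/count.txt'):
    # hypothetical helper: persist the current index so a later run can resume
    with open(path, 'w') as f:
        f.write(str(count))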
Example #2
def from_file():
    urls = get_urls()
    print u'Fetched the following list of URLs'
    print urls
    config.TOTAL_COUNT = len(urls)
    print u'There are', config.TOTAL_COUNT, u'URLs in total'
    count = int(get_count())
    print u'Last run stopped at URL', int(count) + 1, u'- resuming'
    print u'Enter 1 to resume crawling, 2 to start over:'
    num = raw_input()
    if num == '2':
        count = 0
        print u'Starting over'
    if count < config.TOTAL_COUNT:
        for count in range(count, config.TOTAL_COUNT):
            write_count(count, config.COUNT_TXT)
            url = urls[count]
            print u'Crawling page', count + 1, u'of', config.TOTAL_COUNT
            config.NOW_COUNT = count
            scrap(url)
            count = count + 1
            print u'Collected', config.NOW_COUNT + 1, u'of', config.TOTAL_COUNT, u'pages so far'
        print u'Crawl finished,', len(urls), u'URLs collected'
    else:
        print u'All URLs were already crawled in the previous run'
Example #3
def main():
    urls = get_urls()
    print u'Fetched the following list of URLs'
    # print urls
    config.TOTAL_COUNT = len(urls)
    print u'There are', config.TOTAL_COUNT, u'URLs in total'
    count = int(get_count())
    if count < config.TOTAL_COUNT:
        for count in range(count, config.TOTAL_COUNT):
            try:
                write_count(count, config.COUNT_TXT)
                url = urls[count]
                print u'Crawling page', count + 1, u'of', config.TOTAL_COUNT
                config.NOW_COUNT = count
                html = crawl(url)
                fileName = 'file/full_page/page.' + str(count)
                print u'Writing temporary file'
                write_to_txt(html, fileName, url)
                print u'Collected', config.NOW_COUNT + 1, u'of', config.TOTAL_COUNT, u'pages so far'
                # grep the saved page for the script block that carries the
                # seller/item ids; getstatusoutput returns (status, output)
                js = commands.getstatusoutput(
                    'grep "<script>(function(w, d)" ' + fileName)
                if len(js[1]) < 20:
                    # fall back to grepping for sellerId directly
                    js = commands.getstatusoutput('grep "sellerId" ' +
                                                  fileName)
                # commands.getstatusoutput('rm -f ' + fileName)

                count = count + 1
                # extract the seller id and item id
                (sellerId, itemId) = get_itemId_sellerId(js)
                # fetch the comments and write them to a file
                get_comments_by_sellerId_itemId(sellerId, itemId)
            except Exception as e:
                count = count + 1
                print u'Exception raised, skipping url: ' + url
                print e

        print u'Crawl finished,', len(urls), u'URLs collected'
    else:
        print u'All URLs were already crawled in the previous run'
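The helper get_itemId_sellerId is not shown in these excerpts. Since commands.getstatusoutput (Python 2 only; Python 3 moved it to subprocess.getstatusoutput) returns a (status, output) tuple, the parser has to dig the ids out of the output element. A rough sketch of what it might do; the regexes are assumptions about the page markup, not the original code:

import re

def get_itemId_sellerId(js):
    # hypothetical parser: js is the (status, output) tuple returned by
    # commands.getstatusoutput, so the grep text lives in js[1]
    text = js[1] if isinstance(js, tuple) else js
    seller = re.search(r'sellerId\D*(\d+)', text)
    item = re.search(r'itemId\D*(\d+)', text)
    return (seller.group(1) if seller else None,
            item.group(1) if item else None)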
Example #4
def test_trans_rate():
    urls = get_urls()
    for url in urls:
        html = crawl(url)
        json_res = get_transaction_rate(BeautifulSoup(html))
        print json_res
Example #5
def test_avg_price():
    urls = get_urls()
    for url in urls:
        html = crawl(url)
        json_res = get_avg_price(BeautifulSoup(html))
        print json_res
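test_trans_rate and test_avg_price both hand the parsed page to an extractor (get_transaction_rate / get_avg_price) that lives elsewhere in the project. Purely as an illustration of that shape, and assuming the bs4 flavour of BeautifulSoup, such an extractor might look like the sketch below; the tag, class name and JSON key are assumptions about the target page, not the real helpers:

import json

def get_avg_price(soup):
    # hypothetical extractor: pull a price node out of the already-parsed
    # page and return it as a JSON string, matching how the tests print json_res
    node = soup.find('span', class_='price')
    price = node.get_text(strip=True) if node else None
    return json.dumps({'avg_price': price}, ensure_ascii=False)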
Example #6
def batch_test():
    urls = get_urls()
    for url in urls:
        json_res = get_crawled_result(url, True)
        # json_res = get_comments(url)
        print json_res
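None of these test helpers is wired to an entry point in the excerpts above; assuming they all live in the same module, a minimal way to run them is:

if __name__ == '__main__':
    # run the end-to-end batch check by default; swap in test_avg_price()
    # or test_trans_rate() to exercise a single extractor
    batch_test()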