def from_file(): urls = get_urls() print u'获取到如下链接列表' #print urls config.TOTAL_COUNT = len(urls) print u'共有', config.TOTAL_COUNT, u'个链接' count = int(get_count()) print u'上次爬取到第', int(count) + 1, u'个链接, 继续爬取' print u'输入 1 继续爬取,输入 2 重新爬取:' num = raw_input() if num == '2': count = 0 print u'开始重新爬取' if count < config.TOTAL_COUNT: for count in range(count, config.TOTAL_COUNT): write_count(count, config.COUNT_TXT) url = urls[count] print u'正在爬取第', count + 1, u'个网页, 共', config.TOTAL_COUNT, u'个' config.NOW_COUNT = count scrap(url) count = count + 1 print u'当前已完成采集', config.NOW_COUNT + 1, u'个, 共', config.TOTAL_COUNT, u'个' print u'采集结束,完成了', len(urls), u'个链接的采集' else: print u'链接上次已经全部爬取完毕'
def from_file(): urls = get_urls() print u'获取到如下链接列表' print urls config.TOTAL_COUNT = len(urls) print u'共有', config.TOTAL_COUNT, u'个链接' count = int(get_count()) print u'上次爬取到第', int(count) + 1, u'个链接, 继续爬取' print u'输入 1 继续爬取,输入 2 重新爬取:' num = raw_input() if num == '2': count = 0 print u'开始重新爬取' if count < config.TOTAL_COUNT: for count in range(count, config.TOTAL_COUNT): write_count(count, config.COUNT_TXT) url = urls[count] print u'正在爬取第', count+1, u'个网页, 共', config.TOTAL_COUNT, u'个' config.NOW_COUNT = count scrap(url) count = count + 1 print u'当前已完成采集', config.NOW_COUNT + 1, u'个, 共', config.TOTAL_COUNT, u'个' print u'采集结束,完成了', len(urls), u'个链接的采集' else: print u'链接上次已经全部爬取完毕'
def main(): urls = get_urls() print u'获取到如下链接列表' # print urls config.TOTAL_COUNT = len(urls) print u'共有', config.TOTAL_COUNT, u'个链接' count = int(get_count()) if count < config.TOTAL_COUNT: for count in range(count, config.TOTAL_COUNT): try: write_count(count, config.COUNT_TXT) url = urls[count] print u'正在爬取第', count + 1, u'个网页, 共', config.TOTAL_COUNT, u'个' config.NOW_COUNT = count html = crawl(url) fileName = 'file/full_page/page.' + str(count) print u'写入临时文件' write_to_txt(html, fileName, url) print u'当前已完成采集', config.NOW_COUNT + 1, u'个, 共', config.TOTAL_COUNT, u'个' js = commands.getstatusoutput( 'grep "<script>(function(w, d)" ' + fileName) if len(js) < 20: js = commands.getstatusoutput('grep "sellerId" ' + fileName) # commands.getstatusoutput('rm -f ' + fileName) count = count + 1 # 获取销售id, 商品id (sellerId, itemId) = get_itemId_sellerId(js) # 获取评论并写入文件 get_comments_by_sellerId_itemId(sellerId, itemId) except Exception as e: count = count + 1 print u'程序异常,跳过url: ' + url print e print u'采集结束,完成了', len(urls), u'个链接的采集' else: print u'链接上次已经全部爬取完毕'
def test_trans_rate(): urls = get_urls() for url in urls: html = crawl(url) json_res = get_transaction_rate(BeautifulSoup(html)) print json_res
def test_avg_price(): urls = get_urls() for url in urls: html = crawl(url) json_res = get_avg_price(BeautifulSoup(html)) print json_res
def batch_test(): urls = get_urls() for url in urls: json_res = get_crawled_result(url, True) # json_res = get_comments(url) print json_res