def crawl_dianping_shop(shop_id): global shop_count url = "http://www.dianping.com/shop/%d" % int(shop_id) #url = "http://www.dianping.com/shop/531684" #url = 'http://www.dianping.com/shop/2744077' print "shop url:%s" % url shop_count = shop_count + 1 print "shop_count=%d" % shop_count tstart = datetime.now() downloader = DownloadManager(None, None, None) error_msg, url, redirected_url, html = downloader.download(url) tend = datetime.now() c = tend - tstart print c if html is None: print "download error" return None # write file to local folder file_path = BASE_PATH + "shop/" + shop_id file = open(file_path,"wb") file.write(html) file.close() html_encoding_match = None regexp = re.compile('<\s*meta[^>]+charset=[\'"]?([^>]*?)[;\'">]', re.I) html_encoding_match = regexp.search(html) if html_encoding_match is not None: html_encoding = html_encoding_match.groups()[0].lower() if html_encoding == "gb2312": soup = BeautifulSoup(html, fromEncoding='GB18030') else: soup = BeautifulSoup(html) # basic info block shop_info_inner_blocks = soup.findAll(True, {'class': re.compile(r'\bshop-info-inner\b')}) for shop_info_inner_block in shop_info_inner_blocks: pass """shop_name_tag = shop_info_inner_block.findNext('h1')
def crawl_top_category_list(url, visited=None):
    """Crawl a category listing page, its shops, and the category pages it links to.

    Each page is downloaded with ``DownloadManager`` and parsed with
    ``BeautifulSoup``. Every ``/shop/<id>`` anchor found inside the
    ``<div id="searchList">`` element is handed to ``crawl_dianping_shop``.
    Every ``/search/category/...`` anchor on the page (for example
    ``/search/category/9/10/g473p2``) is then queued and processed the same
    way, in document order, depth first.

    Args:
        url: URL of the first category listing page to download.
        visited: Optional set of category URLs that have already been
            crawled. It is updated in place, so the same set can be passed
            to several calls to avoid downloading a page twice. Defaults to
            a fresh, empty set.

    Returns:
        None. Progress is reported with ``print`` and the module-level
        ``category_count`` is incremented once per downloaded page.
    """
    global category_count

    if visited is None:
        visited = set()

    # Compiled once; the shop pattern both selects the anchors and extracts
    # the numeric id from their href.
    shop_href_re = re.compile(r'/shop/(\d+)', re.I)
    category_href_re = re.compile(r'/search/category/.+', re.I)

    # Explicit work list instead of recursion: category pages link to each
    # other, so following every link recursively without remembering what was
    # already crawled never terminates.
    pending = [url]
    while pending:
        page_url = pending.pop()
        if page_url in visited:
            continue
        visited.add(page_url)

        tstart = datetime.now()
        downloader = DownloadManager(None, None, None)
        error_msg, fetched_url, redirected_url, html = downloader.download(page_url)
        elapsed = datetime.now() - tstart
        print("download time")
        print(elapsed)

        category_count = category_count + 1
        print("category_count=%d" % category_count)

        # Same failure handling as crawl_dianping_shop: skip pages that
        # could not be downloaded instead of trying to parse None.
        if html is None:
            print("download error")
            continue

        print("get list")
        soup = BeautifulSoup(html)

        # The result container may be absent (e.g. an empty or error page).
        shop_lists = soup.find("div", {"id": "searchList"})
        if shop_lists is not None:
            for link in shop_lists.findAll('a', href=shop_href_re):
                # search(), not match(): the anchor filter accepts the
                # pattern anywhere in the href, while match() only succeeds
                # at the very start and returns None for absolute URLs.
                found = shop_href_re.search(link['href'])
                if found is None:
                    continue
                shop_id = found.group(1)
                print("shop id:%s" % shop_id)
                crawl_dianping_shop(shop_id)
                print(link['href'])

        # Push in reverse so that pop() visits the links in document order,
        # matching the order the recursive version descended in.
        category_links = soup.findAll("a", href=category_href_re)
        for link in reversed(category_links):
            pending.append("http://www.dianping.com" + link['href'])