def crawl_dianping_shop(shop_id): global shop_count url = "http://www.dianping.com/shop/%d" % int(shop_id) #url = "http://www.dianping.com/shop/531684" #url = 'http://www.dianping.com/shop/2744077' print "shop url:%s" % url shop_count = shop_count + 1 print "shop_count=%d" % shop_count tstart = datetime.now() downloader = DownloadManager(None, None, None) error_msg, url, redirected_url, html = downloader.download(url) tend = datetime.now() c = tend - tstart print c if html is None: print "download error" return None # write file to local folder file_path = BASE_PATH + "shop/" + shop_id file = open(file_path,"wb") file.write(html) file.close() html_encoding_match = None regexp = re.compile('<\s*meta[^>]+charset=[\'"]?([^>]*?)[;\'">]', re.I) html_encoding_match = regexp.search(html) if html_encoding_match is not None: html_encoding = html_encoding_match.groups()[0].lower() if html_encoding == "gb2312": soup = BeautifulSoup(html, fromEncoding='GB18030') else: soup = BeautifulSoup(html) # basic info block shop_info_inner_blocks = soup.findAll(True, {'class': re.compile(r'\bshop-info-inner\b')}) for shop_info_inner_block in shop_info_inner_blocks: pass """shop_name_tag = shop_info_inner_block.findNext('h1')
def crawl_top_category_list(url): global category_count # download this link for iphone5 # http://tech.sina.com.cn/z/iphone5/index.shtml #url = "http://www.dianping.com/search/category/9/10/g473" proxy = {'http' : '79.127.144.2:8080'} #downloader = DownloadManager(None, None, proxy) tstart = datetime.now() downloader = DownloadManager(None, None, None) error_msg, url, redirected_url, html = downloader.download(url) tend = datetime.now() c = tend - tstart print "download time" print c category_count = category_count + 1 print "category_count=%d" % category_count encoding_bug = None print "get list" soup = BeautifulSoup(html) shop_lists= soup.find("div", {"id":"searchList"}) #for item in shop_lists.dl: # print item #get all shops from shop_anchor_list = shop_lists.findAll('a', href=re.compile('/shop/(\d+)', re.I)) for link in shop_anchor_list: p = re.compile('/shop/(\d+)', re.I) m = p.match(link['href']) g = m.group(0) g = m.group(1) print "shop id:%s" % g crawl_dianping_shop(g) print link['href'] #get category #http://www.dianping.com/search/category/9/10/g473p2 #http://www.dianping.com/search/category/9/10/g473r45/g10g473r45 category_lists= soup.findAll("a", href=re.compile('/search/category/.+', re.I)) for link in category_lists: url = "http://www.dianping.com" + link['href'] crawl_top_category_list(url)
def __init__(self, user, pwd):
    """Set up per-user crawler state for the given account name."""
    # Trading thresholds; defaults here, then load_settings() reads the
    # user's .cfg file further down.
    self.BUY_SYSTEM_GOODS_PRICE_MAX = 0
    self.BUY_SYSTEM_GOODS_PRICE_MIN = 999999999
    self.STALL_GOODS_QUANTITY_MAX = 0
    self.BUY_USER_GOODS_PRICE_MAX = 0
    self.BUY_USER_GOODS_PRICE_MIN = 999999999
    self.STORE_GOODS_QUANTITY_MAX = 0
    self.MONEY_KEEP = 0
    # Site endpoints.
    self.URL_KAIXIN_HOME = "http://kaixin001.com/"
    self.URL_KAIXIN_LOGIN = '******'
    # Per-user resources: one downloader, one page database, one config.
    self.downloader = DownloadManager()
    self.kaixindb = WebpageDB(user + '.db')
    self.load_settings(user + '.cfg')
    # Unknown at construction time; assigned elsewhere after pages are
    # fetched.
    self.MY_STALL_ID = None
    self.MY_TOTAL_MONEY = None
import time
import json
import random
import re
import sys

from crawler.downloader import DownloadManager  # python-crawler
from crawler.webpage import WebPage  # python-crawler
from crawler.database import WebpageDB

import lxml.html  # python-lxml

# Module-level shared state: one downloader instance and an item-price cache.
downloader = DownloadManager()
item_prices = {}


def login(user, pwd):
    """Log in to kaixin001 as the given user.

    Fetches the home page, fills the first form on it with the credentials,
    then submits to the login endpoint.

    user -- account email address.
    pwd  -- account password.
    """
    # NOTE(review): download() is not defined in this module as shown;
    # presumably a helper wrapping downloader.download — confirm it exists
    # elsewhere in the file.
    url = "http://kaixin001.com/"
    error_msg, url, redirected_url, html = download(url)
    page = WebPage(url, html)
    action, fields = page.get_form(0)
    fields['email'] = user
    fields['password'] = pwd
    fields['remember'] = 0
    url = 'http://www.kaixin001.com/login/login.php'
    error_msg, url, redirected_url, html = download(url, fields)