# -*- coding: utf-8 -*-
import json
import re
import time

import requests
from bs4 import BeautifulSoup
from xpinyin import Pinyin

# Project-local helpers. The exact module paths are assumptions; the original
# file does not show its imports.
from config import HEADER_INFO, BAIDUMAP_AK
from db import DB
from mail import sendMail
from time_helper import timeHelper


class City:
    def __init__(self):
        self.headers = HEADER_INFO
        self.db = DB()

    # Fetch each city's Baidu Baike introduction.
    def get_city_intro(self):
        city_list = self.db.select_cityname()
        for x in city_list:
            url = 'https://baike.baidu.com/item/' + x[0]
            try:
                content = requests.get(url, headers=self.headers)
                print content.status_code
                html_contents = BeautifulSoup(content.content, 'html.parser')
                content = html_contents.find('div', class_='lemma-summary').text
                print content
                self.db.insert_city_intro(city=x[0], intro=content)
                time.sleep(0.5)
            except Exception as e:
                print e

    # Fetch each city's latitude and longitude from the Baidu Maps geocoder.
    def get_city_lat_lng(self):
        city_list = self.db.select_city_loc()
        for x in city_list:
            city_name = x[0]
            url = ("http://api.map.baidu.com/geocoder/v2/"
                   "?address=%s&output=json&ak=%s&callback=showLocation"
                   % (city_name, BAIDUMAP_AK))
            print url
            # Check before updating so that only rows whose latitude/longitude
            # are still empty get changed.
            location = self.getlal(url)
            print location
            self.db.insert_city_loc(location=location, city=x[0])

    # Resolve a geocoder URL to a [lat, lng] pair.
    def getlal(self, lal):
        try:
            time.sleep(0.02)
            res = requests.get(url=lal)
            # The response is JSONP, e.g.:
            # showLocation({"status":0,"result":{"location":{"lng":114.0259736573215,
            # "lat":22.546053546205248},"precise":0,"confidence":14,"level":"城市"}})
            json_data = res.text
            # Strip the showLocation(...) wrapper to get the JSON body.
            value_lal = re.findall(r'\([\s\S]*\)', json_data)[0][1:-1]
            jd = json.loads(value_lal)
            lat = jd['result']['location']['lat']  # latitude
            lng = jd['result']['location']['lng']  # longitude
            return [lat, lng]
        except Exception as e:
            print e.args
            print 'network error'
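
# A minimal standalone sketch, not part of the original crawler, of the JSONP
# parsing that City.getlal() performs. It uses a hard-coded sample payload
# (the one quoted in the comment above) so it runs without network access or
# an API key.
def _demo_parse_jsonp():
    payload = ('showLocation({"status":0,"result":{"location":'
               '{"lng":114.0259736573215,"lat":22.546053546205248},'
               '"precise":0,"confidence":14,"level":"\u57ce\u5e02"}})')
    body = re.findall(r'\([\s\S]*\)', payload)[0][1:-1]  # strip showLocation(...)
    loc = json.loads(body)['result']['location']
    return [loc['lat'], loc['lng']]  # [22.546053546205248, 114.0259736573215]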
class Food:
    def __init__(self, driver):
        self.driver = driver
        self.db = DB()
        self.time = timeHelper()

    # Pick the cities to crawl: those never crawled before, plus those whose
    # last crawl is more than seven days old.
    def get_city(self):
        city = self.db.select_crawltime()
        for x in city:
            if x[3] is None:
                # Record the crawl time, then crawl.
                self.db.insert_crawltime(time=self.time.now_time(), city=x[0],
                                         colunm_name='food_time_crawl')
                cityname = x[0].replace('市', '')
                self.crawl_food(cityname)
            else:
                days = self.time.time_helper(str(x[3]), self.time.now_time())
                print 'days elapsed:', days
                print 'checking whether more than seven days have passed'
                if days > 7:
                    self.db.insert_crawltime(time=self.time.now_time(), city=x[0],
                                             colunm_name='food_time_crawl')
                    cityname = x[0].replace('市', '')
                    self.crawl_food(cityname)
                else:
                    print 'skipping search'

    # Crawl the recipe search results for one city: picture, name and link.
    def crawl_food(self, city):
        url = 'http://home.meishichina.com/search/recipe/%s/?' % (city)
        self.driver.get(url)
        # Scroll to the bottom so lazily loaded content is rendered.
        js = "var q=document.documentElement.scrollTop=100000"
        self.driver.execute_script(js)
        page_content = BeautifulSoup(self.driver.page_source, 'html.parser')
        try:
            # Total result count; the site shows 20 results per page.
            all_pages = page_content.find('div', class_='ui_title_wrap').find('span').text
            all_page = re.findall(r'\d+', all_pages)[0]
            print all_page
            if int(round(float(all_page) / 20)) + 1 == 1:
                self.analyse_html(page_content=page_content, city=city)
            else:
                try:
                    for p in xrange(1, int(round(float(all_page) / 20)) + 1):
                        if p == 1:
                            self.analyse_html(page_content=page_content, city=city)
                        else:
                            # Click the "next page" ('下一页') button.
                            self.driver.find_element_by_link_text('下一页').click()
                            time.sleep(10)
                            js = "var q=document.documentElement.scrollTop=100000"
                            self.driver.execute_script(js)
                            page_content01 = BeautifulSoup(self.driver.page_source,
                                                           'html.parser')
                            self.analyse_html(page_content=page_content01, city=city)
                            time.sleep(5)
                except Exception as e:
                    print e
        except Exception as e:
            print 'page count not found'

    def analyse_html(self, page_content, city):
        try:
            content_list = page_content.find('div', id='search_res_list').find_all('li')
            for x in content_list:
                food_link = x.find('div', class_='pic').find('a')['href']
                food_pic = x.find('div', class_='pic').find('img')['src']
                food_name = x.find('div', class_='detail').find('h4').text
                # Skip entries that are already stored.
                data = self.db.if_exist(city_name=city + '市', table_name='food',
                                        title=food_name)
                print data
                if data == 0:
                    self.db.insert_food_mess(city=city + '市', food_link=food_link,
                                             food_pic=food_pic, food_name=food_name)
                else:
                    print 'already exists'
        except Exception as e:
            print e
            sendMail('food crawler failed, please check')
            print 'page content not found'
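
# Usage sketch, not in the original source: drive the Food spider once with a
# headless browser. PhantomJS is an assumption here; any Selenium webdriver
# supporting the old find_element_by_* API (Selenium < 4) would do, since
# this Python 2-era code relies on it.
def run_food_crawler():
    from selenium import webdriver
    driver = webdriver.PhantomJS()
    try:
        Food(driver).get_city()
    finally:
        driver.quit()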
class Scenic:
    def __init__(self, driver):
        self.driver = driver
        self.db = DB()
        self.time = timeHelper()

    # Pick cities whose scenic-spot crawl time is empty or more than seven
    # days old.
    def get_city(self):
        city = self.db.select_crawltime()
        for x in city:
            if x[1] is None:
                url = ("http://piao.qunar.com/ticket/list.htm"
                       "?keyword=%s&region=&from=mpl_search_suggest" % x[0])
                # Record the crawl time, then crawl.
                self.db.insert_crawltime(time=self.time.now_time(), city=x[0],
                                         colunm_name='scenic_time_crawl')
                self.crawl_pages(to_city=x[0], url=url)
            else:
                days = self.time.time_helper(str(x[1]), self.time.now_time())
                print 'checking whether more than seven days have passed'
                if days > 7:
                    url = ("http://piao.qunar.com/ticket/list.htm"
                           "?keyword=%s&region=&from=mpl_search_suggest" % x[0])
                    self.db.insert_crawltime(time=self.time.now_time(), city=x[0],
                                             colunm_name='scenic_time_crawl')
                    self.crawl_pages(to_city=x[0], url=url)

    # Crawl every scenic spot of a city: title, grade, address, popularity,
    # price, crawl time, monthly sales, source, introduction and picture.
    def crawl_pages(self, to_city, url):
        response = requests.get(url, headers=HEADER_INFO)
        if response.status_code == 200:
            # Read the total page count from the pager.
            content_tree = BeautifulSoup(response.content, 'html.parser')
            try:
                pages = content_tree.find('div', id='pager-container').find_all('a')[-2]
                all_pages = pages.text.strip()
                print all_pages
            except Exception as e:
                print 'only one page'
                all_pages = 1
            self.crawl_by_pages(to_city=to_city, pages=int(all_pages), url=url)
        else:
            print 'request failed'

    # Crawl page by page.
    def crawl_by_pages(self, to_city, pages, url):
        for q in range(1, pages + 1):
            to_url = url + "&page=%s" % q
            print 'crawling', to_url
            # Render the page with the webdriver and scroll to the bottom.
            self.driver.get(to_url)
            js = "var q=document.documentElement.scrollTop=100000"
            self.driver.execute_script(js)
            time.sleep(2)
            content_tree = BeautifulSoup(self.driver.page_source, 'html.parser')
            try:
                result_list = content_tree.find('div', class_='result_list').find_all(
                    'div', class_='sight_item sight_itempos')
                for item_list in result_list:
                    # Picture
                    try:
                        img_src = item_list.find('img', class_='img_opacity load')['src']
                    except Exception as e:
                        img_src = 'null'
                        print 'picture not found'
                    # Title
                    try:
                        title = item_list.find('h3', class_='sight_item_caption').text
                    except Exception as e:
                        title = '暂无标题'
                        print 'title not found'
                    # Price
                    try:
                        price = item_list.find('div',
                                               class_='sight_item_pop').text.split()[0][1:]
                        if price.isdigit():
                            price = float(price)
                        else:
                            price = float(0)
                    except Exception as e:
                        print 'price not found'
                        price = float(0)
                    # Detail URL
                    try:
                        urls = item_list.find('h3',
                                              class_='sight_item_caption').find('a')['href']
                    except Exception as e:
                        urls = 'null'
                        print 'link not found'
                    # Popularity
                    try:
                        hot = item_list.find('span',
                                             class_='product_star_level').text.split()[1]
                    except Exception as e:
                        hot = '0.0'
                        print 'popularity not found'
                    # Grade
                    try:
                        level = item_list.find('span', class_='level').text
                    except Exception as e:
                        print 'grade not found'
                        level = '无等级'
                    # Introduction
                    try:
                        intro = item_list.find('div', class_='intro').text
                    except Exception as e:
                        print 'introduction not found'
                        intro = '暂无介绍'
                    # Skip spots that are already stored.
                    num = self.db.if_exist(city_name=to_city, table_name='senic_spot',
                                           title=title)
                    if num == 0:
                        self.db.insert_scenic_mess(
                            senic_spot_name=title, introduction=intro, city=to_city,
                            price=price, pic=img_src, types='1000',
                            url='http://piao.qunar.com' + urls, levels=level, hot=hot)
                    else:
                        print 'already exists'
            except Exception as e:
                print '----error----'
                sendMail('scenic-spot crawler failed, please check')
                print e
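
# The timeHelper class is not shown in this file. A minimal sketch of what
# its time_helper(old, new) presumably computes -- whole days between two
# timestamps -- assuming the 'YYYY-MM-DD HH:MM:SS' format MySQL emits. Both
# the format and the semantics are assumptions, not the project's actual code.
def days_between(old, new):
    from datetime import datetime
    fmt = '%Y-%m-%d %H:%M:%S'
    return (datetime.strptime(new, fmt) - datetime.strptime(old, fmt)).days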
class Hotel:
    def __init__(self, driver):
        self.db = DB()
        self.pinyin = Pinyin()
        self.driver = driver
        self.time = timeHelper()

    # Pick cities whose hotel crawl time is empty or more than seven days old.
    def get_city(self):
        city = self.db.select_crawltime()
        for x in city:
            if x[2] is None:
                # Record the crawl time, then crawl.
                self.db.insert_crawltime(time=self.time.now_time(), city=x[0],
                                         colunm_name='hotel_time_crawl')
                self.to_pinyin(x[0])
            else:
                days = self.time.time_helper(str(x[2]), self.time.now_time())
                print 'days elapsed:', days
                print 'checking whether more than seven days have passed'
                if days > 7:
                    self.db.insert_crawltime(time=self.time.now_time(), city=x[0],
                                             colunm_name='hotel_time_crawl')
                    self.to_pinyin(x[0])
                else:
                    print 'skipping search'

    # Build the eLong list URL from the city's pinyin and start the crawl.
    def to_pinyin(self, city):
        cityname = city.replace('市', '')
        pinyins = self.pinyin.get_pinyin(cityname, '')
        to_url = 'http://hotel.elong.com/%s/' % pinyins
        print 'crawling site', to_url
        try:
            self.crawl_hotel(url=to_url, city_name=city)
        except Exception as e:
            print 'timed out'

    # Crawl the hotel list, paging with the "next" button.
    def crawl_hotel(self, url, city_name):
        self.driver.get(url)
        # Scroll to the bottom so lazily loaded content is rendered.
        js = "var q=document.documentElement.scrollTop=100000"
        self.driver.execute_script(js)
        time.sleep(2)
        # Read the page count from the pager.
        try:
            a_tag = BeautifulSoup(self.driver.page_source,
                                  'html.parser').find('div', id='pageContainer')
            page = a_tag.find_all('a')[-2].text
        except Exception as e:
            page = 0
        if page != 0:
            # Parse the current page first, then click through the rest.
            page = int(page) + 1
            print 'total pages:', page
            for x in xrange(1, page):
                print x
                try:
                    if x == 1:
                        content = BeautifulSoup(self.driver.page_source,
                                                'html.parser').find('div',
                                                                    id='hotelContainer')
                        hotel_list = content.find_all('div', class_='h_item')
                        for hl in hotel_list:
                            self.analyse_html(hl=hl, city=city_name)
                    else:
                        self.driver.find_element_by_class_name('page_next').click()
                        time.sleep(10)
                        js = "var q=document.documentElement.scrollTop=100000"
                        self.driver.execute_script(js)
                        content = BeautifulSoup(self.driver.page_source,
                                                'html.parser').find('div',
                                                                    id='hotelContainer')
                        hotel_list = content.find_all('div', class_='h_item')
                        for h2 in hotel_list:
                            self.analyse_html(hl=h2, city=city_name)
                except Exception as e:
                    print e
                    print 'no content'

    def analyse_html(self, hl, city):
        try:
            # Hotel name, with digits stripped for the stored copy.
            hotel_name = hl.find('p', class_='h_info_b1').text.strip()
            to_hotel_name = re.sub(r'\d*', '', hotel_name)
            # Price
            try:
                hotel_price = float(hl.find('span', class_='h_pri_num').text.strip())
            except Exception as e:
                hotel_price = float(0)
                print 'hotel price not found'
            # Address
            try:
                hotel_address = hl.find('p', class_='h_info_b2').text.strip()
            except Exception as e:
                hotel_address = '未发现'
                print 'hotel address not found'
            # Picture (the default prevents the unbound-variable error the
            # original code hit when no image was present)
            try:
                hotel_img = hl.find('div',
                                    class_='h_info_pic').find_all('img')[0]['src']
            except Exception as e:
                hotel_img = ''
                print 'picture not found'
            # Link (same defaulting as above)
            try:
                hotel_link = hl.find('p', class_='h_info_b1').find('a')['href']
            except Exception as e:
                hotel_link = ''
                print 'link not found'
            # Skip hotels that are already stored.
            num = self.db.if_exist(city_name=city, table_name='hotel',
                                   title=hotel_name)
            if num == 0:
                self.db.insert_hotel_mess(hotel_name=to_hotel_name,
                                          hotel_address=hotel_address,
                                          hotel_price=hotel_price,
                                          hotel_link=hotel_link,
                                          hotel_pic='http://hotel.elong.com' + hotel_img,
                                          city=city, source='艺龙网')
            else:
                print 'already exists'
        except Exception as e:
            print e
            print 'hotel name not found, skipping'
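
# Usage sketch, not in the original source: to_pinyin() relies on the xpinyin
# package, whose Pinyin.get_pinyin(chars, splitter) joins the pinyin of each
# character with the given splitter. With an empty splitter it yields the
# slug embedded in the eLong URL.
def demo_hotel_url():
    p = Pinyin()
    city = u'深圳市'.replace(u'市', u'')  # strip the '市' suffix as to_pinyin does
    return 'http://hotel.elong.com/%s/' % p.get_pinyin(city, '')  # .../shenzhen/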