import json
import math
import re
import time

import requests
from bs4 import BeautifulSoup
# Assumption: the Pinyin used below is xpinyin.Pinyin, whose
# get_pinyin(word, splitter) signature matches the call in Hotel.to_pinyin().
from xpinyin import Pinyin

# HEADER_INFO, BAIDUMAP_AK, DB, timeHelper and sendMail are project-level
# helpers; their import paths are project-specific and assumed to be available.


class City:
    def __init__(self):
        self.headers = HEADER_INFO
        self.db = DB()

    # Fetch each city's introduction from its Baidu Baike page
    def get_city_intro(self):
        city_list = self.db.select_cityname()
        for x in city_list:
            url = 'https://baike.baidu.com/item/' + x[0]
            try:
                response = requests.get(url, headers=self.headers)
                print(response.status_code)
                html_contents = BeautifulSoup(response.content, 'html.parser')
                intro = html_contents.find('div',
                                           class_='lemma-summary').text
                print(intro)
                self.db.insert_city_intro(city=x[0], intro=intro)
                time.sleep(0.5)
            except Exception as e:
                print(e)

    # Fetch each city's latitude and longitude via the Baidu Maps geocoder
    def get_city_lat_lng(self):
        city_list = self.db.select_city_loc()

        for x in city_list:
            city_name = x[0]
            url = "http://api.map.baidu.com/geocoder/v2/?address=%s&output=json&ak=%s&callback=showLocation" % (
                city_name, BAIDUMAP_AK)
            print(url)
            # check the table first so that only rows whose lat/lng is still
            # empty get updated
            location = self.getlal(url)
            print(location)
            self.db.insert_city_loc(location=location, city=x[0])

    # Resolve a latitude/longitude pair from the geocoder URL
    def getlal(self, url):
        try:
            time.sleep(0.02)
            res = requests.get(url=url)
            # {"status":0,"result":{"location":{"lng":114.0259736573215,"lat":22.546053546205248},"precise":0,"confidence":14,"level":"城市"}}
            json_data = res.text
            # strip the JSONP wrapper "showLocation(...)" down to the JSON payload
            value_lal = re.findall(r'\([\s\S]*\)', json_data)[0][1:-1]
            jd = json.loads(value_lal)
            # latitude
            lat = jd['result']['location']['lat']
            # longitude
            lng = jd['result']['location']['lng']
            return [lat, lng]
        except Exception as e:
            print(e.args)
            print('network error')
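
# Usage sketch: the geocoder URL requests JSONP (callback=showLocation), so
# the response body arrives wrapped as showLocation({...}); getlal() strips
# that wrapper with a regex before json.loads(). A self-contained
# illustration using the sample response quoted in getlal(), no network
# access needed; the function name is only for demonstration:
def _demo_unwrap_jsonp():
    body = ('showLocation({"status":0,"result":{"location":'
            '{"lng":114.0259736573215,"lat":22.546053546205248},'
            '"precise":0,"confidence":14,"level":"城市"}})')
    # keep only what sits between the outermost parentheses
    payload = re.findall(r'\(([\s\S]*)\)', body)[0]
    jd = json.loads(payload)
    print(jd['result']['location']['lat'], jd['result']['location']['lng'])
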
class Food:
    def __init__(self, driver):
        self.driver = driver
        self.db = DB()
        self.time = timeHelper()

    # Pick the cities whose food data needs (re)crawling
    def get_city(self):
        city = self.db.select_crawltime()
        for x in city:
            # x[3] holds the last food crawl time
            if x[3] is None:
                # record the crawl time, then crawl
                self.db.insert_crawltime(time=self.time.now_time(),
                                         city=x[0],
                                         colunm_name='food_time_crawl')
                cityname = x[0].replace('市', '')
                self.crawl_food(cityname)
            else:
                days = self.time.time_helper(str(x[3]), self.time.now_time())
                print('days since last crawl:', days)
                print('checking whether more than seven days have passed')
                if days > 7:
                    # record the crawl time, then crawl
                    self.db.insert_crawltime(time=self.time.now_time(),
                                             city=x[0],
                                             colunm_name='food_time_crawl')
                    cityname = x[0].replace('市', '')
                    self.crawl_food(cityname)
                else:
                    print('crawled within the last seven days, skipping')

    # Crawl the food data for one city
    # (takes the city name; collects each dish's picture and name)
    def crawl_food(self, city):
        url = 'http://home.meishichina.com/search/recipe/%s/?' % (city)
        self.driver.get(url)
        # scroll to the bottom of the page
        js = "var q=document.documentElement.scrollTop=100000"
        self.driver.execute_script(js)
        page_content = BeautifulSoup(self.driver.page_source, 'html.parser')

        try:
            all_pages = page_content.find(
                'div', class_='ui_title_wrap').find('span').text
            # total number of results
            all_page = re.findall(r'\d+', all_pages)[0]
            print(all_page)

            # 20 results per page, so ceiling division gives the page count
            total_pages = math.ceil(int(all_page) / 20)
            if total_pages == 1:
                self.analyse_html(page_content=page_content, city=city)
            # otherwise step through the remaining pages via the next button
            else:
                try:
                    for p in range(1, total_pages + 1):
                        if p == 1:
                            self.analyse_html(page_content=page_content,
                                              city=city)
                        else:
                            # find and click the "next page" button
                            self.driver.find_element_by_link_text(
                                '下一页').click()
                            time.sleep(10)
                            # scroll to the bottom of the page
                            js = "var q=document.documentElement.scrollTop=100000"
                            self.driver.execute_script(js)
                            page_content01 = BeautifulSoup(
                                self.driver.page_source, 'html.parser')
                            self.analyse_html(page_content=page_content01,
                                              city=city)
                            time.sleep(5)
                except Exception as e:
                    print(e)
        except Exception as e:
            print('page count not found')

    def analyse_html(self, page_content, city):
        try:
            content_list = page_content.find(
                'div', id='search_res_list').find_all('li')
            for x in content_list:
                food_link = x.find('div', class_='pic').find('a')['href']
                food_pic = x.find('div', class_='pic').find('img')['src']
                food_name = x.find('div', class_='detail').find('h4').text
                data = self.db.if_exist(city_name=city + '市',
                                        table_name='food',
                                        title=food_name)
                print(data)
                if data == 0:
                    self.db.insert_food_mess(city=city + '市',
                                             food_link=food_link,
                                             food_pic=food_pic,
                                             food_name=food_name)
                else:
                    print('already exists')
        except Exception as e:
            print(e)
            sendMail('food crawler error, please check')
            print('page content not found')
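
# Sketch of the page-count arithmetic in crawl_food() above: with 20 results
# per page (the figure assumed by the code), ceiling division yields the page
# count directly. A quick standalone check of the arithmetic:
def _demo_page_count():
    per_page = 20  # results per page, as assumed in crawl_food()
    for total in (1, 19, 20, 21, 45):
        # smallest number of pages that covers `total` results
        print(total, 'results ->', math.ceil(total / per_page), 'page(s)')
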
class Scenic:
    def __init__(self, driver):
        self.driver = driver
        self.db = DB()
        self.time = timeHelper()

    # Pick cities whose scenic data was never crawled, or was last crawled
    # more than seven days ago
    def get_city(self):
        city = self.db.select_crawltime()
        for x in city:
            # x[1] holds the last scenic crawl time
            if x[1] is None:
                url = "http://piao.qunar.com/ticket/list.htm?keyword=%s&region=&from=mpl_search_suggest" % x[
                    0]
                # record the crawl time
                self.db.insert_crawltime(time=self.time.now_time(),
                                         city=x[0],
                                         colunm_name='scenic_time_crawl')
                # start the crawl
                self.crawl_pages(to_city=x[0], url=url)
            else:
                days = self.time.time_helper(str(x[1]), self.time.now_time())
                print('checking whether more than seven days have passed')
                if days > 7:
                    url = "http://piao.qunar.com/ticket/list.htm?keyword=%s&region=&from=mpl_search_suggest" % x[
                        0]
                    # record the crawl time
                    self.db.insert_crawltime(time=self.time.now_time(),
                                             city=x[0],
                                             colunm_name='scenic_time_crawl')
                    # start the crawl
                    self.crawl_pages(to_city=x[0], url=url)

    # Crawl every scenic spot for the given city, including title, grade,
    # address, popularity, ticket price, crawl time, monthly sales, source,
    # introduction and picture; sleeps between requests to throttle
    def crawl_pages(self, to_city, url):

        response = requests.get(url, headers=HEADER_INFO)
        if response.status_code == 200:
            # read the total page count
            content_tree = BeautifulSoup(response.content, 'html.parser')
            try:
                pages = content_tree.find(
                    'div', id='pager-container').find_all('a')[-2]
                all_pages = pages.text.strip()
                print(all_pages)
            except Exception as e:
                print('only one page')
                all_pages = 1
            self.craw_by_pages(to_city=to_city, pages=int(all_pages), url=url)
        else:
            print('request failed')

    # Crawl page by page
    def craw_by_pages(self, to_city, pages, url):
        # build the URL for each page
        for q in range(1, pages + 1):
            to_url = url + "&page=%s" % q
            print('crawling', to_url)
            # fetch with the webdriver
            self.driver.get(to_url)
            # scroll to the bottom of the page
            js = "var q=document.documentElement.scrollTop=100000"
            self.driver.execute_script(js)
            time.sleep(2)
            # parse the rendered page
            content_tree = BeautifulSoup(self.driver.page_source,
                                         'html.parser')
            try:
                result_list = content_tree.find(
                    'div', class_='result_list').find_all(
                        'div', class_='sight_item sight_itempos')
                for item_list in result_list:
                    # picture
                    try:
                        img_src = item_list.find(
                            'img', class_='img_opacity load')['src']
                    except Exception as e:
                        img_src = 'null'
                        print('picture url not found')

                    # title
                    try:
                        title = item_list.find(
                            'h3', class_='sight_item_caption').text
                    except Exception as e:
                        title = '暂无标题'
                        print('title not found')
                    # price
                    try:
                        price = item_list.find(
                            'div', class_='sight_item_pop').text.split()[0][1:]
                        # float() accepts both integer and decimal price strings
                        price = float(price)
                    except Exception as e:
                        print('price not found')
                        price = float(0)
                    # detail url
                    try:
                        urls = item_list.find(
                            'h3',
                            class_='sight_item_caption').find('a')['href']
                    except Exception as e:
                        urls = 'null'
                        print('link not found')
                    # popularity
                    try:
                        hot = item_list.find(
                            'span',
                            class_='product_star_level').text.split()[1]
                    except Exception as e:
                        hot = '0.0'
                        print('popularity not found')
                    # grade
                    try:
                        level = item_list.find('span', class_='level').text
                    except Exception as e:
                        print('grade not found')
                        level = '无等级'
                    # introduction
                    try:
                        intro = item_list.find('div', class_='intro').text
                    except Exception as e:
                        print('introduction not found')
                        intro = '暂无介绍'

                    # skip spots that are already stored
                    num = self.db.if_exist(city_name=to_city,
                                           table_name='senic_spot',
                                           title=title)
                    if num == 0:
                        self.db.insert_scenic_mess(
                            senic_spot_name=title,
                            introduction=intro,
                            city=to_city,
                            price=price,
                            pic=img_src,
                            types='1000',
                            url='http://piao.qunar.com' + urls,
                            levels=level,
                            hot=hot)
                    else:
                        print('already exists')
            except Exception as e:
                print('----error----')
                sendMail('scenic crawler error, please check')
                print(e)
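
# Sketch of the pager selector used in crawl_pages() above: the total page
# count is read from the second-to-last link in the pager, since the last
# link is the "next page" button. The markup here is a simplified stand-in
# for Qunar's real pager, only to show the selector in isolation:
def _demo_pager_last_page():
    html = ('<div id="pager-container">'
            '<a>1</a><a>2</a><a>3</a><a>14</a><a>下一页</a>'
            '</div>')
    tree = BeautifulSoup(html, 'html.parser')
    last = tree.find('div', id='pager-container').find_all('a')[-2]
    print(int(last.text.strip()))  # -> 14
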
class Hotel:
    def __init__(self, driver):
        self.db = DB()
        self.pinyin = Pinyin()
        self.driver = driver
        self.time = timeHelper()

    # Pick the cities whose hotel data needs (re)crawling
    def get_city(self):
        city = self.db.select_crawltime()
        for x in city:
            # x[2] holds the last hotel crawl time
            if x[2] is None:
                # record the crawl time, then crawl
                self.db.insert_crawltime(time=self.time.now_time(),
                                         city=x[0],
                                         colunm_name='hotel_time_crawl')
                self.to_pinyin(x[0])
            else:
                days = self.time.time_helper(str(x[2]), self.time.now_time())
                print('days since last crawl:', days)
                print('checking whether more than seven days have passed')
                if days > 7:
                    # record the crawl time, then crawl
                    self.db.insert_crawltime(time=self.time.now_time(),
                                             city=x[0],
                                             colunm_name='hotel_time_crawl')
                    self.to_pinyin(x[0])
                else:
                    print('crawled within the last seven days, skipping')

    def to_pinyin(self, city):
        cityname = city.replace('市', '')
        pinyins = self.pinyin.get_pinyin(cityname, '')
        # build the url from the city's pinyin
        to_url = 'http://hotel.elong.com/%s/' % pinyins
        # call the crawler
        print('crawling site', to_url)
        try:
            self.crawl_hotel(url=to_url, city_name=city)
        except Exception as e:
            print('timeout')

    # Crawl the hotel listings
    def crawl_hotel(self, url, city_name):
        self.driver.get(url)
        # scroll to the bottom of the page
        js = "var q=document.documentElement.scrollTop=100000"
        self.driver.execute_script(js)
        time.sleep(2)
        # read the page count from the pager
        try:
            a_tag = BeautifulSoup(self.driver.page_source,
                                  'html.parser').find('div',
                                                      id='pageContainer')
            page = a_tag.find_all('a')[-2].text
        except Exception as e:
            page = 0

        # if a pager was found, walk through every page
        if page != 0:
            # parse the current page's content before clicking through
            page = int(page) + 1
            print('total pages:', page - 1)
            for x in range(1, page):
                print(x)
                try:
                    if x == 1:
                        content = BeautifulSoup(self.driver.page_source,
                                                'html.parser').find(
                                                    'div', id='hotelContainer')
                        hotel_list = content.find_all('div', class_='h_item')
                        for hl in hotel_list:
                            self.analyse_html(hl=hl, city=city_name)
                    else:
                        self.driver.find_element_by_class_name(
                            'page_next').click()
                        time.sleep(10)
                        # scroll to the bottom of the page
                        js = "var q=document.documentElement.scrollTop=100000"
                        self.driver.execute_script(js)
                        content = BeautifulSoup(self.driver.page_source,
                                                'html.parser').find(
                                                    'div', id='hotelContainer')
                        hotel_list = content.find_all('div', class_='h_item')
                        for h2 in hotel_list:
                            self.analyse_html(hl=h2, city=city_name)

                except Exception as e:
                    print(e)
                    print('no content')

    def analyse_html(self, hl, city):
        # hotel name
        try:
            hotel_name = hl.find('p', class_='h_info_b1').text.strip()
            # strip digits out of the hotel name before storing it
            to_hotel_name = re.sub(r'\d*', '', hotel_name)

            # price
            try:
                hotel_price = hl.find('span', class_='h_pri_num').text.strip()
                hotel_price = float(hotel_price)
            except Exception as e:
                hotel_price = float(0)
                print('hotel price not found')

            # address
            try:
                hotel_address = hl.find('p', class_='h_info_b2').text.strip()
            except Exception as e:
                hotel_address = '未发现'
                print('hotel address not found')

            # picture; default to an empty string so the insert below
            # cannot fail on an undefined name
            try:
                hotel_img = hl.find(
                    'div', class_='h_info_pic').find_all('img')[0]['src']
            except Exception as e:
                hotel_img = ''
                print('picture not found')

            # detail url; defaulted for the same reason as the picture
            try:
                hotel_link = hl.find('p', class_='h_info_b1').find('a')['href']
            except Exception as e:
                hotel_link = ''
                print('link not found')

            # skip hotels that are already stored
            num = self.db.if_exist(city_name=city,
                                   table_name='hotel',
                                   title=hotel_name)
            if num == 0:
                self.db.insert_hotel_mess(hotel_name=to_hotel_name,
                                          hotel_address=hotel_address,
                                          hotel_price=hotel_price,
                                          hotel_link=hotel_link,
                                          hotel_pic='http://hotel.elong.com' +
                                          hotel_img,
                                          city=city,
                                          source='艺龙网')
            else:
                print('already exists')

        except Exception as e:
            print(e)
            print('hotel name not found, skipping')
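
# Sketch of the URL assembly in to_pinyin() above, under the assumption that
# Pinyin is xpinyin.Pinyin: the trailing '市' is dropped and the pinyin
# syllables are joined with no separator to form the elong path.
def _demo_city_slug():
    p = Pinyin()
    slug = p.get_pinyin('深圳市'.replace('市', ''), '')
    # -> http://hotel.elong.com/shenzhen/
    print('http://hotel.elong.com/%s/' % slug)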