예제 #1
0
    def get_detail(self, shop_id, request_type='proxy, cookie'):
        """
        Fetch one shop page and extract its basic information.

        The page is requested via ``requests_util.get_requests``. If the
        response status is 403, the operator is prompted to solve the captcha
        in a browser and type ``y``; ``requests_util.update_cookie()`` is then
        called and the request is repeated once. The page text is passed
        through ``get_search_map_file`` / ``requests_util.replace_search_html``
        before being parsed.

        Field extraction is best-effort: any field that cannot be located
        keeps the placeholder ``'-'``.

        :param shop_id: shop identifier; appended to the shop URL
        :param request_type: forwarded to ``requests_util.get_requests``
        :return: dict holding the shop id, name, review count, average price,
            address, phone and other info (keys are the Chinese column names)
        :raises IndexError: when the parsed page contains no ``.main`` element
        """
        import time

        def first_text(node, selector):
            """Return the stripped text of the first match, or '-' if absent."""
            try:
                return node.select(selector)[0].text.strip()
            except Exception:
                return '-'

        url = 'http://www.dianping.com/shop/' + str(shop_id)
        r = requests_util.get_requests(url, request_type=request_type)
        if r.status_code == 403:
            print('检查浏览器,处理验证码,替换cookie,输入y解除限制', url)
            # Block until the operator confirms the captcha has been handled.
            while input() != 'y':
                time.sleep(1)
            requests_util.update_cookie()
            r = requests_util.get_requests(url, request_type=request_type)
        text = r.text
        # Fetch the mapping for the obfuscated text, then substitute it.
        file_map = get_search_map_file(text)
        text = requests_util.replace_search_html(text, file_map)
        html = BeautifulSoup(text, 'lxml')

        # Layout 1 (mostly restaurants).
        # Deliberately not guarded: a page without `.main` raises IndexError
        # so that callers can treat the fetch as failed.
        main_info = html.select('.main')[0]

        shop_name = '-'
        review_count = '-'
        avg_price = '-'
        address = '-'
        phone = '-'
        other_info = '-'

        try:
            base_info = main_info.select('#basic-info')[0]
        except Exception:
            # No #basic-info block: every field keeps its placeholder.
            base_info = None

        if base_info is not None:
            try:
                shop_name = base_info.select('.shop-name')[0].text
                # Strip link captions that trail the title (e.g. promo links).
                for link in base_info.select('a'):
                    shop_name = shop_name.replace(link.text, '')
                shop_name = shop_name.strip()
            except Exception:
                shop_name = '-'

            try:
                brief_info = main_info.select('.brief-info')[0]
            except Exception:
                # TODO: prompt for the manual slider unlock in this case.
                brief_info = None

            # As before, the remaining fields are only read when .brief-info
            # exists, even though some of them are looked up on main_info.
            if brief_info is not None:
                # TODO: the score is served by a separate JSON endpoint with
                # JS-encrypted parameters and is not parsed here yet.
                review_count = first_text(brief_info, '#reviewCount')
                avg_price = first_text(brief_info, '#avgPriceTitle')
                try:
                    address = main_info.find(attrs={
                        'itemprop': 'street-address'
                    }).text.strip()
                except Exception:
                    address = '-'
                phone = first_text(main_info, '.tel')
                try:
                    other_info = main_info.select('.other')[0].text.replace(
                        '修改', '').strip()
                except Exception:
                    other_info = '-'
            # TODO: promotion info comes from a separate JS-encrypted endpoint.

        # TODO: layout 2 (mostly hotels) is not obfuscated and fails while the
        # mapping file is resolved; it is rare, so it is not handled here.
        detail_info = {
            '店铺id': shop_id,
            '店铺名': shop_name,
            '评论总数': review_count,
            '人均价格': avg_price,
            '店铺地址': address,
            '店铺电话': phone,
            '其他信息': other_info
        }
        return detail_info
예제 #2
0
    def search(self, key_word, only_need_first=True, needed_pages=50):
        """
        Search by keyword (or by the configured custom URL) and save results.

        For every result page the shop entries are parsed into rows. Unless
        ``only_need_first`` is set, each row is saved through
        ``self.saver.save_data`` and, depending on ``self.need_detail`` /
        ``self.need_comment``, the shop detail and reviews are fetched and
        saved as well.

        :param key_word: search keyword; ignored in favour of
            ``self.custom_search_url`` when that attribute is non-empty
        :param only_need_first: when ``True`` only the first entry of the
            first page is parsed and nothing is saved
        :param needed_pages: number of result pages to walk through
        :return: None
        :raises AssertionError: when ``key_word`` is not a ``str``, or is
            blank while no custom search URL is configured
        :raises IndexError: when a page contains no ``.shop-list`` element
        """

        def first(node, *selectors):
            """Follow ``selectors`` from ``node``, taking the first match each time."""
            for selector in selectors:
                node = node.select(selector)[0]
            return node

        def attempt(extract, default='-'):
            """Return ``extract()``, or ``default`` when the markup is missing."""
            try:
                return extract()
            except Exception:
                return default

        def parse_shop(shop):
            """Turn one result ``li`` into the flat row that gets saved."""
            image_path = attempt(
                lambda: first(shop, '.pic', 'a', 'img')['src'])
            shop_id = attempt(
                lambda: first(shop, '.txt', '.tit', 'a')['data-shopid'])
            detail_url = attempt(
                lambda: first(shop, '.txt', '.tit', 'a')['href'])
            name = attempt(
                lambda: first(shop, '.txt', '.tit', 'a').text.strip())
            # Two rating layouts exist: an icon whose second CSS class ends in
            # "_<number>" (divided by 10 here), or a plain numeric score.
            try:
                icon_class = first(
                    shop, '.txt', '.comment', '.star_icon', 'span')['class'][1]
                star_point = str(float(icon_class.split('_')[1]) / 10)
            except Exception:
                star_point = '-'
            try:
                # If the node exists but its text is not numeric, the raw text
                # is kept, exactly as the previous implementation did.
                star_point = first(shop, '.txt', '.comment', '.star_score').text
                star_point = str(float(star_point))
            except Exception:
                pass
            review_number = attempt(
                lambda: first(shop, '.txt', '.comment',
                              '.review-num').text.replace('\n', ''))
            mean_price = attempt(
                lambda: first(shop, '.txt', '.comment', '.mean-price',
                              'b').text,
                '¥0')
            try:
                tags = first(shop, '.txt', '.tag-addr').select('.tag')
                tag1 = tags[0].text.replace('\n', ' ').strip()
                tag2 = tags[1].text.replace('\n', ' ').strip()
            except Exception:
                # Both tags are reset when either one is missing.
                tag1 = '-'
                tag2 = '-'
            addr = attempt(
                lambda: first(shop, '.txt', '.tag-addr',
                              '.addr').text.replace('\n', ' ').strip())
            recommend = attempt(
                lambda: first(shop, '.recommend').text.replace(
                    '\n', ' ').strip())
            commend_list = attempt(
                lambda: first(shop, '.comment-list').text.replace(
                    '\n', ' ').strip())
            # The last two items are the detail / review success flags.
            return [
                shop_id, name, star_point, review_number, mean_price, tag1,
                tag2, addr, recommend, commend_list, image_path,
                detail_url, 1, 1
            ]

        # TODO: a plain search needs neither detail pages, reviews nor cookies.
        assert isinstance(key_word, str)
        # The previous check (`key_word != None or ...`) was always true for a
        # str. The keyword only matters when no custom URL replaces it.
        if self.custom_search_url == '':
            assert key_word.strip() != ''
        else:
            key_word = self.custom_search_url
        logger.info('开始搜索:' + key_word)
        for i in tqdm(range(1, needed_pages + 1), desc='页数'):
            # When only the first entry is wanted, stop after page 1.
            if only_need_first is True and i != 1:
                break

            if self.custom_search_url != '':
                url = self.custom_search_url + str(i)
            else:
                url = 'http://www.dianping.com/search/keyword/' + str(
                    self.location_id) + '/' + str(
                        self.channel_id) + '_' + str(key_word) + '/p' + str(i)
            r = requests_util.get_requests(url)
            text = r.text
            # Fetch the mapping for the obfuscated text, then substitute it.
            file_map = get_search_map_file(text)
            text = requests_util.replace_search_html(text, file_map)

            html = BeautifulSoup(text, 'lxml')
            shop_all_list = html.select('.shop-list')[0].select('li')

            # NOTE(review): collected per page but never read or returned.
            search_res = []
            for shop in shop_all_list:
                one_step_search_res = parse_shop(shop)
                shop_id = one_step_search_res[0]
                name = one_step_search_res[1]
                search_res.append(one_step_search_res)
                # Only the first entry is wanted: stop before any saving.
                if only_need_first is True:
                    break
                # The row ends with two int flags, so stringify before joining.
                row_text = '\n' + ','.join(map(str, one_step_search_res)) + '\n'
                # Detail page.
                if self.need_detail == '1':
                    try:
                        detail = Detail().get_detail(shop_id)
                        # NOTE(review): if `detail` is a dict this joins its
                        # keys rather than its values - confirm intent.
                        print('\n' + ','.join(detail) + '\n')
                        self.saver.save_data([detail], 'detail')
                    except Exception:
                        # Flag the detail fetch as failed.
                        one_step_search_res[-2] = 0
                        logger.warning('详情信息获取失败,失败id:' + shop_id)
                        print('\n' + ','.join(map(str, one_step_search_res))
                              + '\n')
                        if self.jump_wait is False:
                            print(
                                '检查浏览器,处理验证码,输入y程序继续运行,输入n跳过检查',
                                'http://www.dianping.com/shop/' + str(shop_id))
                            # Read the answer once; calling input() per branch
                            # made a single 'n' get consumed and ignored.
                            answer = input()
                            if answer == 'y':
                                continue
                            elif answer == 'n':
                                self.jump_wait = True
                else:
                    print(row_text)
                # Review page.
                if self.need_comment == '1':
                    try:
                        review = Review().get_review(shop_id)
                        print('获取', name, '评论', len(review), '条')
                        self.saver.save_data(review, 'review')
                    except Exception:
                        # Flag the review fetch as failed.
                        one_step_search_res[-1] = 0
                        logger.warning('评论获取失败,失败id:' + shop_id)

                # Persist the search row (including the flags set above).
                self.saver.save_data([one_step_search_res], 'search')
        logger.info('解析完成:' + key_word)
예제 #3
0
    def search(self, search_url, request_type='proxy, cookie'):
        """
        Fetch one search-result page and parse every listed shop.

        :param search_url: full URL of the search-result page to fetch
        :param request_type: forwarded to ``requests_util.get_requests``
        :return: list with one dict per ``li`` of the first ``.shop-list``
            element; fields that cannot be parsed hold ``'-'`` (the average
            price falls back to a zero-price string instead)
        :raises IndexError: when the page contains no ``.shop-list`` element
        """

        def first(node, *selectors):
            """Follow ``selectors`` from ``node``, taking the first match each time."""
            for selector in selectors:
                node = node.select(selector)[0]
            return node

        def attempt(extract, default='-'):
            """Return ``extract()``, or ``default`` when the markup is missing."""
            try:
                return extract()
            except Exception:
                return default

        def parse_shop(shop):
            """Turn one result ``li`` into a dict of display fields."""
            image_path = attempt(
                lambda: first(shop, '.pic', 'a', 'img')['src'])
            shop_id = attempt(
                lambda: first(shop, '.txt', '.tit', 'a')['data-shopid'])
            detail_url = attempt(
                lambda: first(shop, '.txt', '.tit', 'a')['href'])
            name = attempt(
                lambda: first(shop, '.txt', '.tit', 'a').text.strip())
            # Two rating layouts exist: an icon whose second CSS class ends in
            # "_<number>" (divided by 10 here), or a plain numeric score.
            try:
                icon_class = first(
                    shop, '.txt', '.comment', '.star_icon', 'span')['class'][1]
                star_point = str(float(icon_class.split('_')[1]) / 10)
            except Exception:
                star_point = '-'
            try:
                # If the node exists but its text is not numeric, the raw text
                # is kept, exactly as the previous implementation did.
                star_point = first(shop, '.txt', '.comment', '.star_score').text
                star_point = str(float(star_point))
            except Exception:
                pass
            review_number = attempt(
                lambda: first(shop, '.txt', '.comment',
                              '.review-num').text.replace('\n', ''))
            mean_price = attempt(
                lambda: first(shop, '.txt', '.comment', '.mean-price',
                              'b').text,
                '¥0')
            try:
                tags = first(shop, '.txt', '.tag-addr').select('.tag')
                tag1 = tags[0].text.replace('\n', ' ').strip()
                tag2 = tags[1].text.replace('\n', ' ').strip()
            except Exception:
                # Both tags are reset when either one is missing.
                tag1 = '-'
                tag2 = '-'
            addr = attempt(
                lambda: first(shop, '.txt', '.tag-addr',
                              '.addr').text.replace('\n', ' ').strip())
            recommend = attempt(
                lambda: first(shop, '.recommend').text.replace(
                    '\n', ' ').strip())
            comment_list = attempt(
                lambda: first(shop, '.comment-list').text.replace(
                    '\n', ' ').strip())
            return {
                '店铺id': shop_id,
                '店铺名': name,
                '评论个数': review_number,
                '人均价格': mean_price,
                '标签1': tag1,
                '标签2': tag2,
                '店铺地址': addr,
                '详情链接': detail_url,
                '图片链接': image_path,
                '详细评分': comment_list,
                '推荐菜': recommend,
                '店铺均分': star_point,
            }

        r = requests_util.get_requests(search_url, request_type=request_type)
        text = r.text
        # Fetch the mapping for the obfuscated text, then substitute it.
        file_map = get_search_map_file(text)
        text = requests_util.replace_search_html(text, file_map)

        html = BeautifulSoup(text, 'lxml')
        shop_all_list = html.select('.shop-list')[0].select('li')
        return [parse_shop(shop) for shop in shop_all_list]