Example #1
0
def get_data(url):
    """Crawl a Dianping shop-list page and persist each shop's data.

    :param url: shop-list page URL to fetch
    :return: None; each parsed shop is saved via ``DBUtils.save_to_db``.
             Returns early if a shop's comment count decodes to 0.
    """
    try:
        # Fetch the raw HTML for the page (with cookies and proxy).
        con = CrawlerUtils.get_html(url,
                                    headers=CrawlerUtils.get_headers(COOKIES),
                                    proxies=proxies)
        # Digits and text use different obfuscation class prefixes, so each
        # needs its own pattern to recover its CSS tag.
        num_re_compile = "<b><span class=\"(.*?)\"></span>"  # digit-class regex
        str_re_compile = '<span class="addr"><span class=\"(.*?)\"></span>'  # address-class regex
        # URL of the stylesheet that maps obfuscated classes to glyph offsets.
        css_url = get_css(con)
        # Class prefix used for obfuscated digits.
        num_svg_tag = get_tag(re.findall(num_re_compile, con))
        # Class prefix used for obfuscated text.
        str_svg_tag = get_tag(re.findall(str_re_compile, con))
        # Map CSS class name -> pixel offset inside the glyph sheet.
        css_and_px_dict = get_css_and_px_dict(css_url)
        # Parse the page once into an lxml Element tree.
        doc = etree.HTML(con)
        # All shop <li> entries on the listing page.
        shops = doc.xpath('//div[@id="shop-all-list"]/ul/li')
        for shop in shops[1:]:
            # Shop id.
            shop_id = shop.xpath(
                './/div[@class="tit"]/a')[0].attrib["data-shopid"]
            # Fix: the old call passed shop_id as a %-format argument with no
            # placeholder, so it was never rendered in the log message.
            logger.info("shop_id: %s", shop_id)
            # Shop name. lxml attribute values are already str — the previous
            # encode/decode round-trip was a no-op and has been removed.
            shop_name = shop.xpath('.//div[@class="tit"]/a')[0].attrib["title"]

            comment_num = 0
            per_capita_price = taste = service = environment = 0

            # Star rating is encoded in the class suffix, e.g. "sml-str45" -> 45.
            star_num = int(
                str(
                    shop.xpath('.//div[@class="comment"]/span')
                    [0].attrib["class"]).split("sml-str")[-1])
            # Total review count.
            comment_and_price_datas = shop.xpath('.//div[@class="comment"]')
            for comment_and_price_data in comment_and_price_datas:
                _comment_data = comment_and_price_data.xpath(
                    'a[@class="review-num"]/b/node()')
                comment_num = get_last_value(css_url, _comment_data,
                                             css_and_px_dict, num_svg_tag)
                comment_num = int(comment_num) if comment_num else 0
                if comment_num == 0:
                    # NOTE(review): this aborts the whole page, not just this
                    # shop — confirm that is intended rather than `continue`.
                    return

                # Average price per person; first char is the currency symbol.
                _price = comment_and_price_data.xpath(
                    'a[@class="mean-price"]/b/node()')
                per_capita_price = get_last_value(css_url, _price,
                                                  css_and_px_dict, num_svg_tag)
                per_capita_price = int(
                    per_capita_price[1:]) if per_capita_price else 0

            # Taste / service / environment scores.
            others_num_node = shop.xpath('.//span[@class="comment-list"]/span')
            for others_datas in others_num_node:
                if others_datas.xpath('text()') and others_datas.xpath(
                        'text()')[0] == u"口味":
                    _taste_data = others_datas.xpath('b/node()')
                    taste = get_last_value(css_url, _taste_data,
                                           css_and_px_dict, num_svg_tag)
                    taste = float(taste) if taste else 0

                if others_datas.xpath('text()') and others_datas.xpath(
                        'text()')[0] == u"服务":
                    _taste_data = others_datas.xpath('b/node()')
                    service = get_last_value(css_url, _taste_data,
                                             css_and_px_dict, num_svg_tag)
                    service = float(service) if service else 0

                if others_datas.xpath('text()') and others_datas.xpath(
                        'text()')[0] == u"环境":
                    _taste_data = others_datas.xpath('b/node()')
                    environment = get_last_value(css_url, _taste_data,
                                                 css_and_px_dict, num_svg_tag)
                    environment = float(environment) if environment else 0

            # Address (text, so it uses the address obfuscation tag).
            _ress = shop.xpath(
                './/div[@class="tag-addr"]/span[@class="addr"]/node()')
            address = get_last_value(css_url,
                                     _ress,
                                     css_and_px_dict,
                                     str_svg_tag,
                                     is_num=False)

            data = {
                'shop_id': shop_id,
                'shop_name': shop_name,
                'address': address,
                'per_capita_price': per_capita_price,
                'total_number_comments': comment_num,
                'stars': int(star_num),
                'taste': taste,
                'surroundings': environment,
                'serve': service,
            }
            logger.info('开始保存数据: %s' % (data, ))
            # Persist the shop record.
            DBUtils.save_to_db(Shop, data)

    except Exception as e:
        logger.error(e)
Example #2
0
def spiderDazhong(shop_id, now_page):
    """Crawl one page of reviews for a Dianping shop and persist them.

    :param shop_id: numeric shop identifier used to build the review URL
    :param now_page: 1-based page number of the review listing
    :return: True if crawling should continue (next page exists, or the
             request was blocked and must be retried with a new proxy);
             False when no next-page link is found; None if the page body
             could not be parsed.
    """
    global page
    global proxies
    try:
        url = "http://www.dianping.com/shop/" + str(
            shop_id) + "/review_all/p" + str(now_page)
        print("url==========", url)
        req = requests.get(url, headers=headers)  # , proxies=proxies
        # A "div#log" element means we hit the anti-crawler page: rotate the
        # proxy, rewind the page counter and signal the caller to retry.
        if CrawlerUtils.get_bs(req.text, 'div#log'):
            proxies = CrawlerUtils.get_proxies(api_url)
            page -= 1
            return True
        # Stylesheet mapping obfuscated span classes to glyphs.
        css_url = CrawlerUtils.get_css(req.content.decode('utf-8'))
        str_re_compile = '<span class=\"(.*?)\"></span>'
        addr_tag = CrawlerUtils.get_bs(req.text, 'div.review-truncated-words')

        if addr_tag != []:
            addr_tag = str(addr_tag[0])
        else:
            # Fall back to the non-truncated review container.
            addr_tag = str(
                CrawlerUtils.get_bs(req.text, 'div.review-words')[0])

        _tag = CrawlerUtils.get_tag(re.findall(str_re_compile, addr_tag))
        css_and_px_dict = CrawlerUtils.get_css_and_px_dict(css_url)

        doc = pq(req.text)
        if doc:
            # Perf fix: parse the response into an lxml tree ONCE here.
            # The old code re-parsed the full page with etree.HTML() up to
            # twice per review inside the loop below.
            html_tree = etree.HTML(req.content.decode('utf-8'))
            # Review <li> items on this page.
            pinglunLi = doc("div.reviews-items > ul > li").items()
            li_num = 1
            for data in pinglunLi:
                userName = data(
                    "div.main-review > div.dper-info > a").text()  # reviewer name
                shopName = data(
                    "div.main-review > div.misc-info.clearfix > span.shop"
                ).text()

                userID = ''
                user_id_link = data(
                    "div.main-review > div.dper-info > a").attr("href")
                if user_id_link is not None:
                    userID = "http://www.dianping.com" + user_id_link

                # Star rating is encoded in the span's class, e.g.
                # "sml-rank-stars sml-str40 star" -> "40".
                start_sapn = data(
                    "div.review-rank > span.sml-rank-stars.sml-str10.star")
                if start_sapn:
                    startShop = str(
                        start_sapn.attr("class")).split(" ")[1].replace(
                            "sml-str", "")
                else:
                    startShop = 0

                describeShop = data("div.review-rank > span.score").text()
                # Full review body lives in the hidden variant when present.
                msg_values = html_tree.xpath(
                    '//div[@class="reviews-items"]/ul/li[' + str(li_num) +
                    ']/div[@class="main-review"]/div[@class="review-words Hide"]/node()'
                )
                if msg_values == []:
                    msg_values = html_tree.xpath(
                        '//div[@class="reviews-items"]/ul/li[' + str(li_num) +
                        ']/div[@class="main-review"]/div[@class="review-words"]/node()'
                    )
                # Decode the obfuscated review text.
                pinglunShop = CrawlerUtils.get_last_value(css_url,
                                                          msg_values,
                                                          css_and_px_dict,
                                                          _tag,
                                                          is_num=False)
                if pinglunShop == '':
                    continue
                timeShop = data(
                    "div.main-review > div.misc-info.clearfix > span.time"
                ).text()
                # Renamed from `Shop`: that name shadowed the module-level
                # Shop model class used elsewhere in this file.
                reviewed_shop = data(
                    "div.main-review > div.misc-info.clearfix > span.shop"
                ).text()
                imgShop = data(
                    "div > div.review-pictures > ul > li> a").items()
                imgList = []
                for img in imgShop:
                    imgList.append("http://www.dianping.com" +
                                   img.attr("href"))
                vip_span = data("div.main-review > div.dper-info > span")
                is_vip = 1 if vip_span.attr("class") == "vip" else 0
                recommend = data(
                    "div.main-review > div.review-recommend > a").text()
                print("===========", pinglunShop)
                # Dedup key: shop + review text + timestamp.
                uuid = CrawlerUtils.get_md5_value(
                    str(shop_id) + pinglunShop + str(timeShop))

                print(", 用户名:userName: {}\n, "
                      "商铺名称:shoName: {}\n, "
                      "用户ID链接:userID: {}\n, "
                      "评定星级:startShop: {}\n, "
                      "评论描述:describeShop : {}\n, "
                      "星级评分:startShop: {}\n, "
                      "评论详情:pinglunShop:{}\n, "
                      "评论时间:timeShop :{}\n, "
                      "评论商铺:Shop:{}\n, "
                      "评论图片:imgList:{}\n, "
                      "推荐菜品:recommend:{}\n, "
                      "是否vip:is_vip:{} "
                      "\n ".format(userName, shopName, userID, startShop,
                                   describeShop, startShop, pinglunShop,
                                   timeShop, reviewed_shop, imgList, recommend,
                                   is_vip))

                comment_record = {
                    'uuid': uuid,
                    'shop_id': shop_id,
                    'username': userName,
                    'shop_name': shopName,
                    'is_vip': is_vip,
                    'comment': pinglunShop,
                    'recommend': recommend,
                    'stars': startShop,
                    'comment_create_time': timeShop,
                }
                DBUtils.save_to_db(Comment, comment_record)
                li_num += 1
                # Throttle between reviews to reduce ban risk.
                sleep(1)
            # Stop when there is no next-page link.
            if not CrawlerUtils.get_tags(req.text, '.NextPage'):
                return False

            return True

    except Exception:
        # Bare `raise` preserves the original traceback; `raise e` re-raised
        # from this line and obscured where the failure actually occurred.
        raise