Example #1
def JinDongCommentCases(url_list):
    prod_url = url_list[2]
    prod_topic = url_list[1]
    prod_id = re.findall(r'(\d+)', prod_url)[0]
    print("Fetching comments for product " + prod_topic + ':' + str(prod_id))

    # Collection for the scraped comment records
    rates_list = []

    # Comment endpoints (first page)
    rate_urls = [
        # Default comment API
        'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv44&productId={0}&score=0&sortType=6&page={1}&pageSize=10&isShadowSku=0&fold=1',
        # Folded-comment API (of limited value)
        'https://club.jd.com/comment/getProductPageFoldComments.action?callback=jQuery1719501&productId={0}&score=4&sortType=5&page={1}&pageSize=5&_=1573096406813'
    ]
    ]

    # 1. Build the request
    # Request headers
    headers = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'zh-CN,zh;q=0.9',
        "cookie":
        '__jdu=1150543271; shshshfpa=0cb162de-cb82-21b8-49a7-7e1fd26a3efd-1570864191; user-key=d5809892-c823-402e-9748-c84b2469d56f; cn=0; shshshfpb=eTsoprn6f4hkN00S8LggPuQ%3D%3D; unpl=V2_ZzNtbRYAS0Z8WkQAehlVB2JQRl0SUUcVd1oTAC8YVFIyV0BYclRCFX0URlVnG10UZwYZWEtcRx1FCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWARjBhBeRFdzJXI4dmR%2bG1gDbwIiXHJWc1chVEVSexlcDSoDEllDU0YXdg5GZHopXw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_ef9b8c3e01834be1a7513cdee09fdec4|1572418139698; shshshfp=4ecb84897eabb0f7a4c6348b7cdc7d0a; __jda=122270672.1150543271.1570864187.1572825530.1573090824.9; __jdc=122270672; areaId=12; ipLoc-djd=12-984-3384-0; wlfstk_smdl=gcda47s1yytkclehvxho46m7ddz5g7ow; TrackID=1KNUUCIn3e7IMNektPzhbcu7wSO0kDr7PEe_KWvFCOXkJh4Zo6p9lf8KOj5iwp4Yidll4C9iAu7fQF6LVOjeB1LGNsaTdxOTqpshIt79InXGwUBG-R8JW8h4lpF-aMXFlBoc7nuE4YFFi_IXSENLUoA; thor=F5548B286F0AC84835F479E2098B937588592D856D78425D7FC38CD7238081AFCBA255023DFA3D8E13AF80EB0481FBDF4DA6C1A35102B43FEA63A3914094409E2250E5F462224217F1004694F9EC7CF2DA417BF181A528377DE99BED15AD4C25157B03BD7C98D6058B3B22E3F300B51E9F9A64987B3D551B14DCFF630D20CCBF954CBC1087415F2C2203531C10B881874F74CD45F930D0F4802E5F203320EEDE; pinId=eqbOg6AqvNqT4t6ZRIp7VrV9-x-f3wj7; pin=jd_5580681fb886d; unick=jd_181685ayj; ceshi3.com=103; _tp=OQVsjG6Pu5TIXKleFObW0uc7fxOqC8rImaa7i%2FLjfqM%3D; _pst=jd_5580681fb886d; shshshsID=d4ef035cd6502b3e3bbb5e5859bb09c1_2_1573090894262; __jdb=122270672.4.1150543271|9.1573090824; 3AB9D23F7A4B3C9B=4WQN5JCPKTD4EYGF7GGHYDUIBN64EH5SZHPCNA56CB2G7HP52UGN73YBUMQ2EOMZI4WXVSWB3CSTQT2KOLQIVGGV5A; JSESSIONID=99B9C173D8D05BABCE00F2429A497E26.s1',
        "referer":
        "{0}".format(prod_url),
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400"
    }
    response = requests.get(rate_urls[1].format(prod_id, 0), headers=headers)
    rates_jsons = json.loads(re.findall('{.*}', response.text)[0])
    sleep(3)
    # Total number of comments (used below to derive the page count)
    pages = rates_jsons['productCommentSummary']['commentCount']
    print("===============================")
    for rate_url in rate_urls:
        for page in range(0, int(pages / 10 + 1)):
            print("总共" + str(pages) + "条评论,正在获取第" + str(page) + "页")
            sleep(3)
            try:
                rates_responses = requests.get(rate_url.format(prod_id, page),
                                               headers=headers)
                rates = json.loads(re.findall('{.*}', rates_responses.text)[0])
            except Exception:
                print("No data")
                break

            rates_lists = rates['comments']
            for rate_list in rates_lists:
                rate = delSpecialChars(rate_list['content'])
                prod_color = rate_list['productColor']
                prod_name = rate_list['referenceName']
                rate_score = rate_list['score']
                rate_dict = {
                    'add_time': now_time,
                    'prod_name': prod_name,
                    'rate_score': rate_score,
                    'rate': rate,
                    'prod_color': prod_color,
                    'prod_url': prod_url,
                    'prod_topic': prod_topic,
                    'prod_id': prod_id,
                    'sale_num': str(pages)
                }
                print(rate_dict)
                rates_list.append(rate_dict)
            if not rates_lists:
                break

    # Save the results to a CSV file
    fileUtils().saveAsCsv(rates_list, './Data/Rates/{0}'.format(str(prod_id)))
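
A minimal usage sketch for the function above, assuming the module-level helpers it relies on (requests, re, json, sleep, now_time, delSpecialChars, fileUtils) are already imported; the url_list layout (topic at index 1, product URL at index 2) is inferred from the unpacking at the top of the function, and the JD link is only a placeholder:

# Hypothetical usage sketch for JinDongCommentCases; the product URL below is
# a placeholder, not a real item.
sample_url_list = [
    '0',                                       # index 0 is not used by the function
    '雅迪电动车',                               # prod_topic (index 1)
    'https://item.jd.com/100000000000.html',   # prod_url   (index 2)
]
JinDongCommentCases(sample_url_list)
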
Example #2


if __name__ == '__main__':
    # Search keywords (e-bike brand names)
    queryKeys = [
        '小牛', '立马', '新大洲', '新蕾', '金箭', '小刀', '台铃', '倍特', '杰宝大王', '绿佳', '绿驹',
        '玉骑铃', '比德文', '雅迪', '爱玛', '绿源', '新日'
    ]
    for queryKey in queryKeys:
        stores_list = []
        BaiDuStoreInfo(queryKey, stores_list)
        # Save the results to a CSV file
        fileUtils().saveAsCsv(stores_list,
                              './Data/Stores/{0}'.format(str2pinyon(queryKey)))
        # print(stores_list)

    # Create the MySQL table (schema inferred from one CSV)
    resData = pd.read_csv('./Data/Stores/xiaodao.csv', encoding='utf-8')
    resData = resData.astype(object).where(pd.notnull(resData), None)
    createTable(resData, 'spider', 'bd_store_info', '232')

    # Load the CSV files into MySQL
    file_addr = './Data/Stores'
    save_to_mysql(file_addr, 'spider', 'bd_store_info', '232')
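
The astype(object).where(pd.notnull(...), None) step above converts pandas NaN cells into Python None so a database driver can write SQL NULL instead of the string 'nan'. A self-contained sketch of just that idiom, using a made-up frame:

import pandas as pd

# Hypothetical sample frame with missing values, standing in for a CSV read.
df = pd.DataFrame({'store_name': ['A store', None], 'lat': [31.2, None]})
# Cast to object so the fill value can be None, then replace every NaN.
df = df.astype(object).where(pd.notnull(df), None)
print(df.iloc[1]['lat'] is None)  # True
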
Example #3
def BaiDuStoreInfo(queryKey, stores_list):
    url = 'https://map.baidu.com'
    option = webdriver.ChromeOptions()
    # option.add_argument('headless')
    # Replace with a chromedriver binary that matches your OS
    driver = webdriver.Chrome(executable_path='./chromedriver.exe',
                              chrome_options=option)

    # Open the site
    driver.get(url)

    citys = fileUtils().getCsvFile('./Data/Baidu_cityCode.csv')
    for city in citys:
        querycity = city[1]
        # Type the query into the search box
        timeout = 5
        search_content = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath('//input[@id="sole-input"]'))
        search_content.send_keys('{0}'.format(querycity + queryKey + '电动车'))
        sleep(2)

        # Click the "Search" button
        search_button = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath('//button[@id="search-button"]'))
        search_button.click()
        sleep(2)
        xpath_date = etree.HTML(driver.page_source)
        count_num = 1
        # Total number of search results
        try:
            count_num = re.findall(r'共找到(\d+)个搜索结果', driver.page_source)[0]
            print(querycity + ': found {0} results'.format(count_num))
        except IndexError:
            print("No search results, moving to the next city")
            try:
                # Click to clear the search box
                search_button = WebDriverWait(
                    driver, timeout
                ).until(lambda d: d.find_element_by_xpath(
                    '//div[@class="searchbox-content-button right-button loading-button cancel-button"]'
                ))
                search_button.click()
            except Exception:
                print("No clear button found")
            continue

        # Page through the results to collect every store
        for i in range(0, int(float(count_num) / 10) + 1):
            sleep(2)
            xpath_date = etree.HTML(driver.page_source)
            print("开始获取" + queryKey + "==>" + querycity + "第" + str(i + 1) +
                  "页数据")
            li_lists = xpath_date.xpath('//ul[@class="poilist"]/li')
            for li_list in li_lists:
                # Fields for this store
                store_address = ''
                store_name = ''
                lon = ''
                lat = ''
                try:
                    store_address = li_list.xpath(
                        './div[@class="cf"]//div[@class="row addr"]/span[@class="n-grey"]/text()'
                    )[0]
                    store_name = li_list.xpath(
                        './div[@class="cf"]//div[@class="row"]/span[1]/a/text()'
                    )[0]
                    # Call the Baidu geocoding API to get longitude/latitude
                    response = requests.get(
                        'http://api.map.baidu.com/geocoding/v3/?address={0}&output=json&ak=K2WGZeDWlluoHpEpt5qo5Sx6VNyvffLB&callback=showLocation&city={1}'
                        .format(store_address, querycity)).text
                    lon_lat = (json.loads(re.findall(
                        '{.*}', response)[0])['result']['location'])
                    lon = lon_lat['lng']
                    lat = lon_lat['lat']
                except Exception:
                    print("Bad data or failed to resolve coordinates")
                    continue
                store_dict = {
                    'topic': queryKey,
                    'store_address': store_address,
                    'store_name': store_name,
                    'lon': lon,
                    'lat': lat,
                    'add_time': now_time
                }
                print(store_dict)
                stores_list.append(store_dict)

            # Click "Next page"
            # xpath_date = etree.HTML(driver.page_source)
            try:
                search_button = WebDriverWait(
                    driver, timeout).until(lambda d: d.find_element_by_xpath(
                        '//a[@tid="toNextPage"]'))
                search_button.click()
            except Exception:
                print('No next page, exiting the loop')
                break

        # Clear the search box before the next city
        search_button = WebDriverWait(
            driver, timeout
        ).until(lambda d: d.find_element_by_xpath(
            '//div[@class="searchbox-content-button right-button loading-button cancel-button"]'
        ))
        search_button.click()
        sleep(2)
        print()
    driver.close()
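
Several of these snippets (the JD comment API, the Baidu geocoder, the Tmall rate API) return JSONP, and all of them strip the callback wrapper with re.findall('{.*}', text)[0] before json.loads. A minimal, self-contained sketch of that pattern, using a made-up JSONP payload:

import json
import re

# Hypothetical JSONP response: a callback name wrapping a JSON object.
jsonp_text = 'showLocation&&showLocation({"status": 0, "result": {"location": {"lng": 120.1, "lat": 30.2}}})'
# '{.*}' greedily grabs everything from the first '{' to the last '}',
# which is the JSON body as long as the payload holds a single object.
body = re.findall('{.*}', jsonp_text)[0]
location = json.loads(body)['result']['location']
print(location['lng'], location['lat'])  # 120.1 30.2
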
Example #4
def TianMaoCommentCases(index, url_list):
    # Collection for the scraped comment records
    rate_list = []

    # First page of the comment URL
    rate_url = url_list[4]
    # print(rate_url)
    refer_url = url_list[3]
    # print(refer_url)
    topic = url_list[0]
    store_name = url_list[1]
    shop_title = url_list[2]

    # Pick a random cookie
    cookie_list = fileUtils().getCsvFile('../Data/Cookie.csv')
    cookie = random.choice(cookie_list)[0]
    # Pick a random User-Agent
    ua_list = fileUtils().getCsvFile('../Data/UserAgent.csv')
    ua = random.choice(ua_list)[0]

    # 1. Build the request
    # Request headers
    headers1 = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": "{0}".format(cookie),
        "referer": "{0}".format(refer_url),
        "user-agent": "{0}".format(ua)
    }

    print("睡眠15s,开始获取评论信息")
    sleep(15)

    # Fetch the product comments
    response = requests.get(rate_url, headers=headers1)
    data = response.text
    comment_datas = re.findall('{.*}', data)[0]
    print(comment_datas)

    # 2. Total comment count and number of pages to request
    comment_count = json.loads(
        comment_datas)['rateDetail']['rateCount']['total']
    comment_pages = int(int(comment_count) / 20) + 2
    print("总评论数:" + str(comment_count) + "===总页数:" + str(comment_pages) +
          "===等待5s")
    sleep(12)

    # Fetch the average rating
    avgrate_url = rate_url.replace('rate.tmall.com','dsr-rate.tmall.com')\
        .replace('list_detail_rate.htm','list_dsr_info.htm')\
        .replace('&order=3&currentPage=1','')
    avg_response = requests.get(avgrate_url, headers=headers1).text
    avg_rate = json.loads(re.findall('{.*}',
                                     avg_response)[0])['dsr']['gradeAvg']
    print("该商品平均评分:" + str(avg_rate) + "分,等待15s")
    sleep(12)

    # 3. Loop over the comment pages
    for comment_page in range(1, comment_pages):
        # Random wait between requests
        sleep_list = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
        sleep_time = random.choice(sleep_list)
        print('==== Sleeping ' + str(sleep_time) + 's, ' + str(comment_pages) +
              ' pages in total, fetching page ' + str(comment_page) + ' ====')
        sleep(sleep_time)
        # URL for this page
        comment_url = rate_url.replace('currentPage=1',
                                       'currentPage={0}').format(comment_page)
        # Pick a random cookie
        cookie_list = fileUtils().getCsvFile('../Data/Cookie.csv')
        cookie = random.choice(cookie_list)[0]
        # Pick a random User-Agent
        ua_list = fileUtils().getCsvFile('../Data/UserAgent.csv')
        ua = random.choice(ua_list)[0]
        # Request headers
        headers2 = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "{0}".format(cookie),
            "referer": "{0}".format(refer_url),
            "user-agent": "{0}".format(ua)
        }
        print(headers2)

        # Comment payload
        comment_data = re.findall(
            '{.*}',
            requests.get(comment_url, headers=headers2).text)[0]
        print(comment_data)
        try:
            comment_json = json.loads(comment_data)['rateDetail']['rateList']
            for rate in comment_json:
                rateDate = rate['rateDate']
                rateContent = rate['rateContent']
                auctionSku = rate['auctionSku']
                cmsSource = rate['cmsSource']
                # Skip the placeholder Tmall inserts when the buyer left no review
                if (rateContent != '此用户没有填写评论!'):
                    rate_list.append({
                        'rate_date': rateDate,
                        'rate_content': rateContent,
                        'auction_sku': auctionSku,
                        'cms_source': cmsSource,
                        'avg_rate': avg_rate,
                        'topic': topic,
                        'store_name': store_name,
                        'shop_title': shop_title,
                        'add_date': now_time
                    })
                    print(rateContent)
        except Exception:
            print("Error: page content not found")
        finally:
            print("Continuing")

        print('===========================================')
    # print(rate_list)
    # Save the results to a CSV file
    fileUtils().saveAsCsv(rate_list,
                          './Data/Rates/{0}'.format(str(int(index) + 1)))
    sleep(10)
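
The average-rating request above is derived from the detail-rate URL purely by string replacement; a small sketch of that rewrite, using a made-up but structurally representative Tmall rate URL:

# Hypothetical rate URL; only the host, path and the trailing parameters matter here.
rate_url = ('https://rate.tmall.com/list_detail_rate.htm'
            '?itemId=1&sellerId=2&order=3&currentPage=1')
avgrate_url = rate_url.replace('rate.tmall.com', 'dsr-rate.tmall.com') \
    .replace('list_detail_rate.htm', 'list_dsr_info.htm') \
    .replace('&order=3&currentPage=1', '')
print(avgrate_url)
# https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=1&sellerId=2
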
Example #5


if __name__ == '__main__':

    # Load the product URLs and comment URLs
    url_lists = fileUtils().getCsvFile('./Data/store_url.csv')

    # Scrape product comments
    # for index in range(143, 169):
    #
    #     url_list = url_lists[index]
    #     print(url_list)
    #     # Fetch Tmall comments
    #     TianMaoCommentCases(index, url_list)

    file_addr = './Data/Rates'
    save_to_mysql(file_addr, 'spider', 'pt_tm_ec_rates_info')
                if (int(prod_price) >= 1000):
                    product_list.append(prod_dict)
            except ValueError:
                print('Product does not meet the criteria')
            print("Product info: " + str(prod_dict))

    # Close the driver
    driver.quit()
    return product_list


if __name__ == '__main__':
    car_names = [
        '雅迪电动车', '新日电动车', '小牛电动车', '绿源电动车', '小刀电动车', '台铃电动车', '比德文电动车',
        '立马电动车', '新大洲电动车', '杰宝大王电动车'
    ]
    car_names = ['电动车']
    # Scrape product data
    for car_name in car_names:
        prod_results = TMCarsInfo(car_name)
        fileUtils().saveAsCsv(prod_results,
                              './Data/Product/{0}'.format(car_name))

    # # Create the table
    # resData = pd.read_csv('./Data/Product/台铃电动车.csv',encoding='utf-8')
    # resData = resData.astype(object).where(pd.notnull(resData), None)
    # createTable(resData,'spider','pt_tm_ec_products_info',154)
    #
    # # Save the data
    # file_addr = './Data/Product'
    # save_to_mysql(file_addr,'spider','pt_tm_ec_products_info',154)
def TMProductInfos(topic_list):
    option = webdriver.ChromeOptions()
    option.add_argument('--proxy-server=127.0.0.1:8080')
    # Reduce the chance of being flagged as an automated browser
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Skip loading images to speed up page loads
    option.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    # option.add_argument('headless')

    # Replace with a chromedriver binary that matches your OS
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Explicit-wait timeout (seconds) used by the WebDriverWait calls below
    timeout = 5
    # Log in to Taobao
    login_url = 'https://login.taobao.com/member/login.jhtml'
    driver.get(login_url)
    # Wait for the password-login option to appear
    password_login = driver.find_element_by_xpath(
        '//div[@class="login-links"]/a[@class="forget-pwd J_Quick2Static"]')
    password_login.click()
    # Wait for the Weibo-login option to appear
    weibo_login = driver.find_element_by_xpath('//a[@class="weibo-login"]')
    weibo_login.click()
    # Wait for the Weibo username field to appear
    weibo_user = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[2]/div/input'))
    weibo_user.send_keys('18168546559')
    sleep(1)
    # Wait for the Weibo password field to appear
    weibo_pwd = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[3]/div/input'))
    weibo_pwd.send_keys('zj123!')
    # Wait for the login button to appear
    submit = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[7]/div[1]/a/span'))
    submit.click()
    sleep(10)

    for topic in topic_list:
        print("=====开始爬取" + topic + "商品信息=====")
        #产品信息集合
        products_lists = []
        # Open the search home page (switch to Tmall)
        driver.get('https://www.tmall.com/')
        # Search on Tmall
        search_input = WebDriverWait(
            driver, timeout
        ).until(lambda d: d.find_element_by_xpath(
            '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/div/div/input'
        ))
        search_input.send_keys(topic + '电动车')
        submit = WebDriverWait(
            driver, timeout
        ).until(lambda d: d.find_element_by_xpath(
            '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/button'
        ))
        submit.click()
        sleep(2)
        # Parse the rendered page source
        xpath_date = etree.HTML(driver.page_source)
        # Total number of result pages for the current search
        page_sum = xpath_date.xpath(
            '//div[@class="ui-page-wrap"]/b[2]/form/input[3]/@value')[0]
        print("=====当前搜索总共" + page_sum + "页=====")
        # Collect data from every page
        for index in range(0, int(page_sum)):
            print("=====开始获取第" + str(index + 1) + "页商品数据=====")
            # 获取每个商品的xpath集合
            xpath_dates = etree.HTML(driver.page_source)
            product_lists = xpath_dates.xpath('//div[@id="J_ItemList"]/div')
            option.add_argument('headless')
            driver2 = webdriver.Chrome(
                executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
                chrome_options=option)
            # Extract the attributes of each product
            for product_list in product_lists:
                # Product attributes
                product_price = product_list.xpath(
                    './div/p[@class="productPrice"]/em/@title')[0]
                product_price = product_price[:-(
                    (product_price.index('.')) - 1)].replace('.', '')
                if (product_price == ''):
                    print("Empty price, not a complete vehicle, skipping")
                    continue
                if (int(product_price) < 1000):
                    print("Price below 1000, not a complete vehicle, skipping")
                    continue
                product_title = product_list.xpath(
                    './div/p[@class="productTitle"]/a/@title')[0]
                product_url = 'http:' + product_list.xpath(
                    './div/p[@class="productTitle"]/a/@href')[0]
                product_id = re.findall(
                    r'//detail.tmall.com/item.htm\?id=(\d+)&', product_url)[0]
                shop_url = 'http:' + product_list.xpath(
                    './div/div[@class="productShop"]/a[@class="productShop-name"]/@href'
                )[0]
                shop_id = re.findall(r'user_number_id=(\d+)&', shop_url)[0]
                shop_name = product_list.xpath(
                    './div/p[@class="productStatus"]/span[3]/@data-nick')[0]
                month_sale = product_list.xpath(
                    './div/p[@class="productStatus"]/span/em/text()')[0]
                comment_sum = product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/text()')[0]
                comment_url = 'https:' + product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/@href')[0]

                # Open the product URL to get detailed info
                driver2.get(product_url)
                selector = etree.HTML(driver2.page_source)
                shop_items = selector.xpath('//*[@id="J_AttrUL"]/li')
                # Detailed product spec description
                shop_list = []
                for shop_item in shop_items:
                    shop_list.append(
                        delSpecialChars(shop_item.xpath('./text()')[0]))
                # Shop rating
                score_list = []
                scores = selector.xpath('//*[@id="shop-info"]/div[2]/div')
                for score in scores:
                    res_score = delSpecialChars(
                        score.xpath('./div[1]/text()')[0]) + delSpecialChars(
                            score.xpath('./div[2]/span/text()')[0])
                    score_list.append(res_score)

                # Assemble the product record
                product_dict = {
                    'product_title': product_title,
                    'product_id': product_id,
                    'product_url': product_url,
                    'product_price': product_price,
                    'month_sale': month_sale,
                    "shop_name": shop_name,
                    "shop_url": shop_url,
                    "shop_id": shop_id,
                    "comment_sum": comment_sum,
                    'add_time': add_time,
                    'topic': topic,
                    'shop_list': str(shop_list),
                    'score_list': str(score_list),
                    'comment_url': comment_url
                }
                print(product_dict)
                products_lists.append(product_dict)
                # sleep(1)
            try:
                # Go to the next page
                next_button = WebDriverWait(
                    driver, timeout
                ).until(lambda d: d.find_element_by_xpath(
                    '//div[@class="ui-page"]/div/b/a[@class="ui-page-next"]'))
                next_button.click()
            except Exception:
                print("No next page, exiting the loop")
                break
            sleep(3)
            driver2.quit()
        fileUtils().saveAsCsv(products_lists,
                              './Data/Product/{0}'.format(topic))
    driver.quit()
Example #8
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author:Aliex ZJ
from Utils.fileUtils import fileUtils

datas = fileUtils().getCsvFile(
    r'D:\Maven\YadeaSpider\MapStoreAddress\Data\Baidu_cityCode.csv')
print(datas)