def JinDongCommentCases(url_list):
    prod_url = url_list[2]
    prod_topic = url_list[1]
    prod_id = re.findall(r'(\d+)', prod_url)[0]
    print("Start fetching reviews for product " + prod_topic + ':' + str(prod_id))
    # Collected review records
    rates_list = []
    # Review API page templates
    rate_urls = [
        # Default review endpoint
        'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv44&productId={0}&score=0&sortType=6&page={1}&pageSize=10&isShadowSku=0&fold=1',
        # Folded (low-value) review endpoint
        'https://club.jd.com/comment/getProductPageFoldComments.action?callback=jQuery1719501&productId={0}&score=4&sortType=5&page={1}&pageSize=5&_=1573096406813'
    ]
    # 1. Build the request headers
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__jdu=1150543271; shshshfpa=0cb162de-cb82-21b8-49a7-7e1fd26a3efd-1570864191; user-key=d5809892-c823-402e-9748-c84b2469d56f; cn=0; shshshfpb=eTsoprn6f4hkN00S8LggPuQ%3D%3D; unpl=V2_ZzNtbRYAS0Z8WkQAehlVB2JQRl0SUUcVd1oTAC8YVFIyV0BYclRCFX0URlVnG10UZwYZWEtcRx1FCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWARjBhBeRFdzJXI4dmR%2bG1gDbwIiXHJWc1chVEVSexlcDSoDEllDU0YXdg5GZHopXw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_ef9b8c3e01834be1a7513cdee09fdec4|1572418139698; shshshfp=4ecb84897eabb0f7a4c6348b7cdc7d0a; __jda=122270672.1150543271.1570864187.1572825530.1573090824.9; __jdc=122270672; areaId=12; ipLoc-djd=12-984-3384-0; wlfstk_smdl=gcda47s1yytkclehvxho46m7ddz5g7ow; TrackID=1KNUUCIn3e7IMNektPzhbcu7wSO0kDr7PEe_KWvFCOXkJh4Zo6p9lf8KOj5iwp4Yidll4C9iAu7fQF6LVOjeB1LGNsaTdxOTqpshIt79InXGwUBG-R8JW8h4lpF-aMXFlBoc7nuE4YFFi_IXSENLUoA; thor=F5548B286F0AC84835F479E2098B937588592D856D78425D7FC38CD7238081AFCBA255023DFA3D8E13AF80EB0481FBDF4DA6C1A35102B43FEA63A3914094409E2250E5F462224217F1004694F9EC7CF2DA417BF181A528377DE99BED15AD4C25157B03BD7C98D6058B3B22E3F300B51E9F9A64987B3D551B14DCFF630D20CCBF954CBC1087415F2C2203531C10B881874F74CD45F930D0F4802E5F203320EEDE; pinId=eqbOg6AqvNqT4t6ZRIp7VrV9-x-f3wj7; pin=jd_5580681fb886d; unick=jd_181685ayj; ceshi3.com=103; _tp=OQVsjG6Pu5TIXKleFObW0uc7fxOqC8rImaa7i%2FLjfqM%3D; _pst=jd_5580681fb886d; shshshsID=d4ef035cd6502b3e3bbb5e5859bb09c1_2_1573090894262; __jdb=122270672.4.1150543271|9.1573090824; 3AB9D23F7A4B3C9B=4WQN5JCPKTD4EYGF7GGHYDUIBN64EH5SZHPCNA56CB2G7HP52UGN73YBUMQ2EOMZI4WXVSWB3CSTQT2KOLQIVGGV5A; JSESSIONID=99B9C173D8D05BABCE00F2429A497E26.s1',
        'referer': '{0}'.format(prod_url),
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400'
    }
    # First request: read the total review count
    response = requests.get(rate_urls[1].format(prod_id, 0), headers=headers)
    rates_jsons = json.loads(re.findall('{.*}', response.text)[0])
    sleep(3)
    # Total review count (the page count is derived from it below)
    pages = rates_jsons['productCommentSummary']['commentCount']
    print("===============================")
    for rate_url in rate_urls:
        for page in range(0, int(pages / 10 + 1)):
            print(str(pages) + " reviews in total, fetching page " + str(page))
            sleep(3)
            try:
                rates_responses = requests.get(rate_url.format(prod_id, page), headers=headers)
                rates = json.loads(re.findall('{.*}', rates_responses.text)[0])
            except BaseException:
                print("No data")
                break
            rates_lists = rates['comments']
            for rate_list in rates_lists:
                rate = delSpecialChars(rate_list['content'])
                prod_color = rate_list['productColor']
                prod_name = rate_list['referenceName']
                rate_score = rate_list['score']
                rate_dict = {
                    'add_time': now_time,
                    'prod_name': prod_name,
                    'rate_score': rate_score,
                    'rate': rate,
                    'prod_color': prod_color,
                    'prod_url': prod_url,
                    'prod_topic': prod_topic,
                    'prod_id': prod_id,
                    'sale_num': str(pages)
                }
                print(rate_dict)
                rates_list.append(rate_dict)
            if (rates_lists == []):
                break
    # Save the data to file
    fileUtils().saveAsCsv(rates_list, './Data/Rates/{0}'.format(str(prod_id)))
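# --------------------------------------------------------------------------
# delSpecialChars() is imported from a shared Utils module that is not part of
# this section. A minimal sketch of what it appears to do (collapse newlines,
# tabs and repeated spaces so scraped text stays on one CSV row) is given below
# as an assumption; the repo's real helper may differ.
import re


def delSpecialChars(text):
    """Collapse whitespace runs in scraped text (hypothetical sketch, not the original helper)."""
    return re.sub(r'\s+', ' ', str(text)).strip()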
def BaiDuStoreInfo(queryKey, stores_list):
    url = 'https://map.baidu.com'
    option = webdriver.ChromeOptions()
    # option.add_argument('headless')
    # Use the chromedriver that matches your OS / Chrome version
    driver = webdriver.Chrome(executable_path='./chromedriver.exe', chrome_options=option)
    # Open Baidu Maps
    driver.get(url)
    citys = fileUtils().getCsvFile('./Data/Baidu_cityCode.csv')
    for city in citys:
        querycity = city[1]
        # Type the query into the search box
        timeout = 5
        search_content = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath('//input[@id="sole-input"]'))
        search_content.send_keys('{0}'.format(querycity + queryKey + '电动车'))
        sleep(2)
        # Click "search"
        search_button = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath('//button[@id="search-button"]'))
        search_button.click()
        sleep(2)
        xpath_date = etree.HTML(driver.page_source)
        count_num = 1
        # Total number of results
        try:
            count_num = re.findall(r'共找到(\d+)个搜索结果', driver.page_source)[0]
            print(querycity + ': found {0} results'.format(count_num))
        except IndexError:
            print("No results, moving on to the next city")
            try:
                # Clear the search box
                search_button = WebDriverWait(driver, timeout).until(
                    lambda d: d.find_element_by_xpath(
                        '//div[@class="searchbox-content-button right-button loading-button cancel-button"]'))
                search_button.click()
            except BaseException:
                print("Clear button not found")
            continue
        # Page through all stores
        for i in range(0, int(float(count_num) / 10) + 1):
            sleep(2)
            xpath_date = etree.HTML(driver.page_source)
            print("Fetching " + queryKey + " ==> " + querycity + " page " + str(i + 1))
            li_lists = xpath_date.xpath('//ul[@class="poilist"]/li')
            for li_list in li_lists:
                # Fields to extract
                store_address = ''
                store_name = ''
                lon = ''
                lat = ''
                try:
                    store_address = li_list.xpath(
                        './div[@class="cf"]//div[@class="row addr"]/span[@class="n-grey"]/text()')[0]
                    store_name = li_list.xpath(
                        './div[@class="cf"]//div[@class="row"]/span[1]/a/text()')[0]
                    # Geocode the address via the Baidu geocoding API
                    response = requests.get(
                        'http://api.map.baidu.com/geocoding/v3/?address={0}&output=json&ak=K2WGZeDWlluoHpEpt5qo5Sx6VNyvffLB&callback=showLocation&city={1}'
                        .format(store_address, querycity)).text
                    lon_lat = json.loads(re.findall('{.*}', response)[0])['result']['location']
                    lon = lon_lat['lng']
                    lat = lon_lat['lat']
                except BaseException:
                    print("Bad record or failed to geocode the address")
                    continue
                # print(re.findall('showLocation&&showLocation(.*)', json.loads(response)))
                store_dict = {
                    'topic': queryKey,
                    'store_address': store_address,
                    'store_name': store_name,
                    'lon': lon,
                    'lat': lat,
                    'add_time': now_time
                }
                print(store_dict)
                stores_list.append(store_dict)
            # Click "next page"
            try:
                search_button = WebDriverWait(driver, timeout).until(
                    lambda d: d.find_element_by_xpath('//a[@tid="toNextPage"]'))
                search_button.click()
            except BaseException:
                print('No next page, leaving the loop')
                break
        # Clear the search box before the next city
        search_button = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath(
                '//div[@class="searchbox-content-button right-button loading-button cancel-button"]'))
        search_button.click()
        sleep(2)
        print()
    driver.close()


if __name__ == '__main__':
    # Query keywords (e-bike brands)
    queryKeys = [
        '小牛', '立马', '新大洲', '新蕾', '金箭', '小刀', '台铃', '倍特', '杰宝大王', '绿佳',
        '绿驹', '玉骑铃', '比德文', '雅迪', '爱玛', '绿源', '新日'
    ]
    for queryKey in queryKeys:
        stores_list = []
        BaiDuStoreInfo(queryKey, stores_list)
        # Save the data to file
        fileUtils().saveAsCsv(stores_list, './Data/Stores/{0}'.format(str2pinyon(queryKey)))
        # print(stores_list)
    # Create the table
    resData = pd.read_csv('./Data/Stores/xiaodao.csv', encoding='utf-8')
    resData = resData.astype(object).where(pd.notnull(resData), None)
    createTable(resData, 'spider', 'bd_store_info', '232')
    # Save the data to MySQL
    file_addr = './Data/Stores'
    save_to_mysql(file_addr, 'spider', 'bd_store_info', '232')
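# --------------------------------------------------------------------------
# str2pinyon() is used above to turn a Chinese brand name into an ASCII file name
# (e.g. '小刀' -> 'xiaodao', which matches the './Data/Stores/xiaodao.csv' read in
# the main block). It is not defined in this section; a minimal sketch based on
# the pypinyin package follows. This is an assumption, not the repo's implementation.
from pypinyin import lazy_pinyin


def str2pinyon(text):
    """Convert a Chinese string to toneless lower-case pinyin (hypothetical sketch)."""
    return ''.join(lazy_pinyin(text))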
def TianMaoCommentCases(index, url_list):
    # Collected review records
    rate_list = []
    # First page of the review API
    rate_url = url_list[4]
    # print(rate_url)
    refer_url = url_list[3]
    # print(refer_url)
    topic = url_list[0]
    store_name = url_list[1]
    shop_title = url_list[2]
    # Pick a random cookie
    cookie_list = fileUtils().getCsvFile('../Data/Cookie.csv')
    cookie = random.choice(cookie_list)[0]
    # Pick a random user agent
    ua_list = fileUtils().getCsvFile('../Data/UserAgent.csv')
    ua = random.choice(ua_list)[0]
    # 1. Build the request headers
    headers1 = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": "{0}".format(cookie),
        "referer": "{0}".format(refer_url),
        "user-agent": "{0}".format(ua)
    }
    print("Sleeping 15s before fetching the reviews")
    sleep(15)
    # Fetch the first page of reviews
    response = requests.get(rate_url, headers=headers1)
    data = response.text
    comment_datas = re.findall('{.*}', data)[0]
    print(comment_datas)
    # 2. Total review count and number of pages
    comment_count = json.loads(comment_datas)['rateDetail']['rateCount']['total']
    comment_pages = int(int(comment_count) / 20) + 2
    print("Total reviews: " + str(comment_count) + " === total pages: " + str(comment_pages) + " === waiting 12s")
    sleep(12)
    # Average rating for the product
    avgrate_url = rate_url.replace('rate.tmall.com', 'dsr-rate.tmall.com') \
        .replace('list_detail_rate.htm', 'list_dsr_info.htm') \
        .replace('&order=3&currentPage=1', '')
    avg_response = requests.get(avgrate_url, headers=headers1).text
    avg_rate = json.loads(re.findall('{.*}', avg_response)[0])['dsr']['gradeAvg']
    print("Average rating for this product: " + str(avg_rate) + ", waiting 12s")
    sleep(12)
    # 3. Loop over all review pages
    for comment_page in range(1, comment_pages):
        # Random wait between requests
        sleep_list = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
        sleep_time = random.choice(sleep_list)
        print('==== sleeping ' + str(sleep_time) + 's, ' + str(comment_pages) +
              ' pages in total, fetching page ' + str(comment_page) + ' ====')
        sleep(sleep_time)
        # Paged request URL
        comment_url = rate_url.replace('currentPage=1', 'currentPage={0}').format(comment_page)
        # Pick a random cookie
        cookie_list = fileUtils().getCsvFile('../Data/Cookie.csv')
        cookie = random.choice(cookie_list)[0]
        # Pick a random user agent
        ua_list = fileUtils().getCsvFile('../Data/UserAgent.csv')
        ua = random.choice(ua_list)[0]
        # Request headers
        headers2 = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "{0}".format(cookie),
            "referer": "{0}".format(refer_url),
            "user-agent": "{0}".format(ua)
        }
        print(headers2)
        # Review payload
        comment_data = re.findall('{.*}', requests.get(comment_url, headers=headers2).text)[0]
        print(comment_data)
        try:
            comment_json = json.loads(comment_data)['rateDetail']['rateList']
            for rate in comment_json:
                rateDate = rate['rateDate']
                rateContent = rate['rateContent']
                auctionSku = rate['auctionSku']
                cmsSource = rate['cmsSource']
                if (rateContent != '此用户没有填写评论!'):  # skip the default "no comment" placeholder
                    rate_list.append({
                        'rate_date': rateDate,
                        'rate_content': rateContent,
                        'auction_sku': auctionSku,
                        'cms_source': cmsSource,
                        'avg_rate': avg_rate,
                        'topic': topic,
                        'store_name': store_name,
                        'shop_title': shop_title,
                        'add_date': now_time
                    })
                    print(rateContent)
        except Exception:
            print("Error: page content not found")
        finally:
            print("Continuing")
        print('===========================================')
    # print(rate_list)
    # Save the data to file
    fileUtils().saveAsCsv(rate_list, './Data/Rates/{0}'.format(str(int(index) + 1)))
    sleep(10)
if __name__ == '__main__':
    # Read the product URLs and review URLs
    url_lists = fileUtils().getCsvFile('./Data/store_url.csv')
    # Scrape the product reviews
    # for index in range(143, 169):
    #     url_list = url_lists[index]
    #     print(url_list)
    #     # Fetch Tmall reviews
    #     TianMaoCommentCases(index, url_list)
    file_addr = './Data/Rates'
    save_to_mysql(file_addr, 'spider', 'pt_tm_ec_rates_info')
# --- fragment: tail of TMCarsInfo(car_name); the start of the function is not included here ---
            if (int(prod_price) >= 1000):
                product_list.append(prod_dict)
        except ValueError:
            print('Product does not meet the filter')
        print("Product info: " + str(prod_dict))
    # Shut down the driver
    driver.quit()
    return product_list


if __name__ == '__main__':
    car_names = [
        '雅迪电动车', '新日电动车', '小牛电动车', '绿源电动车', '小刀电动车', '台铃电动车',
        '比德文电动车', '立马电动车', '新大洲电动车', '杰宝大王电动车'
    ]
    car_names = ['电动车']
    # Scrape the product data
    for car_name in car_names:
        prod_results = TMCarsInfo(car_name)
        fileUtils().saveAsCsv(prod_results, './Data/Product/{0}'.format(car_name))

    # # Create the table
    # resData = pd.read_csv('./Data/Product/台铃电动车.csv', encoding='utf-8')
    # resData = resData.astype(object).where(pd.notnull(resData), None)
    # createTable(resData, 'spider', 'pt_tm_ec_products_info', 154)
    #
    # # Save the data to MySQL
    # file_addr = './Data/Product'
    # save_to_mysql(file_addr, 'spider', 'pt_tm_ec_products_info', 154)
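# --------------------------------------------------------------------------
# createTable() and save_to_mysql() come from a shared Utils module that is not
# shown here. Judging from the calls above ('spider' database, a table name and a
# host tag such as 154 or '232'), save_to_mysql loads every CSV in a directory
# into the given MySQL table. A rough sketch with pandas + SQLAlchemy follows;
# the connection string and host mapping are placeholders, not the repo's real
# configuration.
import os

import pandas as pd
from sqlalchemy import create_engine


def save_to_mysql(file_addr, db_name, table_name, host_tag=None):
    """Append every CSV under file_addr into db_name.table_name (hypothetical sketch)."""
    # user, password and host are made-up placeholders
    engine = create_engine(
        'mysql+pymysql://user:password@db-{0}:3306/{1}?charset=utf8mb4'.format(
            host_tag or 'default', db_name))
    for file_name in os.listdir(file_addr):
        if not file_name.endswith('.csv'):
            continue
        frame = pd.read_csv(os.path.join(file_addr, file_name), encoding='utf-8')
        frame = frame.astype(object).where(pd.notnull(frame), None)
        frame.to_sql(table_name, engine, if_exists='append', index=False)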
def TMProductInfos(topic_list):
    option = webdriver.ChromeOptions()
    option.add_argument('--proxy-server=127.0.0.1:8080')
    # Hide the automation flag from bot detection
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Do not load images, to speed up page loads
    option.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    # option.add_argument('headless')
    # Use the chromedriver that matches your OS / Chrome version
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Log in to Taobao
    login_url = 'https://login.taobao.com/member/login.jhtml'
    driver.get(login_url)
    # Switch to password login
    password_login = driver.find_element_by_xpath(
        '//div[@class="login-links"]/a[@class="forget-pwd J_Quick2Static"]')
    password_login.click()
    # Switch to Weibo login
    weibo_login = driver.find_element_by_xpath('//a[@class="weibo-login"]')
    weibo_login.click()
    # Wait for the Weibo account field
    weibo_user = WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_xpath('//div[@id="pl_login_logged"]/div/div[2]/div/input'))
    weibo_user.send_keys('18168546559')
    sleep(1)
    # Wait for the Weibo password field
    weibo_pwd = WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_xpath('//div[@id="pl_login_logged"]/div/div[3]/div/input'))
    weibo_pwd.send_keys('zj123!')
    # Wait for the login button
    submit = WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_xpath('//div[@id="pl_login_logged"]/div/div[7]/div[1]/a/span'))
    submit.click()
    sleep(10)
    for topic in topic_list:
        print("===== Start scraping product info for " + topic + " =====")
        # Collected product records
        products_lists = []
        # Search home page (switch to Tmall)
        driver.get('https://www.tmall.com/')
        # Run the Tmall search
        search_input = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath(
                '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/div/div/input'))
        search_input.send_keys(topic + '电动车')
        submit = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element_by_xpath(
                '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/button'))
        submit.click()
        sleep(2)
        # Parse the result page
        xpath_date = etree.HTML(driver.page_source)
        # Total number of result pages for the current search
        page_sum = xpath_date.xpath('//div[@class="ui-page-wrap"]/b[2]/form/input[3]/@value')[0]
        print("===== " + page_sum + " result pages for the current search =====")
        # Grab part of the data from every page
        for index in range(0, int(page_sum)):
            print("===== Fetching page " + str(index + 1) + " =====")
            # XPath nodes for every product on the page
            xpath_dates = etree.HTML(driver.page_source)
            product_lists = xpath_dates.xpath('//div[@id="J_ItemList"]/div')
            # Second, headless driver used to open each product detail page
            option.add_argument('headless')
            driver2 = webdriver.Chrome(
                executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
                chrome_options=option)
            # Extract the attributes of every product
            for product_list in product_lists:
                # Product price
                product_price = product_list.xpath('./div/p[@class="productPrice"]/em/@title')[0]
                product_price = product_price[:-((product_price.index('.')) - 1)].replace('.', '')
                if (product_price == ''):
                    print("No price parsed, skipping")
                    continue
                if (int(product_price) < 1000):
                    print("Price below 1000, not a complete vehicle, skipping")
                    continue
                product_title = product_list.xpath('./div/p[@class="productTitle"]/a/@title')[0]
                product_url = 'http:' + product_list.xpath('./div/p[@class="productTitle"]/a/@href')[0]
                product_id = re.findall(r'//detail.tmall.com/item.htm\?id=(\d+)&', product_url)[0]
                shop_url = 'http:' + product_list.xpath(
                    './div/div[@class="productShop"]/a[@class="productShop-name"]/@href')[0]
                shop_id = re.findall(r'user_number_id=(\d+)&', shop_url)[0]
                shop_name = product_list.xpath('./div/p[@class="productStatus"]/span[3]/@data-nick')[0]
                month_sale = product_list.xpath('./div/p[@class="productStatus"]/span/em/text()')[0]
                comment_sum = product_list.xpath('./div/p[@class="productStatus"]/span[2]/a/text()')[0]
                comment_url = 'https:' + product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/@href')[0]
                # Open the product URL to get the details
                driver2.get(product_url)
                selector = etree.HTML(driver2.page_source)
                shop_items = selector.xpath('//*[@id="J_AttrUL"]/li')
                # Detailed product specification text
                shop_list = []
                for shop_item in shop_items:
                    shop_list.append(delSpecialChars(shop_item.xpath('./text()')[0]))
                # Shop rating
                score_list = []
                scores = selector.xpath('//*[@id="shop-info"]/div[2]/div')
                for score in scores:
                    res_score = delSpecialChars(score.xpath('./div[1]/text()')[0]) + delSpecialChars(
                        score.xpath('./div[2]/span/text()')[0])
                    score_list.append(res_score)
                # Record for this product
                product_dict = {
                    'product_title': product_title,
                    'product_id': product_id,
                    'product_url': product_url,
                    'product_price': product_price,
                    'month_sale': month_sale,
                    'shop_name': shop_name,
                    'shop_url': shop_url,
                    'shop_id': shop_id,
                    'comment_sum': comment_sum,
                    'add_time': add_time,
                    'topic': topic,
                    'shop_list': str(shop_list),
                    'score_list': str(score_list),
                    'comment_url': comment_url
                }
                print(product_dict)
                products_lists.append(product_dict)
                # sleep(1)
            try:
                # Go to the next result page
                next_button = WebDriverWait(driver, timeout).until(
                    lambda d: d.find_element_by_xpath(
                        '//div[@class="ui-page"]/div/b/a[@class="ui-page-next"]'))
                next_button.click()
            except BaseException:
                print("No next page, leaving the loop")
                break
            sleep(3)
        driver2.quit()
        fileUtils().saveAsCsv(products_lists, './Data/Product/{0}'.format(topic))
    driver.quit()
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author: Aliex ZJ
from Utils.fileUtils import fileUtils

# Quick check: read the Baidu city-code CSV and print its rows
datas = fileUtils().getCsvFile(
    r'D:\Maven\YadeaSpider\MapStoreAddress\Data\Baidu_cityCode.csv')
print(datas)
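# --------------------------------------------------------------------------
# fileUtils is the shared CSV helper used throughout the scrapers above
# (getCsvFile() returns rows, saveAsCsv() dumps a list of dicts). Its source is
# not included in this section; the sketch below is an assumption based on how
# it is called, not the original implementation.
import csv
import os


class fileUtils:

    def getCsvFile(self, path):
        """Read a CSV file and return its rows as lists (hypothetical sketch)."""
        with open(path, 'r', encoding='utf-8') as f:
            return [row for row in csv.reader(f)]

    def saveAsCsv(self, dict_list, path):
        """Write a list of dicts to <path>.csv, creating the folder if needed (hypothetical sketch)."""
        if not dict_list:
            return
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path + '.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(dict_list[0].keys()))
            writer.writeheader()
            writer.writerows(dict_list)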