def tmallBrandNews():
    """Search Tmall for every shop in ``allShopName`` and scrape its items.

    Starts a local Chrome session, logs in through ``tmallLogin``, types each
    shop name into the Tmall search box and then walks the shop's result
    pages (the ``s`` offset in the URL advances by 60 per page).  The item ids
    and pre-sale prices found on each page are handed to
    ``getDetailFilterData``.

    The browser is shut down exactly once, whether the crawl finishes, the
    initial navigation fails, or an unexpected exception propagates.

    NOTE(review): ``allShopName`` is a module-level object indexed as
    ``allShopName[column][row]``; presumably a pandas DataFrame -- confirm.
    """
    # Chrome options: Chinese locale and a desktop Chrome user agent.
    options = webdriver.ChromeOptions()
    options.add_argument('lang=zh_CN.UTF-8')
    options.add_argument(
        'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"'
    )
    # NOTE(review): `options` is never passed to webdriver.Chrome (the
    # chrome_options argument was commented out in the original), so the
    # arguments above currently have no effect -- confirm this is intended.
    driver = webdriver.Chrome(
        executable_path=r'/Users/zhuoqin/Downloads/123456/chromedriver')
    # Explicit wait: up to 200 seconds, polling every 0.5 seconds.
    wait = WebDriverWait(driver, 200, 0.5)

    try:
        print('窗口最大化----1')
        tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
        print('窗口最大化----2')

        try:
            print('进来窗口了')
            driver.maximize_window()
            driver.get("https://list.tmall.com/search_product.htm?q=123&type=p")
            print('窗口最大化----')
        except Exception as e:
            # The original called quit() and then close(); close() on a
            # session that has already quit raises, so the return was never
            # reached.  The finally clause below performs the single quit().
            print('即将出错啦----%s' % e)
            return

        time.sleep(2)
        if 'login.tmall' in str(driver.current_url):
            print('即将登录')
            tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
        SelectShopTempletes()

        for i in range(len(allShopName)):
            shop_name = allShopName['shopName'][i]
            print('进来了---%s' % i)

            # Type the shop name into the search box, pausing like a human.
            driver.find_element_by_xpath('//*[@id="mq"]').clear()
            time.sleep(random.uniform(2, 4))
            driver.find_element_by_xpath('//*[@id="mq"]').send_keys(shop_name)
            print('地址********%s' % (shop_name))
            time.sleep(random.uniform(4, 6))

            if 'login.tmall' in str(driver.current_url):
                print('当前为登录页面')
                tmallLogin(driver, UnexpectedAlertPresentException,
                           ActionChains)

            driver.find_element_by_xpath(
                '//*[@id="mallSearch"]/form/fieldset/div/button').click()
            tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
            wait.until(EC.presence_of_element_located((By.ID, 'J_ItemList')))
            time.sleep(random.uniform(2, 4))
            print('赶紧睡吧')
            JudgeLoginSuccess(driver, UnexpectedAlertPresentException,
                              ActionChains)
            time.sleep(random.randint(1, 3))

            # The first result page tells us how many pages the shop has.
            first_page = pq(driver.page_source)
            for j in range(int(getProjectPage(first_page))):
                time.sleep(random.randint(4, 8))
                urls = 'https://list.tmall.com/search_shopitem.htm?s=' + str(
                    j * 60) + '&oq=' + str(
                        shop_name) + '&style=sg&sort=s&user_id=' + str(
                            allShopName['user_id'][i]) + '&stype=search'
                print(urls)
                driver.get(urls)
                time.sleep(random.randint(1, 3))

                # tmallCode reports '访问过快' when Tmall throttles us;
                # skip this page in that case.
                if tmallCode(driver, wait, EC) == '访问过快':
                    continue
                tmallLogin(driver, UnexpectedAlertPresentException,
                           ActionChains)
                wait.until(
                    EC.presence_of_element_located((By.ID, 'J_ItemList')))
                time.sleep(random.uniform(3, 5))

                doc = pq(driver.page_source)
                # .items() is a one-shot iterator, so the selection is
                # queried once per consumer.
                id_items = doc('#J_ItemList .product .product-iWrap').items()
                resultData = search(id_items)
                price_items = doc(
                    '#J_ItemList .product .product-iWrap').items()
                YuShouPriceData = YuShouPrice(price_items)
                time.sleep(random.uniform(3, 5))

                print('所有店铺ID----%s-----%s' % (len(resultData), resultData))
                print('所有预售价---%s' % YuShouPriceData)
                getDetailFilterData(driver, wait, resultData, YuShouPriceData,
                                    allShopName['shopName'][i])
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()
def getDetailFilterData(driver, wait, resultData, YuShouPriceData, ShopName): for i in range(0, len(resultData)): TreasureID = str(resultData[i]) presellPrice = clearToReplaceData(str(YuShouPriceData[i]), 1) print '------你大爷------' time.sleep(random.uniform(5, 8)) try: driver.get('https://detail.tmall.com/item.htm?id=' + str(TreasureID)) time.sleep(random.randint(1, 3)) print '地址----%s' % str(TreasureID) except Exception as e: print e continue if judgeProduct(driver) == True: print '商品不存在' continue if tmallCode(driver, wait, EC) == '访问过快': continue tmallLogin(driver, UnexpectedAlertPresentException, ActionChains) try: wait.until( EC.presence_of_element_located( (By.CLASS_NAME, 'tb-detail-hd'))) # 显性等待 except Exception as e: print '显性未加载成功---%s' % e continue time.sleep(random.randint(2, 5)) # 这里得让他睡眠一下,否则第二页开始会报错(加载数据) htmlDetail = driver.page_source # 这是一面的页面内容 try: doc = pq(htmlDetail) if WhetherYuShou(doc) == False: print '-------不是预售,不需要保存--------' time.sleep(random.uniform(2, 4)) continue detailURL = 'https://detail.tmall.com/item.htm?id=' + str( TreasureID) # TODO:XDF 这里需要注意一下,src图片链接可以不丰在https,需要自己手动拼接 mainPics = doc.find('#J_ImgBooth').attr('src') if 'https:' in mainPics: mainPic = mainPics else: mainPic = 'https:' + mainPics # 收藏人数 popularity = clearToReplaceData( doc.find('#J_CollectCount').text(), 0) # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹) # presellPrice = clearToReplaceData(doc.find('#J_PromoBox').text(), 1) address = doc.find('#J_deliveryAdd').text() paymentDate = doc.find('.J_step2Time').text().split('~') paymentBeginDate = paymentDate[0] paymentFinishDate = paymentDate[1] reserveCount = clearToReplaceData( doc.find('.tb-wrt-guc').text(), 3) detailPrice = clearToReplaceData( doc.find('#J_StrPriceModBox .tm-price').text(), 5) categoryIdContent = clearToReplaceData( str(doc.find('#J_ZebraPriceDesc').attr('mdv-cfg')), 2) print(categoryIdContent) spuIds = 'TShop.Setup\((.*?)\);' apiData = re.findall(spuIds, htmlDetail, re.S)[0] datas = 
json.loads(apiData) brandId = datas['itemDO']['brandId'] categoryId = datas['itemDO']['categoryId'] rootCatId = datas['itemDO']['rootCatId'] spuId = datas['itemDO']['spuId'] title = datas['itemDO']['title'] sellerId = datas['rateConfig']['sellerId'] # URL_NO = datas['rstShopId'] shopID = sellerId shopName = doc.find('.slogo-shopname').text() categoryName = categoryNamesQly(str(TreasureID)) styleData = doc.find('#J_AttrUL').children().items() # 风格 StyleName = styleNames(styleData) # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源 brandData = doc.find('#J_AttrUL').children().items() # 品牌 brand = brandName(brandData) # 评价描述评分 EvaluationScores = evaluationScoreURL(str(TreasureID), str(spuId), str(sellerId)) URL_NO = doc.find('#LineZing').attr('shopid') if ExistenceShopName(ShopName) == False: #这里证明这家店铺是没有被爬过的 InsertShopTempletes(ShopName, URL_NO) try: ShopURL = str(doc.find('.shopLink').attr('href')) if len(ShopURL): ShopURL = clearToReplaceData(ShopURL, 4) else: ShopURL = '-' except Exception as e: print e print str( TreasureID ), shopName, categoryName, datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S' ), detailPrice, address, title, mainPic, presellPrice, popularity # , paymentBeginDate, paymentFinishDate, reserveCount StartTime = datetime.datetime.strptime( str(allShopName['YuShouStartTime'][0]), '%Y-%m-%d %H:%M:%S') EndTime = datetime.datetime.strptime( str(allShopName['YuShouEndTime'][0]), '%Y-%m-%d %H:%M:%S') # print nowTime, type(nowTime), type(endTime), now(), endTime # if now() > str(allShopName['YuShouEndTime'][0]): # print '过期' # else: # print '没过期' product = { 'title': title, 'TreasureID': str(TreasureID), 'addRess': address, 'shopName': shopName, 'mainPic': mainPic, 'detailPrice': detailPrice, 'popularity': popularity, 'reserveCount': int(reserveCount), 'paymentBeginDate': paymentBeginDate, 'paymentFinishDate': paymentFinishDate, 'presellPrice': float(presellPrice), 'categoryId': int(categoryId), 'categoryName': categoryName, 
'spiderTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'ShopID': shopID, 'brandId': brandId, 'brand': brand, 'spuId': spuId, 'rootCatId': int(rootCatId), 'StyleName': StyleName, 'EffectiveTime': '', 'ReservationStatus': 0, 'CollectionNum': int(popularity), 'ItemName': '', 'NCategory_Name': '', 'Is_Search': 1, 'NStyleName': ' ', 'NewstPrice': 0, 'EvaluationScores': float(EvaluationScores), 'URL_NO': URL_NO, 'ShopURL': ShopURL, 'sellerId': sellerId, 'productState': '1', 'JHSmodifyTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'modifyTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'StartTime': StartTime, 'EndTime': EndTime, 'detailURL': detailURL } currentTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') rateData = getCommentBestNewTime(product) if rateData: # commentNewTime = getCommentBestNewTime(product) EvaluationNewTime = strToDateTime(str(rateData), 'sixLineTypes') else: EvaluationNewTime = '' if str(dbChoice['TmallYuShouEnemyShopSql'] [0]) == 'Mongodb': #保存到mongodb saveYuShouOrRemove(product, str(TreasureID)) else: print '保存到sqlserver数据库...' if judgeHaveTreasureID(product, '') == True: InsertOrUpdateBaseInfo(product, 'Update') else: InsertOrUpdateBaseInfo(product, 'Insert') InsertPreSaleNew(product) except Exception as e: print('error---%s' % e)
def tmallGivenIDAndShopName(): # TODO:XDF Chrome欲歌浏览器 options = webdriver.ChromeOptions() # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件 # 设置中文 options.add_argument('lang=zh_CN.UTF-8') prefs = {"profile.managed_default_content_settings.images": 2} options.add_experimental_option("prefs", prefs) # TODO:XDF 禁止加载图片 # 更换头部 options.add_argument( 'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"' ) driver = webdriver.Chrome( executable_path= r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/chromedriver') wait = WebDriverWait(driver, 200, 0.5) # 表示给browser浏览器一个10秒的加载时间 # TODO:XDF PhantomJS无头浏览器 # dcap = dict(DesiredCapabilities.PHANTOMJS) # dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36") # 设置user-agent请求头 # dcap["phantomjs.page.settings.loadImages"] = False # 禁止加载图片 # # print ('即将开始。。。') # service_args = [] # service_args.append('--load-images=no') ##关闭图片加载 # service_args.append('--disk-cache=yes') ##开启缓存 # service_args.append('--ignore-ssl-errors=true') ##忽略https错误 # #TODO:XDF 针对本地调试 # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap) # # # # # driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs', service_args=service_args,desired_capabilities=dcap) # TODO:XDF 针对Linux # # # TODO:XDF 针对Linux服务器 # wait = WebDriverWait(driver, 60, 0.5) # 表示给browser浏览器一个10秒的加载时间 # # # driver.implicitly_wait(30) # driver.set_page_load_timeout(30) print('等待中。。。') while True: # for k in range(0, len(ownShopID)): for datas in selectAllProductID(): driver.get('https://detail.tmall.com/item.htm?id=%s' % str(datas[0])) print('https://detail.tmall.com/item.htm?id=%s' % str(datas[0])) TreasureID = str(datas[0]) JudgeLoginSuccess(driver) tmallLogin(driver) 
time.sleep(random.uniform(3, 5)) if judgeProduct(driver) == True: print '商品不存在' continue tmallCode(driver, wait) try: wait.until( EC.presence_of_element_located( (By.CLASS_NAME, 'tb-detail-hd'))) # 显性等待 except Exception as e: print '显性未加载成功---%s' % e time.sleep(random.randint(4, 5)) # 这里得让他睡眠一下,否则第二页开始会报错(加载数据) # driver.implicitly_wait(30) #隐性等待30秒,如果30之内页面加载完毕,往下执行,否则超时会报错,需要处理 html = driver.page_source # 这是一面的页面内容 # print '源码内容----%s'%html # print ('等待中。。。%s' % k) if 'tmall' in driver.current_url: detailURL = 'https://detail.tmall.com/item.htm?id=' + str( datas[0]) else: detailURL = 'https://item.taobao.com/item.htm?id=' + str( datas[0]) try: doc = pq(html) # TODO:XDF 这里需要注意一下,src图片链接可以不丰在https,需要自己手动拼接 mainPics = doc.find('#J_ImgBooth').attr('src') if 'https:' in mainPics: mainPic = mainPics else: mainPic = 'https:' + mainPics # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹) presellPrice = clearToReplaceData( doc.find('#J_PromoBox').text(), 1) address = doc.find('#J_deliveryAdd').text() # 收藏人数 popularity = clearToReplaceData( doc.find('#J_CollectCount').text(), 0) reserveCount = clearToReplaceData( doc.find('.tb-wrt-guc').text(), 3) detailPrice = clearToReplaceData( doc.find('#J_StrPriceModBox .tm-price').text(), 5) categoryIdContent = clearToReplaceData( str(doc.find('#J_ZebraPriceDesc').attr('mdv-cfg')), 2) print(categoryIdContent) spuIds = 'TShop.Setup\((.*?)\);' apiData = re.findall(spuIds, html, re.S)[0] datas = json.loads(apiData) brandId = datas['itemDO']['brandId'] categoryId = datas['itemDO']['categoryId'] rootCatId = datas['itemDO']['rootCatId'] spuId = datas['itemDO']['spuId'] title = datas['itemDO']['title'] sellerId = datas['rateConfig']['sellerId'] shopID = sellerId shopName = doc.find('.slogo-shopname').text() categoryName = categoryNamesQly(TreasureID) print '类目名称------%s' % categoryName styleData = doc.find('#J_AttrUL').children().items() # 风格 StyleName = styleNames(styleData) # 
因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源 brandData = doc.find('#J_AttrUL').children().items() # 品牌 brand = brandName(brandData) # 评价描述评分 EvaluationScores = evaluationScoreURL(str(TreasureID), str(spuId), str(sellerId)) print '你大爷的------000' URL_NO = doc.find('#LineZing').attr('shopid') try: ShopURL = str(doc.find('.shopLink').attr('href')) if len(ShopURL): ShopURL = clearToReplaceData(ShopURL, 4) else: ShopURL = '-' except Exception as e: print e print '你大爷的------123' print TreasureID, shopName, categoryName, address, detailURL, title, mainPic, presellPrice, popularity # , paymentBeginDate, paymentFinishDate, reserveCount print '你大爷的------456' product = { 'title': title, 'TreasureID': TreasureID, 'addRess': address, 'shopName': shopName, 'mainPic': mainPic, 'detailURL': detailURL, 'detailPrice': detailPrice, 'popularity': popularity, 'reserveCount': reserveCount, 'presellPrice': presellPrice, 'categoryId': int(categoryId), 'categoryName': categoryName, 'spiderTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'ShopID': shopID, 'brandId': brandId, 'brand': brand, 'spuId': spuId, 'rootCatId': int(rootCatId), 'StyleName': StyleName, 'EffectiveTime': '', 'ReservationStatus': 0, 'ReNewPreSaleTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'JHSReNewTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'CollectionNum': 0, 'JHSmodifyTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'ItemName': '', 'EvaluationTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'NCategory_Name': '', 'Is_Search': 1, 'NStyleName': '', 'NewstPrice': 0, 'SkuModifyDate': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'TempleteTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'EvaluationScores': float(EvaluationScores), 'URL_NO': URL_NO, 'ShopURL': ShopURL, 'sellerId': sellerId, 'NumTimes': '第' + str(productNumberTimes(TreasureID) + 1) + '次爬取' } saveTmallGivenIDTB(product) # if 
judgeHaveTreasureID(product) == True: # print '存在------' # InsertOrUpdateBaseInfo(product,'Update') # print '更新成功------' # else: # print '不存在吧------' # InsertOrUpdateBaseInfo(product, 'Insert') # print '存入成功------' # # InsertPreSaleNew(product) except Exception as e: print('error---%s' % e) break print('---------------名字----1') time.sleep(random.randint(4, 7)) driver.close() driver.quit()
def getTaoBaoCommentData(): # TODO:XDF Chrome欲歌浏览器 options = webdriver.ChromeOptions() # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件 # 设置中文 options.add_argument('lang=zh_CN.UTF-8') prefs = {"profile.managed_default_content_settings.images": 2} options.add_experimental_option("prefs", prefs) # TODO:XDF 禁止加载图片 # 更换头部 options.add_argument( 'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"' ) driver = webdriver.Chrome( executable_path=r'/Users/zhuoqin/Downloads/123456/chromedriver' ) # chrome_options=options, wait = WebDriverWait(driver, 200, 0.5) # 表示给browser浏览器一个10秒的加载时间 # TODO:XDF PhantomJS无头浏览器 # dcap = dict(DesiredCapabilities.PHANTOMJS) # dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36") # 设置user-agent请求头 # dcap["phantomjs.page.settings.loadImages"] = False # 禁止加载图片 # # print ('即将开始。。。') # service_args = [] # service_args.append('--load-images=no') ##关闭图片加载 # service_args.append('--disk-cache=yes') ##开启缓存 # service_args.append('--ignore-ssl-errors=true') ##忽略https错误 # #TODO:XDF 针对本地调试 # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap) # # # # # driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs', service_args=service_args,desired_capabilities=dcap) # TODO:XDF 针对Linux # # # TODO:XDF 针对Linux服务器 # wait = WebDriverWait(driver, 60, 0.5) # 表示给browser浏览器一个10秒的加载时间 # # # driver.implicitly_wait(30) # driver.set_page_load_timeout(30) print('等待中。。。') driver.maximize_window() driver.get('https://item.taobao.com/item.htm?id=541901945837') BounceCommentLogin(driver) tmallLogin(driver, UnexpectedAlertPresentException, ActionChains) time.sleep(random.uniform(3, 5)) tmallCode(driver, wait, EC) try: wait.until( 
EC.presence_of_element_located( (By.CLASS_NAME, 'attributes-list'))) # 显性等待 except Exception as e: print '显性未加载成功---%s' % e time.sleep(random.randint(5, 6)) # 这里得让他睡眠一下,否则第二页开始会报错(加载数据) doc = pq(driver.page_source) shopName = doc.find('.shop-name-link').text() Title = doc.find('#J_Title .tb-main-title').attr('data-title') # print '类型---%s'%categoryId styleData = doc.find('.attributes-list').children().items() # 风格 StyleName = styleNames(styleData) # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源 brandData = doc.find('.attributes-list').children().items() # 品牌 brand = brandName(brandData) categoryName = categoryNamesQly('541901945837') spuIds = "shopId : '(.*?)'," shopId = re.findall(spuIds, driver.page_source, re.S)[0] sellerIds = "sellerId : '(.*?)'," sellerId = re.findall(sellerIds, driver.page_source, re.S)[0] categoryIds = " cid : '(.*?)'," categoryId = re.findall(categoryIds, driver.page_source, re.S)[0] print '店铺名-----%s' % shopName, Title, StyleName, brand, categoryName, shopId, sellerId, categoryId time.sleep(random.randint(15, 20)) # 这里得让他睡眠一下,否则第二页开始会报错(加载数据) time.sleep(random.randint(4, 5)) # 这里得让他睡眠一下,否则第二页开始会报错(加载数据) js = "var q = document.documentElement.scrollTop = 1000" driver.execute_script(js) time.sleep(random.uniform(3, 5)) driver.find_element_by_xpath('//*[@id="J_TabBar"]/li[2]/a').click() time.sleep(random.uniform(3, 5)) BounceCommentLogin(driver) dragger = driver.find_element_by_class_name("sorting") action = ActionChains(driver) action.move_to_element(dragger) time.sleep(random.uniform(3, 5)) action.click_and_hold(on_element=dragger).perform() driver.find_element_by_xpath( '//*[@id="reviews"]/div/div/div/div/div/div[1]/div[2]/div/ul/li[2]' ).click() # print '源码-----%s'%driver.page_source action.release() time.sleep(random.uniform(5, 8)) driver.delete_all_cookies() print '测试一下吧-------1' time.sleep(random.uniform(5, 8)) while True: BounceCommentLogin(driver) LoginCodeVerificatin(driver) time.sleep(random.uniform(5, 8)) 
doc = pq(driver.page_source) print '源码吧----%s' % doc.find( '.J_KgRate_ReviewItem.kg-rate-ct-review-item').text() for data in doc.find( '.J_KgRate_ReviewItem.kg-rate-ct-review-item').items(): RateDate = strToDateTime( str(data.find('.tb-r-act-bar .tb-r-info .tb-r-date').text()), 'fiveAllWordTypes') TaoBaoComment = data.find( '.tb-rev-item .J_KgRate_ReviewContent.tb-tbcr-content ').text( ) PhotoItems = data.find( '.tb-rev-item-media .kg-photo-viewer-thumb-bar.tb-tbcr-mt .photo-item' ).items() auctionSku = data.find('.tb-r-act-bar .tb-r-info').text() #追加内容 appendContent = data.find( '.tb-rev-item tb-rev-item-append .tb-rev-item.tb-rev-item-append .J_KgRate_ReviewContent.tb-tbcr-content ' ).text() if '颜色分类:' in auctionSku: auctionSku = '颜色分类:' + auctionSku.split('颜色分类:')[-1] else: auctionSku = '-' Phostos = [] for photo in PhotoItems: # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹) if 'xmlns' in photo.html(): selector = etree.HTML(photo.html()) Img = str(selector.xpath('//img/@src')[0]) if 'https:' not in Img: Image = 'https:' + Img.replace('40x40', '400x400') else: Image = Img Phostos.append(Image) else: print '**************不存在xmlns啦*****************' # print '相片----%s'%Image#photo.html() print '测试一下吧------', data.find('.from-whom').text( ), RateDate, TaoBaoComment, Phostos, auctionSku, appendContent # print '源码-----%s' %doc.find('.tb-revbd').text() time.sleep(random.uniform(5, 8)) driver.find_element_by_xpath('//*[@class="pg-next"]').click() time.sleep(random.uniform(5, 8))
def tmallGivenIDAndShopName(): # TODO:XDF Chrome欲歌浏览器 # options = webdriver.ChromeOptions() # # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件 # # 设置中文 # options.add_argument('lang=zh_CN.UTF-8') # prefs = {"profile.managed_default_content_settings.images": 2} # options.add_experimental_option("prefs", prefs) # TODO:XDF 禁止加载图片 # # 更换头部 # options.add_argument( # 'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"') # # driver = webdriver.Chrome(chrome_options=options, # # executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/chromedriver') # driver = webdriver.Chrome(executable_path=r'/Users/zhuoqin/Downloads/123456/chromedriver') # chrome_options=options, # wait = WebDriverWait(driver, 200, 0.5) # 表示给browser浏览器一个10秒的加载时间 # TODO:XDF PhantomJS无头浏览器 dcap = dict(DesiredCapabilities.PHANTOMJS) dcap["phantomjs.page.settings.userAgent"] = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36" ) # 设置user-agent请求头 dcap["phantomjs.page.settings.loadImages"] = False # 禁止加载图片 print('即将开始。。。') service_args = [] service_args.append('--load-images=no') ##关闭图片加载 service_args.append('--disk-cache=yes') ##开启缓存 service_args.append('--ignore-ssl-errors=true') ##忽略https错误 #TODO:XDF 针对本地调试 # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap) # driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs', service_args=service_args, desired_capabilities=dcap) # TODO:XDF 针对Linux # # TODO:XDF 针对Linux服务器 wait = WebDriverWait(driver, 60, 0.5) # 表示给browser浏览器一个10秒的加载时间 # driver.implicitly_wait(30) driver.set_page_load_timeout(30) print('等待中。。。') while True: BaseInfo = TmallYuShouBaseInfoData() for k in range(0, len(BaseInfo)): time.sleep(random.uniform(5, 8)) TreasureID = str(BaseInfo[k]).replace(' ', '') 
driver.get('https://detail.tmall.com/item.htm?id=%s' % TreasureID) print('https://detail.tmall.com/item.htm?id=%s' % TreasureID) # ID = str(ownShopID['shopID'][k]) # JudgeLoginSuccess(driver,UnexpectedAlertPresentException,ActionChains) tmallLogin(driver, UnexpectedAlertPresentException, ActionChains) time.sleep(random.uniform(3, 5)) if judgeProduct(driver) == True: print '商品不存在' continue tmallCode(driver, wait, EC) try: wait.until( EC.presence_of_element_located( (By.CLASS_NAME, 'tb-detail-hd'))) # 显性等待 except Exception as e: print '显性未加载成功---%s' % e time.sleep(random.randint(3, 5)) #这里得让他睡眠一下,否则第二页开始会报错(加载数据) # driver.implicitly_wait(30) #隐性等待30秒,如果30之内页面加载完毕,往下执行,否则超时会报错,需要处理 html = driver.page_source #这是一面的页面内容 # print '源码内容----%s'%html print('等待中。。。%s' % k) if 'tmall' in driver.current_url: detailURL = 'https://detail.tmall.com/item.htm?id=' + TreasureID else: detailURL = 'https://item.taobao.com/item.htm?id=' + TreasureID try: doc = pq(html) if WhetherYuShou(doc) == False: print '-----不是预售-----跳过' continue # TODO:XDF 这里需要注意一下,src图片链接可以不丰在https,需要自己手动拼接 mainPics = doc.find('#J_ImgBooth').attr('src') if 'https:' in mainPics: mainPic = mainPics else: mainPic = 'https:' + mainPics # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹) # if 'xmlns' in doc.find('.tb-detail-hd').html(): # print ('存在xmlns--%s'%doc.find('.tb-detail-hd').html()) # titles = doc.find('.tb-detail-hd').html() # selector = etree.HTML(titles) # title = str(selector.xpath('//h1/text()')[0]).replace('\r\n','').replace(' ','').replace('\n','').replace('\t','') # print (title) # else: # title = doc.find('#detail .tb-detail-hd h1').text().replace('\r\n', '').replace(' ', '').replace('\n', '').replace('\t','') presellPrice = clearToReplaceData( doc.find('#J_PromoBox').text(), 1) address = doc.find('#J_deliveryAdd').text() #收藏人数 popularity = clearToReplaceData( doc.find('#J_CollectCount').text(), 0) reserveCount = clearToReplaceData( doc.find('.tb-wrt-guc').text(), 3) paymentDate = 
doc.find('.J_step2Time').text().split('~') # driver.save_screenshot('RecordProcess/ceShiprocess%s.png' % k) try: paymentBeginDate = paymentDate[0] paymentFinishDate = paymentDate[1] except Exception as e: print('miss2---%s' % e) detailPrice = clearToReplaceData( doc.find('#J_StrPriceModBox .tm-price').text(), 5) categoryIdContent = clearToReplaceData( str(doc.find('#J_ZebraPriceDesc').attr('mdv-cfg')), 2) print(categoryIdContent) spuIds = 'TShop.Setup\((.*?)\);' apiData = re.findall(spuIds, html, re.S)[0] datas = json.loads(apiData) brandId = datas['itemDO']['brandId'] categoryId = datas['itemDO']['categoryId'] rootCatId = datas['itemDO']['rootCatId'] spuId = datas['itemDO']['spuId'] title = datas['itemDO']['title'] sellerId = datas['rateConfig']['sellerId'] shopID = sellerId shopName = doc.find('.slogo-shopname').text() categoryName = categoryNamesQly(TreasureID) # print '类型---%s'%categoryId styleData = doc.find('#J_AttrUL').children().items() # 风格 StyleName = styleNames(styleData) # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源 brandData = doc.find('#J_AttrUL').children().items() # 品牌 brand = brandName(brandData) # 评价描述评分 EvaluationScores = evaluationScoreURL(str(TreasureID), str(spuId), str(sellerId)) URL_NO = doc.find('#LineZing').attr('shopid') try: ShopURL = str(doc.find('.shopLink').attr('href')) if len(ShopURL): ShopURL = clearToReplaceData(ShopURL, 4) else: ShopURL = '-' except Exception as e: print e print TreasureID, shopName, categoryName, datetime.datetime.now( ).strftime( '%Y-%m-%d %H:%M:%S' ), detailPrice, address, detailURL, title, mainPic, presellPrice, popularity #, paymentBeginDate, paymentFinishDate, reserveCount # product = { # 'title': title, # 'ID': ID, # 'addRess': address, # 'shopName': shopName, # 'mainPic': mainPic, # 'detailURL': detailURL, # 'detailPrice': detailPrice, # 'popularity': popularity, # 'reserveCount': reserveCount, # 'paymentBeginDate': paymentBeginDate, # 'paymentFinishDate': paymentFinishDate, # 
'presellPrice': presellPrice, # 'categoryId': int(categoryId), # 'categoryName': categoryName, # 'spiderTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'ShopID':shopID, # 'brandId':brandId, # 'brand':brand, # 'spuId':spuId, # 'rootCatId':int(rootCatId), # 'StyleName':StyleName, # 'EffectiveTime':'', # 'ReservationStatus':0, # 'ReNewPreSaleTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'JHSReNewTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'CollectionNum':0, # 'JHSmodifyTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'ItemName':'', # 'EvaluationTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'NCategory_Name':'', # 'Is_Search':1, # 'NStyleName':'', # 'NewstPrice':0, # 'SkuModifyDate':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'TempleteTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # 'EvaluationScores':float(EvaluationScores), # 'URL_NO':URL_NO, # 'ShopURL':ShopURL, # 'sellerId':sellerId # } StartTime = datetime.datetime.strptime( str(allShopName['YuShouStartTime'][0]), '%Y-%m-%d %H:%M:%S') EndTime = datetime.datetime.strptime( str(allShopName['YuShouEndTime'][0]), '%Y-%m-%d %H:%M:%S') product = { 'title': title, 'TreasureID': TreasureID, 'addRess': address, 'shopName': shopName, 'mainPic': mainPic, 'detailPrice': detailPrice, 'popularity': popularity, 'reserveCount': int(reserveCount), 'paymentBeginDate': strToDateTime(str(paymentBeginDate), 'fiveColonTypes'), 'paymentFinishDate': strToDateTime(str(paymentFinishDate), 'fiveColonTypes'), 'presellPrice': presellPrice, 'categoryId': int(categoryId), 'categoryName': categoryName, 'spiderTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'ShopID': shopID, 'brandId': brandId, 'brand': brand, 'spuId': spuId, 'rootCatId': int(rootCatId), 'StyleName': StyleName, 'EffectiveTime': '', 'ReservationStatus': 0, 'CollectionNum': int(popularity), 'ItemName': '', 'NCategory_Name': '', 'Is_Search': 1, 'NStyleName': ' 
', 'NewstPrice': 0, 'EvaluationScores': float(EvaluationScores), 'URL_NO': URL_NO, 'ShopURL': ShopURL, 'sellerId': sellerId, 'productState': '1', 'JHSmodifyTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'modifyTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'StartTime': StartTime, 'EndTime': EndTime, 'detailURL': detailURL } if str(dbChoice['tmallYuShouSql'][0]) == 'Mongodb': if TreasureID in BaseInfo: UpdateTmallBaseInfoTB(product) UpdateTmallYuShouTB(product) else: saveTmallGivenIDToYuShouTB(product) saveTmallBaseInfoTBToMongodb(product) else: if judgeHaveTreasureID(product) == True: print '存在------' InsertOrUpdateBaseInfo(product, 'Update') print '更新成功------' else: print '不存在吧------' InsertOrUpdateBaseInfo(product, 'Insert') print '存入成功------' # InsertPreSaleNew(product) except Exception as e: print('error---%s' % e) time.sleep(7200) print('---------------名字----1') time.sleep(random.randint(4, 7)) driver.close() driver.quit()