def getProductReview(url):
    """Load *url* in headless Chrome and collect the reviews found on the page.

    Every ``div`` with class ``BVRRContentReview`` in the rendered page is
    converted into a ``Struct.Comment(content, score, date, title)``.

    Args:
        url: Address of the product page to open.

    Returns:
        A list of ``Struct.Comment`` objects in document order; empty when the
        page contains no matching review elements.

    Raises:
        AttributeError: If a review element lacks one of the expected
            ``span`` children (date, title, text or rating).
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('user-agent=Mozilla/5.0')
    driver = webdriver.Chrome(options=options, executable_path='./chromedriver')
    try:
        driver.get(url)
        # Scroll 60% of the way down before reading the DOM; presumably the
        # review section is only rendered once it nears the viewport.
        driver.execute_script(
            "window.scrollBy(0, (document.body.scrollHeight*0.6));")
        page_source = driver.page_source
    finally:
        # quit() ends the whole browser session even when loading failed, so
        # no browser/driver process is left behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    comments = []
    for review in soup.find_all('div', class_='BVRRContentReview'):
        # The reviewer nickname is not part of Struct.Comment, so it is not
        # looked up; a review without a nickname is still collected.
        date = review.find('span', class_='BVRRReviewDate').text
        title = review.find('span', class_='BVRRReviewTitle').text
        content = review.find('span', class_='BVRRReviewText').text
        score = review.find('span', class_='BVRRRatingNumber').text
        comments.append(Struct.Comment(content, score, date, title))
    return comments
def getProductReview(searchUrl):
    """Load *searchUrl* in headless Chrome and collect the reviews on the page.

    Titles, bodies and dates are located by their CSS class names; ratings come
    from ``getReviewRatings(soup)``. The four sequences are combined
    positionally into ``Struct.Comment(content, rating, date, title)`` objects.

    Args:
        searchUrl: Address of the product page to open.

    Returns:
        A list of ``Struct.Comment`` objects in document order. If the four
        sequences differ in length, pairing stops at the shortest one.

    Raises:
        IndexError: If a matched title, body or date element has no children.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--window-size=1920,1080')

    # Accept invalid / self-signed certificates.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    driver = webdriver.Chrome(executable_path='./chromedriver',
                              chrome_options=chrome_options,
                              desired_capabilities=capabilities)
    try:
        driver.get(searchUrl)
        time.sleep(0.5)  # brief pause after navigation before scrolling
        # Ten 500px steps (5000px in total); the original author measured the
        # page at about 4284px, so this reaches the bottom.
        for _ in range(10):
            driver.execute_script("window.scrollBy(0,500)")
            time.sleep(0.2)
        html = driver.page_source
    finally:
        # Always end the browser session, including when navigation or
        # scrolling raised, so no browser/driver process is leaked.
        driver.quit()

    soup = BeautifulSoup(html.encode('utf-8'), 'html.parser')

    review_titles = soup.findAll("div", {"class": 'css-1fsuw0x'})
    review_contents = soup.findAll("div", {"class": 'css-eq4i08'})
    review_dates = soup.findAll("span", {"class": 'css-12z5fyi'})
    review_ratings = getReviewRatings(soup)

    comments = []
    for title, content, date, rating in zip(
            review_titles, review_contents, review_dates, review_ratings):
        # Only the first child node of each element is kept.
        comments.append(Struct.Comment(
            content.contents[0], rating, date.contents[0], title.contents[0]))
    return comments
def getProductReview(searchUrl):
    """Load *searchUrl* in headless Chrome and collect the reviews on the page.

    Every ``article`` with class ``pr-review`` in the rendered page is
    converted into a ``Struct.Comment(content, rating, date, title)``.

    Args:
        searchUrl: Address of the product page to open.

    Returns:
        A list of ``Struct.Comment`` objects in document order; empty when the
        page contains no matching review elements.

    Raises:
        IndexError: If a review lacks the headline, description, ``time`` or
            rating element, or if that element has no children.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--window-size=1920,1080')
    driver = webdriver.Chrome(executable_path='./chromedriver',
                              chrome_options=chrome_options)
    try:
        driver.get(searchUrl)
        time.sleep(0.5)  # brief pause after navigation before scrolling
        # Five 1000px steps (5000px in total); the original author measured
        # the page at about 4284px, so this reaches the bottom.
        for _ in range(5):
            driver.execute_script("window.scrollBy(0,1000)")
            time.sleep(0.3)
        html = driver.page_source
    finally:
        # The driver was previously never released; always end the session so
        # each call does not leave a browser/driver process running.
        driver.quit()

    soup = BeautifulSoup(html.encode('utf-8'), 'html.parser')

    def first_child(review, tag, attrs):
        # First child node of the first matching descendant of *review*.
        return review.findAll(tag, attrs)[0].contents[0]

    comments = []
    for review in soup.findAll('article', {'class': 'pr-review'}):
        title = first_child(review, 'h2', {'class': 'pr-rd-review-headline'})
        content = first_child(review, 'p', {'class': 'pr-rd-description-text'})
        # Only the date is stripped of surrounding whitespace.
        date = first_child(review, 'time', {}).strip()
        rating = first_child(
            review, 'div', {'class': 'pr-snippet-rating-decimal'})
        comments.append(Struct.Comment(content, rating, date, title))
    return comments