Example #1
def getProductReview(url):
    L = []
    options = Options()
    options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0')

    driver = webdriver.Chrome(options=options,
                              executable_path='./chromedriver')
    driver.get(url)
    # Scroll part-way down so lazily loaded review content is rendered
    driver.execute_script(
        "window.scrollBy(0, (document.body.scrollHeight*0.6));")

    # Parse the fully rendered page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Each 'BVRRContentReview' block holds one review (Bazaarvoice-style markup)
    for review in soup.find_all('div', class_='BVRRContentReview'):
        author = review.find('span', class_='BVRRNickname').text
        date = review.find('span', class_='BVRRReviewDate').text
        title = review.find('span', class_='BVRRReviewTitle').text
        content = review.find('span', class_='BVRRReviewText').text
        score = review.find('span', class_='BVRRRatingNumber').text

        # print ("---")
        # print (author)
        # print (date)
        # print (title)
        # print (content)
        # print (score)
        comment = Struct.Comment(content, score, date, title)
        L.append(comment)

    driver.close()
    return L
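
All three variants on this page assume the same surrounding context: Selenium driving a local chromedriver, BeautifulSoup for parsing, and a project-specific Struct.Comment container. The sketch below reconstructs that context from the calls in the snippets; the Struct definition is an assumption (a namedtuple with the same field order), not the original module.

# Assumed context for the snippets on this page (a sketch, not the original module)
import time
from collections import namedtuple

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


class Struct:
    # Stand-in for the project-specific Struct.Comment; field order matches the calls above
    Comment = namedtuple('Comment', ['content', 'score', 'date', 'title'])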
Example #2
def getProductReview(searchUrl):
    L = []
    # Init driver

    # headless
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--window-size=1920,1080')

    # Accept SSL certificate errors (self-signed / insecure certs)
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    driver = webdriver.Chrome(executable_path='./chromedriver',
                              chrome_options=chrome_options,
                              desired_capabilities=capabilities)
    # This variant targets Sephora product pages (https://www.sephora.com)
    driver.get(searchUrl)
    time.sleep(0.5)  # Delays for 0.5 seconds.

    # Scrolling ten times (500px each) reaches the bottom of the page (~4284px tall)
    for i in range(10):
        driver.execute_script("window.scrollBy(0,500)")
        time.sleep(0.2)
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'), 'html.parser')

    # Generated CSS class names for the review title, body, and date
    # (brittle: these change whenever the site's front end is rebuilt)
    reviewTitleClass = 'css-1fsuw0x'
    reviewContentClass = 'css-eq4i08'
    reviewDateClass = 'css-12z5fyi'

    reviewTitles = soup.findAll("div", {"class": reviewTitleClass})
    reviewContents = soup.findAll("div", {"class": reviewContentClass})
    reviewDates = soup.findAll("span", {"class": reviewDateClass})
    reviewRatings = getReviewRatings(soup)

    for i in range(len(reviewTitles)):
        title = reviewTitles[i].contents
        content = reviewContents[i].contents
        date = reviewDates[i].contents
        rating = reviewRatings[i]

        # Wrap the extracted fields in the project's Comment structure
        comment = Struct.Comment(content[0], rating, date[0], title[0])
        L.append(comment)
    driver.close()
    return L
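
Example #2 also calls a getReviewRatings helper that is not part of this listing. Its real selector is unknown; the sketch below is a hypothetical stand-in that parses star ratings from aria-label attributes, a common pattern for star widgets but not confirmed for Sephora's current markup.

def getReviewRatings(soup):
    # Hypothetical helper: the original implementation is not shown.
    # Many star widgets expose their value in an aria-label like "4 out of 5 stars".
    ratings = []
    for el in soup.findAll(attrs={'aria-label': True}):
        label = el['aria-label']
        if 'out of 5 stars' in label:
            ratings.append(label.split(' ')[0])
    return ratings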
Example #3
def getProductReview(searchUrl):
    L = []
    # headless
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--window-size=1920,1080')

    driver = webdriver.Chrome(executable_path='./chromedriver',
                              chrome_options=chrome_options)
    # This variant targets Ulta product pages (https://www.ulta.com)
    driver.get(searchUrl)
    time.sleep(0.5)  # Delays for 0.5 seconds.

    # Scrolling five times (1000px each) reaches the bottom of the page (~4284px tall)
    for i in range(5):
        driver.execute_script("window.scrollBy(0,1000)")
        time.sleep(0.3)
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'), 'html.parser')

    # Each 'pr-review' article holds one review (PowerReviews-style markup)
    reviews = soup.findAll('article', {'class': 'pr-review'})

    # title
    reviewTitles = []
    for review in reviews:
        temp = review.findAll('h2', {'class': 'pr-rd-review-headline'})
        reviewTitle = temp[0].contents[0]
        reviewTitles.append(reviewTitle)

    # content
    reviewContents = []
    for review in reviews:
        temp = review.findAll('p', {'class': 'pr-rd-description-text'})
        reviewContent = temp[0].contents[0]
        reviewContents.append(reviewContent)

    # date
    reviewDates = []
    for review in reviews:
        temp = review.findAll('time', {})
        reviewDate = temp[0].contents[0]
        reviewDates.append(reviewDate.strip())

    # rating
    reviewRatings = []
    for review in reviews:
        temp = review.findAll('div', {'class': 'pr-snippet-rating-decimal'})
        reviewRating = temp[0].contents[0]
        reviewRatings.append(reviewRating)

    for i in range(len(reviews)):
        # Wrap the extracted fields in the project's Comment structure
        comment = Struct.Comment(reviewContents[i], reviewRatings[i],
                                 reviewDates[i], reviewTitles[i])
        L.append(comment)
    driver.close()
    return L
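
All three variants are called the same way. A minimal usage sketch follows, with a placeholder URL and assuming Struct.Comment exposes its fields by name as in the namedtuple sketch after Example #1.

# Usage sketch: fetch the visible reviews for one product page and print them
if __name__ == '__main__':
    comments = getProductReview('https://www.ulta.com/p/example-product')  # placeholder URL
    for c in comments:
        print(c.title, c.score, c.date)
        print(c.content)
        print('---')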