def parse_author(self):
    """Return the review author's name with surrounding whitespace stripped.

    Returns:
        str | None: the author text, or None when the selector finds no tag
        or the tag has no usable text.
    """
    try:
        return soup_utils.find_tag(
            self.item_soup, ReviewItem.REVIEW_AUTHOR_SELECTOR).text.strip()
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_helpful(self):
    """Return the review's helpful-vote count parsed from its label text.

    Returns:
        str | int: the vote text with the "people found this helpful." suffix
        removed (a string), or the int 0 when the tag is missing.
        NOTE(review): the success/failure return types differ (str vs int) —
        callers apparently tolerate this; preserved for compatibility.
    """
    try:
        vote_text = soup_utils.find_tag(
            self.item_soup, ReviewItem.REVIEW_VOTES_SELECTOR).text
        return vote_text.replace("people found this helpful.", "").strip()
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return 0
def parse_description(self):
    """Return the product description as an HTML fragment string.

    Uses BeautifulSoup's ``decode_contents(formatter="html")`` so inner
    markup is kept rather than flattened to plain text.

    Returns:
        str | None: the inner HTML, stripped, or None when the selector
        matches nothing.
    """
    try:
        return soup_utils.find_tag(
            self.soup, self.DESCRIPTION_SELECTOR).decode_contents(
            formatter="html").strip()
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_feature_list(self):
    """Return the product's feature-bullet section as an HTML fragment.

    Returns:
        str | None: the inner HTML of the feature-bullets tag, stripped,
        or None when the selector matches nothing.
    """
    try:
        return soup_utils.find_tag(
            self.soup, self.FEATURE_BULLETS_SELECTOR).decode_contents(
            formatter="html").strip()
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_content(self):
    """Return the review body as an HTML fragment string.

    Returns:
        str | None: the inner HTML of the review-content tag, stripped,
        or None when the selector matches nothing.
    """
    try:
        return soup_utils.find_tag(
            self.item_soup, ReviewItem.REVIEW_CONTENT_SELECTOR).decode_contents(
            formatter="html").strip()
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_date(self):
    """Return the review's date as a naive ``datetime``.

    The raw text looks like "... on <Month> <day>, <year>"; everything after
    the last occurrence of "on" is taken as the date string and parsed with
    the US-English '%B %d, %Y' format.
    NOTE(review): assumes an English-locale page; non-English month names
    will fail strptime and yield None — confirm against supported locales.

    Returns:
        datetime.datetime | None: the parsed date, or None when the tag is
        missing or the text does not match the expected format.
    """
    try:
        date_text = soup_utils.find_tag(
            self.item_soup, ReviewItem.REVIEW_DATE_SELECTOR).text
        date_text = date_text.split('on')[-1].strip()
        return datetime.datetime.strptime(date_text, '%B %d, %Y')
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_breadcrumbs(self):
    """Return the category breadcrumb trail as a single " > "-joined string.

    The page renders categories separated by the '›' character; each segment
    is stripped and re-joined with " > ".

    Returns:
        str | None: e.g. "Electronics > Audio > Headphones", or None when
        the breadcrumbs tag is missing.
    """
    try:
        categories = soup_utils.find_tag(
            self.soup, self.BREADCRUMBS_SELECTOR).text.strip()
        categories = [c.strip() for c in categories.split('›')]
        return " > ".join(categories)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_price(self):
    """Return the product price as a float rounded to 2 decimal places.

    Strips currency symbols and other non-numeric noise, then removes
    thousands-separator commas before converting.
    BUG FIX: previously commas were kept, so float("1,299.99") raised
    ValueError and any price >= 1,000 was silently reported as None.
    NOTE(review): assumes US-style formatting (comma = thousands separator,
    dot = decimal point) — confirm for non-US marketplaces.

    Returns:
        float | None: the rounded price, or None when the tag is missing or
        the text cannot be parsed as a number.
    """
    try:
        price_text = soup_utils.find_tag(self.soup, self.PRICE_SELECTOR).text.strip()
        price_text = re.sub(r'[^0-9.,\-]', '', price_text).replace(',', '')
        return round(float(price_text), 2)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse(self, response):
    """Extract product ASINs from a listing page, save each, then follow pagination.

    Args:
        response: raw HTML of a search/listing page.
    """
    page_soup = BeautifulSoup(response, HTML_PARSER)
    for result_tag in soup_utils.find_tags(page_soup, ITEM_LINK_SELECTOR):
        anchor = soup_utils.find_tag(result_tag, 'a')
        self.save_asin(extract_asin_from_url(anchor['href']))
    self.process_next_page(page_soup)
def parse_rating(self):
    """Return the review's star rating as an int.

    Parses text of the form "<x> out of 5 stars".
    NOTE(review): ``int(float(...))`` truncates, so a 4.5-star text yields 4.
    Callers may rely on whole-star values, so truncation is preserved.

    Returns:
        int: the star count, or 0 when the tag is missing or unparseable.
    """
    try:
        stars_text = soup_utils.find_tag(
            self.item_soup, ReviewItem.REVIEW_RATING_SELECTOR).text.replace(
            "out of 5 stars", "").strip()
        return int(float(stars_text))
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return 0
def process_next_page(self, soup):
    """Follow the "next page" link of a review listing, bounded by ``self.max_pages``.

    Increments ``self.page_processed`` and stops once the page budget is
    exhausted or no next-page link exists.
    """
    self.page_processed += 1
    if self.page_processed > self.max_pages:
        return
    link_tag = soup_utils.find_tag(soup, NEXT_PAGE_SELECTOR)
    if link_tag is None:
        return
    base_url = get_review_url(self.asin, self.country)
    self.process(soup_utils.format_url(link_tag['href'], base_url))
def process_next_page(self, soup):
    """Follow the "next page" link, if present, resolved against ``self.url``."""
    link_tag = soup_utils.find_tag(soup, NEXT_PAGE_SELECTOR)
    if link_tag is None:
        return
    absolute_url = soup_utils.format_url(link_tag['href'], self.url)
    self.process(absolute_url)
def parse_title(self):
    """Return the review title with surrounding whitespace stripped.

    Returns:
        str | None: the title text, or None when the selector finds no tag.
    """
    try:
        return soup_utils.find_tag(
            self.item_soup, ReviewItem.REVIEW_TITLE_SELECTOR).text.strip()
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return None
def parse_brand(self):
    """Return the product brand text, stripped of surrounding whitespace.

    NOTE: unlike the ``try``-guarded parsers, this propagates any lookup
    failure to the caller (original behavior preserved).
    """
    brand_tag = soup_utils.find_tag(self.soup, self.BRAND_SELECTOR)
    return brand_tag.text.strip()
def parse_name(self):
    """Return the product title text, stripped of surrounding whitespace.

    NOTE: unlike the ``try``-guarded parsers, this propagates any lookup
    failure to the caller (original behavior preserved).
    """
    title_tag = soup_utils.find_tag(self.soup, self.TITLE_SELECTOR)
    return title_tag.text.strip()