def parse_user_ratings(self, response): for rating_element in response.css("ul.list-ratings li"): product_name_element = rating_element.css('div.left small') product_url = product_name_element.css( "a::attr(href)").extract_first() if (product_url): rating = items.ProductRating() ad_id = self.get_ad_id(product_url) rating['ads_id'] = ad_id yield self.make_request('ads', url=product_url, ads_id=ad_id) else: # No product URL found, still save it with the product name rating = items.UserRating() rating['username'] = response.meta['username'] rating['item_name'] = self.get_text(product_name_element) #rating['submitted_on'] = self.get_text(rating_element.css('.left date')) rating['submitted_on'] = self.to_utc( dateutil.parser.parse( self.get_text(rating_element.css('.left date')))) rating['rating'] = len(rating_element.css('.rating.stars i.full')) rating['comment'] = self.get_text( rating_element.css('div.right.formatted')) yield rating next_page_url = response.css( "section#main a.arrow-right::attr(href)").extract_first() if next_page_url: yield self.make_request('user_ratings', url=next_page_url, username=response.meta['username'])
def parse_user_ratings(self, response):
    for rating_element in response.css('section.main_items article'):
        rating = items.UserRating()
        rating['username'] = response.meta['username']
        rating['item_name'] = self.get_text(rating_element.css('h1 a'))
        rating['submitted_on'] = self.parse_timestr(
            self.get_text(rating_element.css('h4')))
        # Filled stars are rendered as the HTML entity &#9733; (★); count one per star.
        rating['rating'] = self.get_text(rating_element.css('h2')).count('★')
        rating['comment'] = self.get_text(rating_element.css('p'))
        yield rating
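# The helper below is not part of the original spiders; it is a hedged sketch of
# what the shared parse_timestr() utility called above might look like, assuming
# it only has to handle absolute timestamps such as "2019-03-01 14:22".
# Relative strings ("3 days ago") would need extra handling (see the
# parse_days_ago sketch further down).
import dateutil.parser

def parse_timestr(self, timestr):
    """Best-effort conversion of a scraped timestamp string to UTC (sketch)."""
    try:
        return self.to_utc(dateutil.parser.parse(timestr))
    except (TypeError, ValueError, OverflowError):
        return None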
def parse_user_ratings(self, response): for rating_row in response.css("table.ratingTable tr"): rating_cells = rating_row.css("td") if len(rating_cells) == 5: rating = items.UserRating() rating['username'] = response.meta['username'] rating['submitted_on'] = self.parse_days_ago(self.get_text(rating_cells[0])) rating['rating'] = len(rating_cells[1].css('img[alt="gold"]')) rating['comment'] = self.get_text(rating_cells[2]) rating['submitted_by'] = self.get_text(rating_cells[3]) rating['price'] = self.get_text(rating_cells[4]) yield rating
def parse_feedback(self, response):
    try:
        username = re.search(r'/feedback_list/([\w-]+)/', response.url).group(1)
        trs = response.css('table.m-0 tr')
        for tr in trs:
            # Build a fresh item per row so yielded ratings do not share state.
            user_rating = items.UserRating()
            user_rating['username'] = username
            user_rating['submitted_by'] = self.get_text(tr.css('td:nth-child(2)'))
            user_rating['submitted_on'] = self.parse_datetime(
                self.get_text(tr.css('td:nth-child(3)')))
            user_rating['comment'] = self.get_text(tr.css('td:last-child'))
            # Skip the placeholder row ("Pas de feedbacks pour le moment",
            # i.e. "no feedback yet").
            if user_rating['comment'] != 'Pas de feedbacks pour le moment':
                yield user_rating
    except Exception as error:
        self.logger.warning("Failed to yield user feedback at %s because '%s'"
                            % (response.url, error))
def parse_user_feedback(self, response):
    username = self.get_username_from_profile(response)
    for line in response.css('ul.nav li[role="presentation"].active').xpath(
            "./../../table/tbody/tr"):
        try:
            rating = items.UserRating()
            cells = line.css('td')
            expected_cols = 5
            if len(cells) != expected_cols:
                raise WarningException(
                    "Feedback table does not have %d columns as expected."
                    % expected_cols)
            # The first cell carries a label whose CSS class encodes the rating.
            if len(cells[0].css('.label-danger')) > 0:
                rating['rating'] = 'Negative'
            elif len(cells[0].css('.label-success')) > 0:
                rating['rating'] = 'Positive'
            elif len(cells[0].css('.label-default')) > 0:
                rating['rating'] = 'Neutral'
            else:
                raise WarningException('Unknown rating icon')
            rating['delivery_time'] = self.get_text(cells[2])
            rating['submitted_on'] = self.parse_timestr(self.get_text(cells[4]))
            rating['comment'] = self.get_text(cells[1].css("p:first-child"))
            rating['username'] = username
            m = re.match(r'([^\[]+)(\[\d+\])?', self.get_text(cells[3]))
            if m:
                rating['submitted_by'] = self.get_text(m.group(1))
            yield rating  # Will be flushed later if all requests are completed.
        except WarningException as e:
            self.logger.warning("Could not get listing feedback at %s. %s"
                                % (response.url, e))
        except:
            raise

    for url in response.css(".pages ul.pagination a::attr(href)").extract():
        if not url.endswith('page=1'):
            # Page 1 was already crawled, possibly under a URL without the page parameter.
            yield self.make_request('user_feedback', url=url, username=username)
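# Hedged sketch (assumption): WarningException, raised above for recoverable
# parsing problems (unexpected column count, unknown rating icon). It is most
# likely just a thin Exception subclass that the spider catches and logs as a
# warning instead of aborting the crawl.
class WarningException(Exception):
    """Non-fatal parsing problem; caught and logged by the caller (sketch)."""
    pass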
def parse_userfeedbacks(self, response):
    table = response.css('.main-content table')
    header_map = self.parse_table_header(table.css('thead'))
    for tr in response.css('.main-content table tbody tr'):
        try:
            rating = items.UserRating()
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(self.get_cell(tr, 'age', header_map)))
            rating['comment'] = self.get_text(
                self.get_cell(tr, 'feedback', header_map))
            rating['username'] = response.meta['username']
            rating['communication'] = self.get_score(
                self.get_cell(tr, 'communication', header_map))
            rating['speed'] = self.get_score(
                self.get_cell(tr, 'shippingspeed', header_map))
            rating['stealth'] = self.get_score(
                self.get_cell(tr, 'stealth', header_map))
            rating['quality'] = self.get_score(
                self.get_cell(tr, 'productquality', header_map))
            rating['payment_type'] = self.get_text(
                self.get_cell(tr, 'type', header_map))
            rating['item_name'] = self.get_text(
                self.get_cell(tr, 'item name', header_map))
            rating['submitter_level'] = self.get_text(
                self.get_cell(tr, 'level', header_map))
            yield rating
        except Exception as e:
            self.logger.error("Cannot parse user feedback. Error: %s" % e)

    for url in response.css('.pagination').xpath(
            './/a[not(contains(@href, "start=0"))]/@href').extract():
        yield self.make_request('userfeedback', url=url,
                                username=response.meta['username'])
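# Hedged sketch (assumption): parse_table_header() and get_cell(), used above to
# address feedback columns by header label instead of fixed position, so a
# reordered column on the market does not silently shift every field.
def parse_table_header(self, thead):
    """Map normalized header labels to their column index (sketch)."""
    header_map = {}
    for index, th in enumerate(thead.css('th')):
        header_map[self.get_text(th).lower().replace(' ', '')] = index
    return header_map

def get_cell(self, tr, name, header_map):
    """Return the <td> selector matching the given header label, or None (sketch)."""
    cells = tr.css('td')
    index = header_map.get(name.lower().replace(' ', ''))
    if index is not None and index < len(cells):
        return cells[index]
    return None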
def parse_vendor(self, response):
    vendor_profile = response.xpath(
        ".//section[@id='content1']//span[contains(text(),'Vendor')]/text()"
    ).extract_first()
    if vendor_profile and vendor_profile.strip() == "Vendor":
        # Yield vendor.
        try:
            vendor = items.User()
            vendor['username'] = response.xpath(
                ".//section[@id='content1']//span[@class='feedbackScore']/../text()"
            ).extract_first(default="").strip()
            vendor['relativeurl'] = self.get_relative_url(response.url)
            vendor['fullurl'] = response.url
            vendor['last_active'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Last Logged')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['last_active'] = self.parse_datetime(vendor['last_active'])
            vendor['public_pgp_key'] = self.get_text(response.xpath(
                ".//section[@id='content1']//div[@class='bubble']//div[@class='pgp_box']"
            ))
            if vendor['public_pgp_key'].endswith("BLOCK----"):
                self.logger.warning(
                    "PGP key is missing a last letter '-' so adding it. Page %s"
                    % response.url)
                vendor['public_pgp_key'] = vendor['public_pgp_key'] + "-"
            vendor['public_pgp_key'] = self.normalize_pgp_key(
                vendor['public_pgp_key'])
            vendor['join_date'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Member Since')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['join_date'] = self.parse_datetime(vendor['join_date'])
            vendor['feedback_received'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Feedback Score')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['ship_from'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Shipping From')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['ship_to'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(),'Shipping To')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['profile'] = self.get_text(response.xpath(
                ".//section[@id='content1']//div[@class='bubble']/p"))
            vendor['successful_transactions'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(), 'Sales')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            # New fields.
            vendor['response_time'] = response.xpath(
                ".//section[@id='content1']//label[contains(text(), 'Average Message Response Time')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            vendor['vacation_mode'] = self.get_text(response.xpath(
                ".//section[@id='content1']//div[@class='row nomargin']//div[@class='col-2']/span[contains(@style,'color')]"
            ))
            vacation_mode_normalized = re.search(r"([\w\s]+)",
                                                 vendor['vacation_mode'],
                                                 re.M | re.I)
            if vacation_mode_normalized:
                vendor['vacation_mode'] = vacation_mode_normalized.group(1).strip()
            yield vendor
        except Exception as error:
            self.logger.warning("Couldn't yield vendor from %s (Error: %s)"
                                % (response.url, error))

        # Yield ratings.
        feedbacks = response.xpath(
            ".//section[@id='content2']//div[@class='feedback']")
        if feedbacks:
            for feedback in feedbacks:
                try:
                    rating = items.UserRating()
                    rating['username'] = response.xpath(
                        ".//section[@id='content1']//span[@class='feedbackScore']/../text()"
                    ).extract_first(default="").strip()
                    if rating['username'] is None or len(rating['username']) < 2:
                        inspect_response(response, self)
                    ads_id = feedback.xpath(
                        "div[@class='feedback_header']/a/@href").extract_first()
                    if ads_id is not None:
                        rating['ads_id'] = self.get_url_param(ads_id, 'lid')
                    rating['submitted_by'] = feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/../text()"
                    ).extract_first(default="").strip()
                    rating['item_name'] = feedback.xpath(
                        "div[@class='feedback_header']/a/text()"
                    ).extract_first(default="").strip()
                    submitted_on_string = feedback.xpath(
                        "div[@class='feedback_header']/span/text()"
                    ).extract_first(default="").strip()
                    if 'Private Listing' in submitted_on_string:
                        submitted_on_string = feedback.xpath(
                            "div[@class='feedback_header']/span/span/span/text()"
                        ).extract_first()
                    rating['submitted_on_string'] = submitted_on_string
                    rating['submitted_on'] = self.parse_datetime(submitted_on_string)
                    rating['submitted_by_number_transactions'] = self.get_text(
                        feedback.xpath(
                            "div[@class='feedback_header']//span[@class='feedbackScore']/sub"))
                    rating['submitter_rating'] = self.get_text(
                        feedback.xpath(
                            "div[@class='feedback_header']//span[@class='feedbackScore']/sup"))
                    rating['comment'] = self.get_text(feedback.xpath("p"))
                    rating['price_usd'] = feedback.xpath(
                        "div[@class='feedback_subheader']/div/span/text()[contains(., 'USD')]"
                    ).extract_first()
                    rating['price_usd'] = rating['price_usd'].replace(
                        "~", "").replace("USD", "").replace(" ", "")
                    rating_star = feedback.xpath(
                        "div[@class='feedback_subheader']//div[contains(@style,'img/star.png')]/@style"
                    ).extract_first(default="")
                    rating_star = re.search(r"width:(\d+)px;height",
                                            rating_star, re.M | re.S)
                    if rating_star:
                        # The star bar is 120px wide when all 5 stars are filled.
                        rating_star = float(rating_star.group(1))
                        rating['rating'] = rating_star / 120 * 5
                    warning = feedback.xpath(
                        "div[@class='feedback_subheader']/div/span")
                    if warning and len(warning) > 1:
                        rating['warnings'] = self.get_text(warning[0])
                    yield rating
                except Exception as error:
                    self.logger.warning(
                        "Couldn't yield feedbacks from %s (Error: %s)"
                        % (response.url, error))
    else:
        self.logger.warning(
            "Encountered a buyer profile. Skipping page %s. This should NOT happen."
            % response.url)
def parse_user(self, response):
    try:
        last_active_span = response.css('.panel-heading .row div:nth-child(2) span')
        last_active = self.parse_datetime(
            re.search(r'Last seen: (.+)',
                      self.get_text(last_active_span)).group(1))
        user = items.User()
        user['username'] = self.get_text(response.css('.breadcrumb li.active'))
        user['relativeurl'] = self.get_relative_url(response.url)
        user['fullurl'] = response.url
        user['profile'] = self.get_text(response.css('#profile .col-md-9'))
        user['average_rating'] = self.get_text(response.css('center span')[0])
        user['last_active'] = last_active
        user['terms_and_conditions'] = self.get_text(response.css('#tac .col-md-9'))
        user['public_pgp_key'] = self.normalize_pgp_key(
            self.get_text(response.css('#pgp pre.well')))
        level_match = re.search(r'Level (\d+)',
                                self.get_text(response.css('.label-success')))
        if level_match:
            user['level'] = level_match.group(1)
        if 'FE' in response.xpath(
                ".//div/span[@class='label label-default']/text()").extract():
            user['fe_enabled'] = True
        else:
            user['fe_enabled'] = False
        dream_rating = response.xpath(
            ".//small[preceding-sibling::img[contains(@title, 'Dream')]]/text()")
        if dream_rating:
            dream_rating = dream_rating.extract_first()
            user['dreammarket_sales'] = re.search(r"([0-9]*),",
                                                  dream_rating).group(1)
            user['dreammarket_rating'] = re.search(r", ([0-9\.]*)",
                                                   dream_rating).group(1)
        yield user
    except Exception as error:
        self.logger.warning("Failed to yield user at %s because '%s'"
                            % (response.url, error))

    try:
        ratings = response.xpath(
            ".//div[@id='feedback']/div/div/div/table[1]/tbody/tr")
        if ratings and 'No available feedback' not in ratings.extract_first():
            for rating in ratings:
                tds = rating.css('td')
                user_rating = items.UserRating()
                user_rating['username'] = user['username']
                user_rating['submitted_by'] = self.get_text(tds[0])
                user_rating['rating'] = len(tds[1].css('i'))
                user_rating['comment'] = self.get_text(tds[2])
                user_rating['price_usd'] = re.search(
                    r'([\d\.]+)', self.get_text(tds[3])).group(1)
                user_rating['submitted_on'] = self.parse_datetime(
                    self.get_text(tds[4])).date()
                user_rating['submitted_on_string'] = self.get_text(tds[4])
                yield user_rating
    except Exception as error:
        self.logger.warning("Failed to yield user ratings at %s because '%s'"
                            % (response.url, error))
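# Hedged sketch (assumption): normalize_pgp_key(), used in the vendor and user
# parsers above to clean scraped PGP blocks. A minimal version could keep only
# the ASCII-armored block between the BEGIN and END markers and strip the
# surrounding HTML whitespace.
import re

def normalize_pgp_key(self, key):
    """Extract and tidy an ASCII-armored PGP public key block (sketch)."""
    if not key:
        return None
    match = re.search(
        r'-----BEGIN PGP PUBLIC KEY BLOCK-----.*?-----END PGP PUBLIC KEY BLOCK-----',
        key, re.S)
    return match.group(0).strip() if match else key.strip()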