def parse_user_ratings(self, response):
    """Parse the ratings shown on a user's profile page.

    Yields a request to the ad page when the rating links to a product
    (the rating itself is scraped there), a UserRating item otherwise,
    and a follow-up request when a next page of ratings exists.
    """
    for rating_element in response.css("ul.list-ratings li"):
        product_name_element = rating_element.css('div.left small')
        product_url = product_name_element.css(
            "a::attr(href)").extract_first()
        if product_url:
            # The product rating is collected from the ads page itself,
            # so only the request is yielded here.  (A ProductRating was
            # previously built here but never yielded — dead code removed.)
            ad_id = self.get_ad_id(product_url)
            yield self.make_request('ads', url=product_url, ads_id=ad_id)
        else:
            # No product URL found, still save it with the product name.
            rating = items.UserRating()
            rating['username'] = response.meta['username']
            rating['item_name'] = self.get_text(product_name_element)
            rating['submitted_on'] = self.to_utc(
                dateutil.parser.parse(
                    self.get_text(rating_element.css('.left date'))))
            # One full star icon per rating point.
            rating['rating'] = len(rating_element.css('.rating.stars i.full'))
            rating['comment'] = self.get_text(
                rating_element.css('div.right.formatted'))
            yield rating
    # Follow pagination if present.
    next_page_url = response.css(
        "section#main a.arrow-right::attr(href)").extract_first()
    if next_page_url:
        yield self.make_request('user_ratings', url=next_page_url,
                                username=response.meta['username'])
def parse_ads_ratings(self, response):
    """Yield one ProductRating per entry in an ad page's ratings list."""
    for entry in response.css("ul.list-ratings li"):
        item = items.ProductRating()
        item['ads_id'] = response.meta['ads_id']
        # Timestamp is shown as free text; normalize it to UTC.
        submitted_text = self.get_text(entry.css('.left date'))
        item['submitted_on'] = self.to_utc(
            dateutil.parser.parse(submitted_text))
        # One full star icon per rating point.
        item['rating'] = len(entry.css('.rating.stars i.full'))
        item['comment'] = self.get_text(entry.css('div.right.formatted'))
        yield item
def parse_ads_ratings(self, response):
    """Parse product ratings rendered as articles (alternate layout)."""
    for rating_element in response.css('section.main_items article'):
        rating = items.ProductRating()
        rating['ads_id'] = response.meta['ads_id']
        header = self.get_text(rating_element.css('h1'))
        # header contains a bunch of info, formatted like this :
        # ★☆☆☆☆ 4 hours, 33 minutes ago: 2017-12-10
        header_parts = header.split(' ')
        last_part = header_parts[-1]
        # Raw string fixes the invalid-escape DeprecationWarning for \d.
        if re.match(r'\d{4}-\d{2}-\d{2}', last_part):
            rating['submitted_on'] = last_part
        first_part = header_parts[0]
        rating['rating'] = first_part.count(
            '★')  # ★ is the html entity for ★
        rating['comment'] = self.get_text(rating_element.css('p'))
        yield rating
def parse_listing_feedback(self, response):
    """Parse the feedback table of a listing and follow its pagination.

    Rows that cannot be parsed raise WarningException and are logged
    and skipped; any other exception propagates.
    """
    # Raw string fixes the invalid-escape DeprecationWarning for \d.
    m = re.search(r'listing\/(\d+)', response.url)
    if not m:
        raise Exception('Cannot find listing ID')
    listing_id = m.group(1)
    # Rows of the table belonging to the currently active nav tab.
    for line in response.css(
            'ul.nav li[role="presentation"].active').xpath(
            "./../../table/tbody/tr"):
        try:
            rating = items.ProductRating()
            cells = line.css('td')
            expected_cols = 5
            if len(cells) != expected_cols:
                raise WarningException(
                    "Feedback tables does not have %d columns as expected."
                    % expected_cols)
            # The rating is conveyed by a bootstrap label class.
            if len(cells[0].css('.label-danger')) > 0:
                rating['rating'] = 'Negative'
            elif len(cells[0].css('.label-success')) > 0:
                rating['rating'] = 'Positive'
            elif len(cells[0].css('.label-default')) > 0:
                rating['rating'] = 'Neutral'
            else:
                raise WarningException('Unknown rating icon')
            rating['delivery_time'] = self.get_text(cells[2])
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(cells[4]))
            rating['comment'] = self.get_text(cells[1].css("p:first-child"))
            rating['ads_id'] = listing_id
            # Username may carry a "[n]" transaction count suffix; keep
            # only the name part.
            m = re.match(r'([^\[]+)(\[\d+\])?', self.get_text(cells[3]))
            if m:
                rating['submitted_by'] = self.get_text(m.group(1))
            yield rating
        except WarningException as e:
            self.logger.warning(
                "Could not get listing feedback at %s. %s"
                % (response.url, e))
        # Any other exception propagates (the former bare
        # "except: raise" was a no-op and has been removed).
    for url in response.css(
            ".pages ul.pagination a::attr(href)").extract():
        if not url.endswith('page=1'):
            # We already saw that page, but maybe with a different URL (no page parameter)
            yield self.make_request('listing_feedback', url=url,
                                    listing_id=listing_id)
def parse_product_rating(self, response):
    """Extract product ratings from a listing's feedback table."""
    try:
        # The ad id is the second path segment of the feedback URL.
        ads_id = re.search(r'/listings/[\w-]+/([\w-]+)/feedback',
                           response.url).group(1)
        for row in response.css('div.product-details table.table tbody tr'):
            cells = row.css('td')
            product_rating = items.ProductRating()
            product_rating['submitted_by'] = self.get_text(cells[0])
            # One icon per rating point.
            product_rating['rating'] = len(cells[1].css('i'))
            product_rating['comment'] = self.get_text(cells[2])
            date_text = self.get_text(cells[3])
            product_rating['submitted_on'] = self.parse_datetime(
                date_text).date()
            product_rating['submitted_on_string'] = date_text
            product_rating['ads_id'] = ads_id
            yield product_rating
    except Exception as error:
        self.logger.warning(
            "Failed to yield product rating at %s because '%s'" %
            (response.url, error))
def parse_product_rating(self, response):
    """Extract product ratings (table following the nav tabs),
    including the price paid and the submitter's transaction count."""
    try:
        # Raw strings fix invalid-escape DeprecationWarnings for \w / \d.
        ads_id = re.search(r'c=listings&a=product&code=([\w]+)&tab=3$',
                           response.url).group(1)
        ratings = response.css('ul.nav-tabs').xpath(
            'following-sibling::table').css('tbody tr')
        for rating in ratings:
            tds = rating.css('td')
            product_rating = items.ProductRating()
            product_rating['ads_id'] = ads_id
            product_rating['comment'] = self.get_text(tds[1])
            product_rating['submitted_by'] = rating.css(
                'td:nth-child(3)::text').extract_first().replace(' ', '')
            product_rating['submitted_on'] = self.parse_datetime(
                self.get_text(tds[3]))
            product_rating['submitted_on_string'] = self.get_text(tds[3])
            # Transaction count is shown as "[n]"; strip the brackets.
            prev_transactions = rating.xpath(
                ".//small/text()").extract_first()
            prev_transactions = prev_transactions.replace('[', '')
            prev_transactions = prev_transactions.replace(']', '')
            product_rating[
                'submitted_by_number_transactions'] = prev_transactions
            # Price cell looks like "12.34 USD" or "0.5 XMR".
            price_match = re.search(r'([\d\.]+) ([\w]+)',
                                    self.get_text(tds[4]))
            if price_match:
                price = price_match.group(1)
                currency = price_match.group(2).lower()
                if currency == 'usd':
                    product_rating['price_usd'] = price
                elif currency == 'xmr':
                    product_rating['price_xmr'] = price
            yield product_rating
    except Exception as error:
        self.logger.warning(
            "Failed to yield product rating at %s because '%s'" %
            (response.url, error))
def parse_listing(self, response):
    """Parse a listing page: yields an Ads item, an AdsImage item and
    one ProductRating per entry of the feedback tab (#content2)."""
    title = response.xpath(
        ".//section[@id='content1']//div[@class='listing_right']/span/text()"
    ).extract_first(default="").strip()
    username = response.xpath(
        ".//section[@id='content1']//div[@class='listing_right']//a[@class='greenlink']/text()"
    ).extract_first(default="").strip()
    if title == "" and username == "":
        self.logger.warning("Found what is likely an empty page at %s." %
                            response.url)
    else:
        # Try to yield ads.
        try:
            ads_item = items.Ads()
            ads_item['title'] = title
            ads_item['vendor_username'] = username
            ads_item['relativeurl'] = self.get_relative_url(response.url)
            ads_item['fullurl'] = response.url
            # Listing id parameter differs between URL variants.
            if 'clid' in response.url:
                ads_item['offer_id'] = self.get_url_param(
                    response.url, 'clid')
            else:
                ads_item['offer_id'] = self.get_url_param(
                    response.url, 'lid')
            ads_item['category'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']/br/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['ships_from'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping From:')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['ships_to'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping To:')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['description'] = self.get_text(
                response.xpath(".//section[@id='content1']/p"))
            ads_item['escrow'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/span"
                ))
            # Multisig is signalled by the presence of an icon.
            ads_item['multisig'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/img[@alt='Multisig']"
            )
            ads_item['multisig'] = True if ads_item['multisig'] else False
            ads_item['stock'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[not(@style='float:right')]/span"
                ))
            ads_item['shipping_options'] = self.get_shipping_options(
                response)
            ads_item['accepted_currencies'] = self.get_accepted_currencies(
                response)
            prices_text = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/p"
                ))
            price_usd = re.search(r"\$\s*([\d\.]+)", prices_text,
                                  re.M | re.I)
            price_btc = re.search(r"([\d\.]+)\s*฿", prices_text,
                                  re.M | re.I)
            price_xmr = re.search(r"([\d\.]+)\s*XMR", prices_text,
                                  re.M | re.I)
            if price_usd:
                ads_item["price_usd"] = price_usd.group(1)
            else:
                self.logger.warning("No price_usd found on %s" %
                                    response.url)
            if price_xmr:
                ads_item["price_xmr"] = price_xmr.group(1)
            if price_btc:
                ads_item["price_btc"] = price_btc.group(1)
            yield ads_item
        except Exception as error:
            self.logger.warning("Couldn't yield ad from %s (Error: %s)" %
                                (response.url, error))
        # Try to yield images.
        try:
            image_urls = response.xpath(
                ".//section[@id='content1']//div[@class='listing_image']/img/@src"
            ).extract()
            if len(image_urls) > 0:
                img_item = items.AdsImage(image_urls=[])
                for img_url in image_urls:
                    img_item['image_urls'].append(
                        self.make_request(reqtype='image', url=img_url))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
        except Exception as error:
            self.logger.warning(
                "Couldn't yield ad images from %s (Error: %s)" %
                (response.url, error))
        # Yield product ratings.
        # Note, that the price is also available in ads.
        feedbacks = response.xpath(
            ".//section[@id='content2']//div[@class='feedback']")
        if feedbacks:
            for feedback in feedbacks:
                rating = items.ProductRating()
                rating["ads_id"] = ads_item["offer_id"]
                rating["submitted_on_string"] = feedback.xpath(
                    "div[@class='feedback_header']/span/text()").extract_first(
                        default="").strip()
                rating["submitted_on"] = self.parse_datetime(
                    rating["submitted_on_string"])
                rating['price_usd'] = feedback.xpath(
                    "div[@class='feedback_subheader']/div/span/text()[contains(., 'USD')]"
                ).extract_first()
                # BUG FIX: extract_first() returns None when no USD price
                # is shown; calling .replace() on None raised an uncaught
                # AttributeError (this loop is outside the try blocks).
                if rating['price_usd'] is not None:
                    rating['price_usd'] = rating['price_usd'].replace(
                        "~", "").replace("USD", "").replace(" ", "")
                rating_star = feedback.xpath(
                    "div[@class='feedback_subheader']//div[contains(@style,'img/star.png')]/@style"
                ).extract_first(default="")
                rating_star = re.search(r"width:(\d+)px;height",
                                        rating_star, re.M | re.S)
                if rating_star:
                    # The star bar is 120px wide for a full 5-star score.
                    rating_star = float(rating_star.group(1))
                    rating['rating'] = rating_star / 120 * 5
                warning = feedback.xpath(
                    "div[@class='feedback_subheader']/div/span")
                if warning and len(warning) > 1:
                    rating['warnings'] = self.get_text(warning[0])
                rating["comment"] = self.get_text(feedback.xpath("p"))
                rating["submitted_by"] = feedback.xpath(
                    "div[@class='feedback_header']//span[@class='feedbackScore']/../text()"
                ).extract_first(default="").strip()
                rating["submitter_rating"] = self.get_text(
                    feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/sup"
                    ))
                rating["submitted_by_number_transactions"] = self.get_text(
                    feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/sub"
                    ))
                yield rating
else: self.logger.warning('Unknown listing status %s' % response.url) ## ===================== IMAGES ===================== images_url = response.css('img.productImage::attr(src)').extract() for url in images_url: img_item = items.AdsImage(image_urls=[]) img_item['image_urls'].append(self.make_request('image', url=url)) img_item['ads_id'] = ads_item['offer_id'] yield img_item ## ===================== Product Ratings (feedback) ========= rating_lines = response.css('.ratings table tr') for tr in rating_lines: try: rating_item = items.ProductRating() age = self.get_text(tr.css('td.age')) m = re.search('(\d+)d', age) if m: days_offset = m.group(1) # A sanity check. Dream has some dates which are in 1969 and 1970.. submitted_on = (datetime.utcnow() - timedelta(days=int(days_offset))).date() if submitted_on < date(2011, 1, 1): submitted_on = '' self.logger.warning( "Encountered a date outside the acceptable range. See URL: %s" % response.url) else: rating_item['submitted_on'] = submitted_on
def parse_listing(self, response):
    """Parse a listing page: yields the Ads item, its image, the
    feedback ratings, requests for extra feedback pages, and a vendor
    profile request when a vendor is associated with the item."""
    ads = items.Ads()
    ads_img = items.AdsImage()
    listing_content = response.css("#content1")  # Tabs
    feedback_content = response.css("#content2")  # Tabs
    ads['title'] = self.get_text_first(response.css('.listing_right span'))
    try:
        ads['offer_id'] = self.get_url_param(response.url, 'lid')
    except:
        self.logger.warning(
            "Ran into a URL parameter issue at URL: %s. Offer_ID is not recorded."
            % (response.url))
        # BUG FIX: the handler previously retried the exact same failing
        # get_url_param() call, re-raising the exception it had just
        # caught.  Record an empty id instead (mirrors the vendor case).
        ads['offer_id'] = ''
    ads['relativeurl'] = response.meta['relativeurl']
    ads['fullurl'] = self.make_url(ads['relativeurl'])
    user_url = response.css('.listing_right').xpath(
        './/a[contains(@href, "page=profile")]/@href').extract_first()
    # Some items don't have an associated vendor.
    try:
        ads['vendor_username'] = self.get_url_param(user_url, 'user')
    except:
        self.logger.warning(
            'No seller available at URL: %s. Seller is noted as \'\'. Inspect the URL post-crawl.'
            % (response.url))
        ads['vendor_username'] = ''
    ads['category'] = response.meta['category']
    multilisting_select = listing_content.css(
        'select[name="multilistingChild"]'
    )  # 2 types of Ads. Multilisting or not.
    if not multilisting_select:
        ads['multilisting'] = False
        listing_right_p = self.get_text(
            listing_content.css(".listing_right p"))
        m = re.search(
            '\((\d+(\.\d+)?)\s*\xe0\xb8\xbf\)', listing_right_p
        )  # Search for bitcoin icon \xe0\b8\xbf is unicode char for bitcoin encoded in UTF8
        m2 = re.search('([0-9.]{1,10}) \xe0\xb8\xbf', listing_right_p)
        if m:
            ads['price'] = m.group(1)
        # minor error handling in case the previous regex doesn't catch
        # bitcoin prices.
        elif m is None and m2 is not None:
            ads['price'] = m2.group(1)
    else:
        ads['multilisting'] = True
        options = []
        for option in multilisting_select.xpath('.//option[@value!=""]'):
            options.append(self.get_text(option))
        ads['price'] = json.dumps(options)
    # Bunches of regex to parse the page.
    listing_right_html = self.get_text(
        listing_content.css('.listing_right').extract_first()
    )  # Read HTML. We need tags as separator.
    listing_right_span_text = self.get_text(
        listing_content.css('.listing_right span'))
    m = re.search('<b>shipping from\s*:\s*</b>\s*([^<]+)',
                  listing_right_html, re.IGNORECASE)
    if m:
        ads['ships_from'] = m.group(1)
    m = re.search('<b>shipping to\s*:\s*</b>\s*([^<]+)',
                  listing_right_html, re.IGNORECASE)
    if m:
        ads['ships_to'] = m.group(1)
    shipping_options = []
    for option in listing_content.css(
            '.listing_right form select[name="shipment"] option[value!=""]::text'
    ).extract():
        shipping_options.append(self.get_text(option))
    ads['shipping_options'] = json.dumps(shipping_options)
    ads['description'] = self.get_text(listing_content.xpath('./p'))
    stocks_possibilities = [
        'Excellent stock', 'Good stock', 'Low stock', 'Very low stock'
    ]
    for possibility in stocks_possibilities:
        if possibility in listing_right_span_text:
            ads['stock'] = possibility
            break
    yield ads
    # Ads Image.
    ads_img['ads_id'] = ads['offer_id']
    ads_img['image_urls'] = [
        self.make_request(
            'image',
            url=listing_content.css(
                ".listing_image img::attr(src)").extract_first(),
            referer=response.url)
    ]
    yield ads_img
    # Handling listing feedbacks
    for feedback in feedback_content.css(".feedback"):
        try:
            rating = items.ProductRating()
            rating['ads_id'] = ads['offer_id']
            rating['comment'] = self.get_text(feedback.css('p'))
            try:
                username = feedback.css('.feedback_header span a').xpath(
                    "./text()")[0].extract().strip()
            except:
                username = ''
                self.logger.warning(
                    'Found a review with no username. URL: %s' %
                    response.url)
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(
                    feedback.css('.feedback_header').xpath(
                        'span/text()').extract_first()))
            rating['submitted_by'] = username
            star_styles = feedback.css('.feedback_subheader').xpath(
                './div/div')[0].extract()
            m = re.search(r'width:(\d+)px', star_styles)
            if m:
                width = int(m.group(1))
                rating['rating'] = '%d/5' % (width // 24
                                             )  # One star is 24 px wide
            else:
                self.logger.warning('Cannot find product rating score.')
            yield rating
        except Exception as e:
            self.logger.warning(
                'Could not get listing feedback at %s. Error %s' %
                (response.url, e))
    # If there is several pages of feedback. feedback_buffer_middleware
    # will buffer them until we have them all and then sends them further
    # in pipeline.
    for url in feedback_content.css(
            'div.pagination a::attr(href)').extract():
        if self.get_url_param(url, 'pg') != '1':
            yield self.make_request(
                'listing',
                url=url,
                relativeurl=response.meta['relativeurl'],
                ads_id=ads['offer_id'],
                category=response.meta['category'])
    # Avoid requesting vendor pages when there is no vendor associated
    # with an item.  BUG FIX: was "is not ''" (identity comparison with a
    # literal — implementation-defined and a SyntaxWarning on py3.8+).
    if ads['vendor_username'] != '':
        yield self.make_request('userprofile', url=user_url)
def parse_offer(self, response):
    """Parse an offer page (or its /refund tab) into an Ads item, the
    ad images and one ProductRating per feedback-table row.

    Two known page layouts exist ('with_headings' / 'without_headings');
    the info block is located accordingly.
    """
    ads = items.Ads()
    ads['offer_id'] = self.get_offer_id_from_url(response.url)
    layout = 'unknown'
    info_block = response.xpath(
        '//h1[text()="Info"]/..')  # Two known layout. Try first, fallback on second
    if len(info_block) == 1:
        layout = 'with_headings'
    else:
        layout = 'without_headings'
    if layout == 'without_headings':
        info_block = response.xpath('//h1[contains(@class, "fheading")]/..')
    ads['title'] = self.get_text(response.css('h1.fheading'))
    ads['vendor_username'] = self.get_text(
        info_block.xpath('.//a[contains(@href, "profile")]'))
    if 'category' in response.meta and response.meta[
            'category'] is not None:
        ads['category'] = response.meta['category']
    else:
        ads['category'] = None
    ads['fullurl'] = response.url.replace('/refund', '')
    ads['relativeurl'] = "/offer/%s" % ads['offer_id']
    # ===== Info block 1 - Ships from/to, escrow, multisig, etc ==========
    # We determine the type of info by the icon in front of it. Most
    # reliable way to do it as layout changes freely between listings.
    # (Both layouts use the same ./p[1] path, so the old duplicated
    # if/elif collapsed into one assignment.)
    p = info_block.xpath('./p[1]')
    for line in p.extract_first().split('<br>'):
        linesel = scrapy.Selector(text=line)
        line_txt = self.get_text(linesel)
        if len(linesel.css(".ion-log-out")) > 0:  # Ships From icon
            m = re.search('ships from:(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['ships_from'] = self.get_text(m.group(1))
        elif len(linesel.css(".ion-log-in")) > 0:  # Ships To icon
            m = re.search(r'only ships to certain countries\s*\(([^\)]+)\)',
                          line_txt, re.IGNORECASE)
            if m:
                ads['ships_to'] = json.dumps([
                    self.get_text(x.upper()) for x in m.group(1).split(',')
                ])
            elif 'Worldwide' in line_txt:
                ads['ships_to'] = 'Worldwide'
                m = re.search(r'with Exceptions\s*\(([^\)]+)\)', line_txt,
                              re.IGNORECASE)
                if m:
                    ads['ships_to_except'] = json.dumps([
                        self.get_text(x.upper())
                        for x in m.group(1).split(',')
                    ])
            else:
                self.logger.warning(
                    "New format of 'ships_to' string (%s) at %s" %
                    (line_txt, response.url))
        elif len(linesel.css(
                ".ion-android-share-alt")) > 0:  # Properties icons
            if line_txt:
                line_txt = line_txt.lower()
                ads['multisig'] = True if 'multisig' in line_txt else False
                ads['escrow'] = True if 'escrow' in line_txt else False
        elif len(linesel.css(
                ".ion-android-checkmark-circle")) > 0:  # Auto Accept icon
            if line_txt:
                line_txt = line_txt.lower()
                ads['auto_accept'] = True if 'auto-accept' in line_txt else False
        elif len(linesel.css(
                ".ion-ios-monitor-outline")) > 0:  # Digital Good icon
            pass
        else:
            icontype = linesel.css('.ionicons')
            if icontype:
                iconclass = icontype[0].xpath('@class').extract_first()
                self.logger.warning(
                    'Unhandled information available with icon of type (%s) in offer page at %s'
                    % (iconclass, response.url))
    # =========================================
    ## ============= Prices Options =======
    price_opt_table = response.xpath(
        ".//h4[contains(text(), 'Prices')]/../table")
    options = []
    for line in price_opt_table.css('tbody tr'):
        option = {}
        option['amount'] = self.get_text(line.css('td:nth-child(1)'))
        option['price_btc'] = self.get_text(line.css('td:nth-child(3)'))
        options.append(option)
    if len(options) > 0:
        ads['price_options'] = json.dumps(options)
        if len(options) == 1:
            m = re.search(r'(\d+(\.\d+)?) BTC.+', options[0]['price_btc'])
            if m:
                ads['price'] = m.group(1)
    ## ==============
    ## ============ Shipping Options ========
    shipping_opt_table = response.xpath(
        ".//h4[contains(text(), 'Shipping Options')]/../table")
    options = []
    for line in shipping_opt_table.css('tbody tr'):
        option = {}
        option['name'] = self.get_text(line.css('td:nth-child(1)'))
        amount_raw = line.css('td:nth-child(2)').extract_first()
        amount_raw = amount_raw.replace(
            '<i class="ionicons ion-ios-infinite"></i>', 'inf')  # Infinity
        option['amount'] = self.get_text(scrapy.Selector(text=amount_raw))
        option['price_btc'] = self.get_text(
            line.css('td:nth-child(4)')).replace(' BTC', '')
        options.append(option)
    if len(options) > 0:
        ads['shipping_options'] = json.dumps(options)
    ## =====================
    # =================== Info block 2. List of key/value with key in bold.
    if layout == 'with_headings':
        p = response.xpath(
            './/h4[contains(text(), "Information")]/..').extract_first()
        if p is None:
            self.logger.warning(
                "Invalid layout, could not find h4 element with text 'Information' on url "
                + response.url)
            p = ""
        p = re.sub('<h4>[^<]+</h4>', '', p)
    elif layout == 'without_headings':
        p = info_block.xpath('./p[2]').extract_first()
        if p is None:
            # Guard against a missing second paragraph (would otherwise
            # crash on .split below).
            p = ""
    for line in p.split('<br>'):
        line_txt = self.get_text(scrapy.Selector(text=line))
        known = False
        m = re.search(r'minimum amount per order:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['minimum_order'] = m.group(1)
            known = True
        m = re.search(r'maximum amount per order:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['maximum_order'] = m.group(1)
            known = True
        m = re.search(r'views:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['views'] = m.group(1)
            known = True
        m = re.search(r'Quantity in stock:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['stock'] = m.group(1)
            known = True
        m = re.search(r'Already sold:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['already_sold'] = m.group(1)
            known = True
        m = re.search(r'Country:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['country'] = m.group(1)
            known = True
        m = re.search(r'Replace-Time:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['replace_time'] = m.group(1)
            known = True
        m = re.search('Category', line_txt, re.IGNORECASE)
        if m:
            known = True
            if ads['category'] is None:
                # Category path separators are icon tags; replace them
                # with '/' before extracting text.
                splitted_html = re.sub(r'\s*<i[^\>]+>\s*</i>\s*', '/', line)
                line_txt2 = self.get_text(
                    scrapy.Selector(text=splitted_html))
                m = re.search(r'Category:\s*(.+)\s*', line_txt2,
                              re.IGNORECASE)
                if m:
                    ads['category'] = m.group(1)
                    known = True
        if not known:
            self.logger.warning(
                'Unknown information type (%s) in ads at %s' %
                (line_txt, response.url))
    if response.url.endswith('refund'):
        ads['terms_and_conditions'] = self.get_text(
            response.css("#tabcontent"))
    else:
        ads['description'] = self.get_text(response.css("#tabcontent"))
        yield self.make_request('offer-refund',
                                url=response.url + '/refund',
                                category=ads['category'])
    yield ads
    #=================================================
    ## ===================== IMAGES =====================
    images_url = response.css('img.img-thumbnail::attr(src)').extract()
    for url in images_url:
        if url:
            img_item = items.AdsImage(image_urls=[])
            # Need Scrapy > 1.4.0 for this to work (base64 url encoded data).
            img_item['image_urls'].append(
                self.make_request('image', url=url))
            img_item['ads_id'] = ads['offer_id']
            yield img_item
    ## ============================
    ## ========== Feedbacks =====
    feedback_table = response.xpath(
        './/h3[contains(text(), "Feedback")]/../table')
    for line in feedback_table.css('tbody tr'):
        try:
            rating = items.ProductRating()
            score = self.get_text(line.css('td:nth-child(1) .text-muted'))
            # BUG FIX: the decimal point was an unescaped '.' (any char).
            m = re.search(r'\((\d+(\.\d+)?)\)', score)
            if not m:
                self.logger.warning('Cannot read feedback score %s' % score)
                continue
            rating['rating'] = "%s/5" % m.group(1)
            # IndexError here (no text node) is handled by the except.
            # The former "if comment is None" branch was dead code — it
            # also referenced `url` from the image loop — and was removed.
            comment = line.xpath('./td[2]/text()')[0].extract().strip()
            rating['comment'] = comment
            rating['ads_id'] = ads['offer_id']
            rating['submitted_by'] = self.get_text(
                line.css('td:nth-child(3)'))
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(line.css('td:nth-child(4)')))
            yield rating
        except Exception as e:
            self.logger.warning(
                "Could not get product feedback. Error : %s" % e)
def parse_listing(self, response):
    """Parse a listing tab page: yields the Ads item (with the fields of
    whichever tab is active, including feedback ratings) and the ad
    images when on a non-tab page."""
    try:
        ads_item = items.Ads()
        ads_item["offer_id"] = re.search(r"ls_id=(\d+)", response.url,
                                         re.M | re.I).group(1)
        ads_item["vendor_username"] = self.get_text(
            response.xpath("//small/a[contains(@href,'user.php?u_id=')]"))
        # Drop the "(n)" transaction-count suffix from the username.
        ads_item["vendor_username"] = ads_item["vendor_username"].split(
            "(")[0].strip()
        ads_item["fullurl"] = response.url.split("&")[0]
        ads_item["relativeurl"] = self.get_relative_url(
            ads_item["fullurl"])
        ads_item["title"] = response.xpath(
            ".//div[@class='col-sm-12']/a[contains(@href, 'ls_id')]/text()"
        ).extract_first()
        ads_item["ships_to"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Ship To :')]/ancestor::small")
        ).replace("Ship To :", "").strip()
        if ads_item["ships_to"] == "":
            # Fallback to the sibling element layout.
            ads_item["ships_to"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Ship To :')]/ancestor::small/following-sibling::small[1]"
                ))
        ads_item["ships_from"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Origin Country :')]/ancestor::small"
            )).replace("Origin Country :", "").strip()
        ads_item["ads_class"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Product class :')]/ancestor::small"
            )).replace("Product class :", "").strip()
        ads_item["quantity"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Quantity :')]/ancestor::small"
            )).replace("Quantity :", "").strip()
        accepted_currencies = []
        sale_price = self.get_text(
            response.xpath(
                "//form//span[contains(text(),'Sale Price :')]")).replace(
                    "Sale Price :", "").strip()
        if "USD" in sale_price:
            m = re.search(r"([\d\.]+)\s*USD", sale_price, re.M | re.I)
            ads_item["price_usd"] = m.group(1) if m else None
        if "BTC" in sale_price:
            m = re.search(r"([\d\.]+)\s*BTC", sale_price, re.M | re.I)
            ads_item["price_btc"] = m.group(1) if m else None
            accepted_currencies.append("BTC")
        ads_item["accepted_currencies"] = ",".join(accepted_currencies)
        ads_item["shipping_options"] = self.get_shipping_options(response)
        # new fields
        ads_item["escrow"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Payment :')]/ancestor::small")
        ).replace("Payment :", "").strip()
        active_tab = self.get_text(
            response.xpath(
                "//ul[@class='nav nav-tabs']/li[@class='active']/a"))
        if "Product Description" in active_tab:
            ads_item['description'] = self.get_text(
                response.xpath("//div[@class='tab-content']"))
        elif "Refund Policy" in active_tab:
            ads_item['refund_policy'] = self.get_text(
                response.xpath("//div[@class='tab-content']"))
        elif "Product Tags" in active_tab:
            pass
        elif "Feedback" in active_tab:
            feedbacks = response.xpath(
                "//div[@class='tab-content']//table/tbody/tr")
            if feedbacks:
                for feedback in feedbacks:
                    rating = items.ProductRating()
                    rating["ads_id"] = ads_item["offer_id"]
                    rating["submitted_by"] = self.get_text(
                        feedback.xpath("td[3]/small"))
                    rating["submitted_on_string"] = self.get_text(
                        feedback.xpath("td[5]/small")).replace(
                            "View Item", "").strip()
                    rating["submitted_on"] = self.parse_datetime(
                        rating["submitted_on_string"])
                    rating["comment"] = self.get_text(
                        feedback.xpath("td[2]/small"))
                    rating["price_usd"] = self.get_text(
                        feedback.xpath("td[4]/small"))
                    # new fields: the score cell is a ballot-box glyph.
                    score = self.get_text(feedback.xpath("td[1]"))
                    if score == "\xe2\x98\x91":
                        rating["rating"] = "Positive"
                    elif score == "\xe2\x98\x92":
                        rating["rating"] = "Negative"
                    elif score == "\xe2\x98\x90":
                        rating["rating"] = "Neutral"
                    else:
                        # BUG FIX: previously formatted rating["rating"],
                        # which is unset on this branch — the resulting
                        # KeyError aborted the whole listing through the
                        # outer except.  Log the raw score instead.
                        self.logger.warning(
                            "Unknown rating type '%s' at %s" %
                            (score, response.url))
                    yield rating
        else:
            self.logger.warning("Unknown tab: %s at %s" %
                                (active_tab, response.url))
        yield ads_item
    except Exception as error:
        self.logger.warning("Couldn't yield Ad (Error %s) at %s." %
                            (error, response.url))
    if self.is_listing_tab_page(response) is False:
        # self.requests_from_listing_page(response)
        image_urls = response.xpath(
            "//img[@class='pull-left']/@src").extract()
        if len(image_urls) > 0:
            img_item = items.AdsImage(image_urls=[])
            for img_url in image_urls:
                # e.g. uploads/9bc5f18d5667081890e8972def13da2f_100_100.png
                # -> uploads/9bc5f18d5667081890e8972def13da2f.png
                img_url = re.sub(r"_\d+_\d+\.", ".", img_url)
                img_item['image_urls'].append(
                    self.make_request(reqtype='image', url=img_url))
            # NOTE(review): if the try above failed before offer_id was
            # set, this raises KeyError — confirm desired behavior.
            img_item['ads_id'] = ads_item['offer_id']
            yield img_item
def parse_listing(self, response):
    """Parse an item page: yields the Ads item, its images, then one
    ProductRating per review comment."""
    # The ad.
    ads_item = items.Ads()
    offer_match = re.search(r"/item/([^/]+)", response.url, re.M | re.I)
    if not offer_match:
        self.logger.warning("offer_id is None at %s" % response.url)
        return
    ads_item["offer_id"] = offer_match.group(1)
    vendor_match = re.search(r"/user/([^/]+)", response.url, re.M | re.I)
    # Keep the None when the URL carries no vendor segment.
    ads_item["vendor_username"] = (vendor_match.group(1)
                                   if vendor_match else vendor_match)
    # Truncate the URL right after the offer id.
    ads_item["fullurl"] = (response.url.split(ads_item["offer_id"])[0] +
                           ads_item["offer_id"])
    ads_item["relativeurl"] = self.get_relative_url(ads_item["fullurl"])
    ads_item["title"] = "".join(
        response.xpath(
            ".//div[@class='ui segment inverted t-item-image secondary']/h3/text()"
        ).extract()).strip()
    ads_item["description"] = self.get_text(
        response.xpath(
            ".//div[@class='ui segment']/h3[contains(text(),'About')]/following-sibling::div"
        ))
    ads_item["shipping_options"] = self.get_shipping_options(response)
    ads_item["product_rating"] = response.xpath(
        ".//div[@class='ui segment inverted t-item-image secondary']/h3//i[@class='icon thumbs up']/following-sibling::span/text()"
    ).extract_first(default="").strip()
    yield ads_item
    # The images.
    image_urls = response.xpath(
        ".//div[@class='ui segment inverted t-item-image secondary']/img/@src"
    ).extract()
    if len(image_urls) > 0:
        img_item = items.AdsImage(image_urls=[])
        for img_url in image_urls:
            img_item['image_urls'].append(
                self.make_request(reqtype='image', url=img_url))
        img_item['ads_id'] = ads_item['offer_id']
        yield img_item
    # The reviews.
    comments = response.xpath(
        ".//div[@class='ui segment']/h3[contains(text(),'Reviews')]/following-sibling::div[@class='ui comments']/div[@class='comment']"
    )
    if comments:
        for comment in comments:
            rating = items.ProductRating()
            rating["ads_id"] = ads_item["offer_id"]
            rating["submitted_by"] = comment.xpath(
                ".//a[@class='author']/text()").extract_first(
                    default="").strip().replace("@", "")
            rating["submitted_on_string"] = comment.xpath(
                ".//span[@class='date']/text()").extract_first(
                    default="").strip()
            rating["submitted_on"] = self.parse_datetime(
                rating["submitted_on_string"])
            rating["comment"] = self.get_text(
                comment.xpath(".//pre[@class='text']"))
            rating["rating"] = comment.xpath(
                ".//i[@class='icon thumbs up']/following-sibling::span/text()"
            ).extract_first(default="").strip()
            yield rating