def parse_user_ratings(self, response):
    """Parse the ratings shown on a user's profile page.

    Yields a request to the ad page when the rating links to a product
    (the rating itself is scraped there), a UserRating item otherwise,
    and a follow-up request when a next page of ratings exists.
    """
    for rating_element in response.css("ul.list-ratings li"):
        product_name_element = rating_element.css('div.left small')
        product_url = product_name_element.css(
            "a::attr(href)").extract_first()
        if product_url:
            # The product rating is collected from the ads page itself,
            # so only the request is yielded here.  (A ProductRating was
            # previously built here but never yielded — dead code removed.)
            ad_id = self.get_ad_id(product_url)
            yield self.make_request('ads', url=product_url, ads_id=ad_id)
        else:
            # No product URL found, still save it with the product name.
            rating = items.UserRating()
            rating['username'] = response.meta['username']
            rating['item_name'] = self.get_text(product_name_element)
            rating['submitted_on'] = self.to_utc(
                dateutil.parser.parse(
                    self.get_text(rating_element.css('.left date'))))
            # One full star icon per rating point.
            rating['rating'] = len(rating_element.css('.rating.stars i.full'))
            rating['comment'] = self.get_text(
                rating_element.css('div.right.formatted'))
            yield rating
    # Follow pagination if present.
    next_page_url = response.css(
        "section#main a.arrow-right::attr(href)").extract_first()
    if next_page_url:
        yield self.make_request('user_ratings', url=next_page_url,
                                username=response.meta['username'])
def parse_ads_ratings(self, response):
    """Yield one ProductRating per entry in an ad page's ratings list."""
    for entry in response.css("ul.list-ratings li"):
        item = items.ProductRating()
        item['ads_id'] = response.meta['ads_id']
        # Timestamp is shown as free text; normalize it to UTC.
        submitted_text = self.get_text(entry.css('.left date'))
        item['submitted_on'] = self.to_utc(
            dateutil.parser.parse(submitted_text))
        # One full star icon per rating point.
        item['rating'] = len(entry.css('.rating.stars i.full'))
        item['comment'] = self.get_text(entry.css('div.right.formatted'))
        yield item
def parse_ads_ratings(self, response):
    """Parse product ratings rendered as articles (alternate layout)."""
    for rating_element in response.css('section.main_items article'):
        rating = items.ProductRating()
        rating['ads_id'] = response.meta['ads_id']
        header = self.get_text(rating_element.css('h1'))
        # header contains a bunch of info, formatted like this :
        # ★☆☆☆☆ 4 hours, 33 minutes ago: 2017-12-10
        header_parts = header.split(' ')
        last_part = header_parts[-1]
        # Raw string fixes the invalid-escape DeprecationWarning for \d.
        if re.match(r'\d{4}-\d{2}-\d{2}', last_part):
            rating['submitted_on'] = last_part
        first_part = header_parts[0]
        rating['rating'] = first_part.count(
            '★')  # ★ is the html entity for ★
        rating['comment'] = self.get_text(rating_element.css('p'))
        yield rating
def parse_listing_feedback(self, response):
    """Parse the feedback table of a listing and follow its pagination.

    Rows that cannot be parsed raise WarningException and are logged
    and skipped; any other exception propagates.
    """
    # Raw string fixes the invalid-escape DeprecationWarning for \d.
    m = re.search(r'listing\/(\d+)', response.url)
    if not m:
        raise Exception('Cannot find listing ID')
    listing_id = m.group(1)
    # Rows of the table belonging to the currently active nav tab.
    for line in response.css(
            'ul.nav li[role="presentation"].active').xpath(
            "./../../table/tbody/tr"):
        try:
            rating = items.ProductRating()
            cells = line.css('td')
            expected_cols = 5
            if len(cells) != expected_cols:
                raise WarningException(
                    "Feedback tables does not have %d columns as expected."
                    % expected_cols)
            # The rating is conveyed by a bootstrap label class.
            if len(cells[0].css('.label-danger')) > 0:
                rating['rating'] = 'Negative'
            elif len(cells[0].css('.label-success')) > 0:
                rating['rating'] = 'Positive'
            elif len(cells[0].css('.label-default')) > 0:
                rating['rating'] = 'Neutral'
            else:
                raise WarningException('Unknown rating icon')
            rating['delivery_time'] = self.get_text(cells[2])
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(cells[4]))
            rating['comment'] = self.get_text(cells[1].css("p:first-child"))
            rating['ads_id'] = listing_id
            # Username may carry a "[n]" transaction count suffix; keep
            # only the name part.
            m = re.match(r'([^\[]+)(\[\d+\])?', self.get_text(cells[3]))
            if m:
                rating['submitted_by'] = self.get_text(m.group(1))
            yield rating
        except WarningException as e:
            self.logger.warning(
                "Could not get listing feedback at %s. %s"
                % (response.url, e))
        # Any other exception propagates (the former bare
        # "except: raise" was a no-op and has been removed).
    for url in response.css(
            ".pages ul.pagination a::attr(href)").extract():
        if not url.endswith('page=1'):
            # We already saw that page, but maybe with a different URL (no page parameter)
            yield self.make_request('listing_feedback', url=url,
                                    listing_id=listing_id)
def parse_product_rating(self, response):
    """Extract product ratings from a listing's feedback table."""
    try:
        # The ad id is the second path segment of the feedback URL.
        ads_id = re.search(r'/listings/[\w-]+/([\w-]+)/feedback',
                           response.url).group(1)
        for row in response.css('div.product-details table.table tbody tr'):
            cells = row.css('td')
            product_rating = items.ProductRating()
            product_rating['submitted_by'] = self.get_text(cells[0])
            # One icon per rating point.
            product_rating['rating'] = len(cells[1].css('i'))
            product_rating['comment'] = self.get_text(cells[2])
            date_text = self.get_text(cells[3])
            product_rating['submitted_on'] = self.parse_datetime(
                date_text).date()
            product_rating['submitted_on_string'] = date_text
            product_rating['ads_id'] = ads_id
            yield product_rating
    except Exception as error:
        self.logger.warning(
            "Failed to yield product rating at %s because '%s'" %
            (response.url, error))
def parse_product_rating(self, response):
    """Extract product ratings (table following the nav tabs),
    including the price paid and the submitter's transaction count."""
    try:
        # Raw strings fix invalid-escape DeprecationWarnings for \w / \d.
        ads_id = re.search(r'c=listings&a=product&code=([\w]+)&tab=3$',
                           response.url).group(1)
        ratings = response.css('ul.nav-tabs').xpath(
            'following-sibling::table').css('tbody tr')
        for rating in ratings:
            tds = rating.css('td')
            product_rating = items.ProductRating()
            product_rating['ads_id'] = ads_id
            product_rating['comment'] = self.get_text(tds[1])
            product_rating['submitted_by'] = rating.css(
                'td:nth-child(3)::text').extract_first().replace(' ', '')
            product_rating['submitted_on'] = self.parse_datetime(
                self.get_text(tds[3]))
            product_rating['submitted_on_string'] = self.get_text(tds[3])
            # Transaction count is shown as "[n]"; strip the brackets.
            prev_transactions = rating.xpath(
                ".//small/text()").extract_first()
            prev_transactions = prev_transactions.replace('[', '')
            prev_transactions = prev_transactions.replace(']', '')
            product_rating[
                'submitted_by_number_transactions'] = prev_transactions
            # Price cell looks like "12.34 USD" or "0.5 XMR".
            price_match = re.search(r'([\d\.]+) ([\w]+)',
                                    self.get_text(tds[4]))
            if price_match:
                price = price_match.group(1)
                currency = price_match.group(2).lower()
                if currency == 'usd':
                    product_rating['price_usd'] = price
                elif currency == 'xmr':
                    product_rating['price_xmr'] = price
            yield product_rating
    except Exception as error:
        self.logger.warning(
            "Failed to yield product rating at %s because '%s'" %
            (response.url, error))
def parse_listing(self, response):
    """Parse a listing page: yields an Ads item, an AdsImage item and
    one ProductRating per entry of the feedback tab (#content2)."""
    title = response.xpath(
        ".//section[@id='content1']//div[@class='listing_right']/span/text()"
    ).extract_first(default="").strip()
    username = response.xpath(
        ".//section[@id='content1']//div[@class='listing_right']//a[@class='greenlink']/text()"
    ).extract_first(default="").strip()
    if title == "" and username == "":
        self.logger.warning("Found what is likely an empty page at %s." %
                            response.url)
    else:
        # Try to yield ads.
        try:
            ads_item = items.Ads()
            ads_item['title'] = title
            ads_item['vendor_username'] = username
            ads_item['relativeurl'] = self.get_relative_url(response.url)
            ads_item['fullurl'] = response.url
            # Listing id parameter differs between URL variants.
            if 'clid' in response.url:
                ads_item['offer_id'] = self.get_url_param(
                    response.url, 'clid')
            else:
                ads_item['offer_id'] = self.get_url_param(
                    response.url, 'lid')
            ads_item['category'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']/br/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['ships_from'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping From:')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['ships_to'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping To:')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['description'] = self.get_text(
                response.xpath(".//section[@id='content1']/p"))
            ads_item['escrow'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/span"
                ))
            # Multisig is signalled by the presence of an icon.
            ads_item['multisig'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/img[@alt='Multisig']"
            )
            ads_item['multisig'] = True if ads_item['multisig'] else False
            ads_item['stock'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[not(@style='float:right')]/span"
                ))
            ads_item['shipping_options'] = self.get_shipping_options(
                response)
            ads_item['accepted_currencies'] = self.get_accepted_currencies(
                response)
            prices_text = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/p"
                ))
            price_usd = re.search(r"\$\s*([\d\.]+)", prices_text,
                                  re.M | re.I)
            price_btc = re.search(r"([\d\.]+)\s*฿", prices_text,
                                  re.M | re.I)
            price_xmr = re.search(r"([\d\.]+)\s*XMR", prices_text,
                                  re.M | re.I)
            if price_usd:
                ads_item["price_usd"] = price_usd.group(1)
            else:
                self.logger.warning("No price_usd found on %s" %
                                    response.url)
            if price_xmr:
                ads_item["price_xmr"] = price_xmr.group(1)
            if price_btc:
                ads_item["price_btc"] = price_btc.group(1)
            yield ads_item
        except Exception as error:
            self.logger.warning("Couldn't yield ad from %s (Error: %s)" %
                                (response.url, error))
        # Try to yield images.
        try:
            image_urls = response.xpath(
                ".//section[@id='content1']//div[@class='listing_image']/img/@src"
            ).extract()
            if len(image_urls) > 0:
                img_item = items.AdsImage(image_urls=[])
                for img_url in image_urls:
                    img_item['image_urls'].append(
                        self.make_request(reqtype='image', url=img_url))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
        except Exception as error:
            self.logger.warning(
                "Couldn't yield ad images from %s (Error: %s)" %
                (response.url, error))
        # Yield product ratings.
        # Note, that the price is also available in ads.
        feedbacks = response.xpath(
            ".//section[@id='content2']//div[@class='feedback']")
        if feedbacks:
            for feedback in feedbacks:
                rating = items.ProductRating()
                rating["ads_id"] = ads_item["offer_id"]
                rating["submitted_on_string"] = feedback.xpath(
                    "div[@class='feedback_header']/span/text()").extract_first(
                        default="").strip()
                rating["submitted_on"] = self.parse_datetime(
                    rating["submitted_on_string"])
                rating['price_usd'] = feedback.xpath(
                    "div[@class='feedback_subheader']/div/span/text()[contains(., 'USD')]"
                ).extract_first()
                # BUG FIX: extract_first() returns None when no USD price
                # is shown; calling .replace() on None raised an uncaught
                # AttributeError (this loop is outside the try blocks).
                if rating['price_usd'] is not None:
                    rating['price_usd'] = rating['price_usd'].replace(
                        "~", "").replace("USD", "").replace(" ", "")
                rating_star = feedback.xpath(
                    "div[@class='feedback_subheader']//div[contains(@style,'img/star.png')]/@style"
                ).extract_first(default="")
                rating_star = re.search(r"width:(\d+)px;height",
                                        rating_star, re.M | re.S)
                if rating_star:
                    # The star bar is 120px wide for a full 5-star score.
                    rating_star = float(rating_star.group(1))
                    rating['rating'] = rating_star / 120 * 5
                warning = feedback.xpath(
                    "div[@class='feedback_subheader']/div/span")
                if warning and len(warning) > 1:
                    rating['warnings'] = self.get_text(warning[0])
                rating["comment"] = self.get_text(feedback.xpath("p"))
                rating["submitted_by"] = feedback.xpath(
                    "div[@class='feedback_header']//span[@class='feedbackScore']/../text()"
                ).extract_first(default="").strip()
                rating["submitter_rating"] = self.get_text(
                    feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/sup"
                    ))
                rating["submitted_by_number_transactions"] = self.get_text(
                    feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/sub"
                    ))
                yield rating
else: self.logger.warning('Unknown listing status %s' % response.url) ## ===================== IMAGES ===================== images_url = response.css('img.productImage::attr(src)').extract() for url in images_url: img_item = items.AdsImage(image_urls=[]) img_item['image_urls'].append(self.make_request('image', url=url)) img_item['ads_id'] = ads_item['offer_id'] yield img_item ## ===================== Product Ratings (feedback) ========= rating_lines = response.css('.ratings table tr') for tr in rating_lines: try: rating_item = items.ProductRating() age = self.get_text(tr.css('td.age')) m = re.search('(\d+)d', age) if m: days_offset = m.group(1) # A sanity check. Dream has some dates which are in 1969 and 1970.. submitted_on = (datetime.utcnow() - timedelta(days=int(days_offset))).date() if submitted_on < date(2011, 1, 1): submitted_on = '' self.logger.warning( "Encountered a date outside the acceptable range. See URL: %s" % response.url) else: rating_item['submitted_on'] = submitted_on
def parse_listing(self, response):
    """Parse a listing page: yields the Ads item, its image, the
    feedback ratings, requests for extra feedback pages, and a vendor
    profile request when a vendor is associated with the item."""
    ads = items.Ads()
    ads_img = items.AdsImage()
    listing_content = response.css("#content1")  # Tabs
    feedback_content = response.css("#content2")  # Tabs
    ads['title'] = self.get_text_first(response.css('.listing_right span'))
    try:
        ads['offer_id'] = self.get_url_param(response.url, 'lid')
    except:
        self.logger.warning(
            "Ran into a URL parameter issue at URL: %s. Offer_ID is not recorded."
            % (response.url))
        # BUG FIX: the handler previously retried the exact same failing
        # get_url_param() call, re-raising the exception it had just
        # caught.  Record an empty id instead (mirrors the vendor case).
        ads['offer_id'] = ''
    ads['relativeurl'] = response.meta['relativeurl']
    ads['fullurl'] = self.make_url(ads['relativeurl'])
    user_url = response.css('.listing_right').xpath(
        './/a[contains(@href, "page=profile")]/@href').extract_first()
    # Some items don't have an associated vendor.
    try:
        ads['vendor_username'] = self.get_url_param(user_url, 'user')
    except:
        self.logger.warning(
            'No seller available at URL: %s. Seller is noted as \'\'. Inspect the URL post-crawl.'
            % (response.url))
        ads['vendor_username'] = ''
    ads['category'] = response.meta['category']
    multilisting_select = listing_content.css(
        'select[name="multilistingChild"]'
    )  # 2 types of Ads. Multilisting or not.
    if not multilisting_select:
        ads['multilisting'] = False
        listing_right_p = self.get_text(
            listing_content.css(".listing_right p"))
        m = re.search(
            '\((\d+(\.\d+)?)\s*\xe0\xb8\xbf\)', listing_right_p
        )  # Search for bitcoin icon \xe0\b8\xbf is unicode char for bitcoin encoded in UTF8
        m2 = re.search('([0-9.]{1,10}) \xe0\xb8\xbf', listing_right_p)
        if m:
            ads['price'] = m.group(1)
        # minor error handling in case the previous regex doesn't catch
        # bitcoin prices.
        elif m is None and m2 is not None:
            ads['price'] = m2.group(1)
    else:
        ads['multilisting'] = True
        options = []
        for option in multilisting_select.xpath('.//option[@value!=""]'):
            options.append(self.get_text(option))
        ads['price'] = json.dumps(options)
    # Bunches of regex to parse the page.
    listing_right_html = self.get_text(
        listing_content.css('.listing_right').extract_first()
    )  # Read HTML. We need tags as separator.
    listing_right_span_text = self.get_text(
        listing_content.css('.listing_right span'))
    m = re.search('<b>shipping from\s*:\s*</b>\s*([^<]+)',
                  listing_right_html, re.IGNORECASE)
    if m:
        ads['ships_from'] = m.group(1)
    m = re.search('<b>shipping to\s*:\s*</b>\s*([^<]+)',
                  listing_right_html, re.IGNORECASE)
    if m:
        ads['ships_to'] = m.group(1)
    shipping_options = []
    for option in listing_content.css(
            '.listing_right form select[name="shipment"] option[value!=""]::text'
    ).extract():
        shipping_options.append(self.get_text(option))
    ads['shipping_options'] = json.dumps(shipping_options)
    ads['description'] = self.get_text(listing_content.xpath('./p'))
    stocks_possibilities = [
        'Excellent stock', 'Good stock', 'Low stock', 'Very low stock'
    ]
    for possibility in stocks_possibilities:
        if possibility in listing_right_span_text:
            ads['stock'] = possibility
            break
    yield ads
    # Ads Image.
    ads_img['ads_id'] = ads['offer_id']
    ads_img['image_urls'] = [
        self.make_request(
            'image',
            url=listing_content.css(
                ".listing_image img::attr(src)").extract_first(),
            referer=response.url)
    ]
    yield ads_img
    # Handling listing feedbacks
    for feedback in feedback_content.css(".feedback"):
        try:
            rating = items.ProductRating()
            rating['ads_id'] = ads['offer_id']
            rating['comment'] = self.get_text(feedback.css('p'))
            try:
                username = feedback.css('.feedback_header span a').xpath(
                    "./text()")[0].extract().strip()
            except:
                username = ''
                self.logger.warning(
                    'Found a review with no username. URL: %s' %
                    response.url)
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(
                    feedback.css('.feedback_header').xpath(
                        'span/text()').extract_first()))
            rating['submitted_by'] = username
            star_styles = feedback.css('.feedback_subheader').xpath(
                './div/div')[0].extract()
            m = re.search(r'width:(\d+)px', star_styles)
            if m:
                width = int(m.group(1))
                rating['rating'] = '%d/5' % (width // 24
                                             )  # One star is 24 px wide
            else:
                self.logger.warning('Cannot find product rating score.')
            yield rating
        except Exception as e:
            self.logger.warning(
                'Could not get listing feedback at %s. Error %s' %
                (response.url, e))
    # If there is several pages of feedback. feedback_buffer_middleware
    # will buffer them until we have them all and then sends them further
    # in pipeline.
    for url in feedback_content.css(
            'div.pagination a::attr(href)').extract():
        if self.get_url_param(url, 'pg') != '1':
            yield self.make_request(
                'listing',
                url=url,
                relativeurl=response.meta['relativeurl'],
                ads_id=ads['offer_id'],
                category=response.meta['category'])
    # Avoid requesting vendor pages when there is no vendor associated
    # with an item.  BUG FIX: was "is not ''" (identity comparison with a
    # literal — implementation-defined and a SyntaxWarning on py3.8+).
    if ads['vendor_username'] != '':
        yield self.make_request('userprofile', url=user_url)
def parse_offer(self, response):
    """Parse an offer page (or its /refund tab) into an Ads item, the
    ad images and one ProductRating per feedback-table row.

    Two known page layouts exist ('with_headings' / 'without_headings');
    the info block is located accordingly.
    """
    ads = items.Ads()
    ads['offer_id'] = self.get_offer_id_from_url(response.url)
    layout = 'unknown'
    info_block = response.xpath(
        '//h1[text()="Info"]/..')  # Two known layout. Try first, fallback on second
    if len(info_block) == 1:
        layout = 'with_headings'
    else:
        layout = 'without_headings'
    if layout == 'without_headings':
        info_block = response.xpath('//h1[contains(@class, "fheading")]/..')
    ads['title'] = self.get_text(response.css('h1.fheading'))
    ads['vendor_username'] = self.get_text(
        info_block.xpath('.//a[contains(@href, "profile")]'))
    if 'category' in response.meta and response.meta[
            'category'] is not None:
        ads['category'] = response.meta['category']
    else:
        ads['category'] = None
    ads['fullurl'] = response.url.replace('/refund', '')
    ads['relativeurl'] = "/offer/%s" % ads['offer_id']
    # ===== Info block 1 - Ships from/to, escrow, multisig, etc ==========
    # We determine the type of info by the icon in front of it. Most
    # reliable way to do it as layout changes freely between listings.
    # (Both layouts use the same ./p[1] path, so the old duplicated
    # if/elif collapsed into one assignment.)
    p = info_block.xpath('./p[1]')
    for line in p.extract_first().split('<br>'):
        linesel = scrapy.Selector(text=line)
        line_txt = self.get_text(linesel)
        if len(linesel.css(".ion-log-out")) > 0:  # Ships From icon
            m = re.search('ships from:(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['ships_from'] = self.get_text(m.group(1))
        elif len(linesel.css(".ion-log-in")) > 0:  # Ships To icon
            m = re.search(r'only ships to certain countries\s*\(([^\)]+)\)',
                          line_txt, re.IGNORECASE)
            if m:
                ads['ships_to'] = json.dumps([
                    self.get_text(x.upper()) for x in m.group(1).split(',')
                ])
            elif 'Worldwide' in line_txt:
                ads['ships_to'] = 'Worldwide'
                m = re.search(r'with Exceptions\s*\(([^\)]+)\)', line_txt,
                              re.IGNORECASE)
                if m:
                    ads['ships_to_except'] = json.dumps([
                        self.get_text(x.upper())
                        for x in m.group(1).split(',')
                    ])
            else:
                self.logger.warning(
                    "New format of 'ships_to' string (%s) at %s" %
                    (line_txt, response.url))
        elif len(linesel.css(
                ".ion-android-share-alt")) > 0:  # Properties icons
            if line_txt:
                line_txt = line_txt.lower()
                ads['multisig'] = True if 'multisig' in line_txt else False
                ads['escrow'] = True if 'escrow' in line_txt else False
        elif len(linesel.css(
                ".ion-android-checkmark-circle")) > 0:  # Auto Accept icon
            if line_txt:
                line_txt = line_txt.lower()
                ads['auto_accept'] = True if 'auto-accept' in line_txt else False
        elif len(linesel.css(
                ".ion-ios-monitor-outline")) > 0:  # Digital Good icon
            pass
        else:
            icontype = linesel.css('.ionicons')
            if icontype:
                iconclass = icontype[0].xpath('@class').extract_first()
                self.logger.warning(
                    'Unhandled information available with icon of type (%s) in offer page at %s'
                    % (iconclass, response.url))
    # =========================================
    ## ============= Prices Options =======
    price_opt_table = response.xpath(
        ".//h4[contains(text(), 'Prices')]/../table")
    options = []
    for line in price_opt_table.css('tbody tr'):
        option = {}
        option['amount'] = self.get_text(line.css('td:nth-child(1)'))
        option['price_btc'] = self.get_text(line.css('td:nth-child(3)'))
        options.append(option)
    if len(options) > 0:
        ads['price_options'] = json.dumps(options)
        if len(options) == 1:
            m = re.search(r'(\d+(\.\d+)?) BTC.+', options[0]['price_btc'])
            if m:
                ads['price'] = m.group(1)
    ## ==============
    ## ============ Shipping Options ========
    shipping_opt_table = response.xpath(
        ".//h4[contains(text(), 'Shipping Options')]/../table")
    options = []
    for line in shipping_opt_table.css('tbody tr'):
        option = {}
        option['name'] = self.get_text(line.css('td:nth-child(1)'))
        amount_raw = line.css('td:nth-child(2)').extract_first()
        amount_raw = amount_raw.replace(
            '<i class="ionicons ion-ios-infinite"></i>', 'inf')  # Infinity
        option['amount'] = self.get_text(scrapy.Selector(text=amount_raw))
        option['price_btc'] = self.get_text(
            line.css('td:nth-child(4)')).replace(' BTC', '')
        options.append(option)
    if len(options) > 0:
        ads['shipping_options'] = json.dumps(options)
    ## =====================
    # =================== Info block 2. List of key/value with key in bold.
    if layout == 'with_headings':
        p = response.xpath(
            './/h4[contains(text(), "Information")]/..').extract_first()
        if p is None:
            self.logger.warning(
                "Invalid layout, could not find h4 element with text 'Information' on url "
                + response.url)
            p = ""
        p = re.sub('<h4>[^<]+</h4>', '', p)
    elif layout == 'without_headings':
        p = info_block.xpath('./p[2]').extract_first()
        if p is None:
            # Guard against a missing second paragraph (would otherwise
            # crash on .split below).
            p = ""
    for line in p.split('<br>'):
        line_txt = self.get_text(scrapy.Selector(text=line))
        known = False
        m = re.search(r'minimum amount per order:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['minimum_order'] = m.group(1)
            known = True
        m = re.search(r'maximum amount per order:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['maximum_order'] = m.group(1)
            known = True
        m = re.search(r'views:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['views'] = m.group(1)
            known = True
        m = re.search(r'Quantity in stock:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['stock'] = m.group(1)
            known = True
        m = re.search(r'Already sold:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['already_sold'] = m.group(1)
            known = True
        m = re.search(r'Country:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['country'] = m.group(1)
            known = True
        m = re.search(r'Replace-Time:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['replace_time'] = m.group(1)
            known = True
        m = re.search('Category', line_txt, re.IGNORECASE)
        if m:
            known = True
            if ads['category'] is None:
                # Category path separators are icon tags; replace them
                # with '/' before extracting text.
                splitted_html = re.sub(r'\s*<i[^\>]+>\s*</i>\s*', '/', line)
                line_txt2 = self.get_text(
                    scrapy.Selector(text=splitted_html))
                m = re.search(r'Category:\s*(.+)\s*', line_txt2,
                              re.IGNORECASE)
                if m:
                    ads['category'] = m.group(1)
                    known = True
        if not known:
            self.logger.warning(
                'Unknown information type (%s) in ads at %s' %
                (line_txt, response.url))
    if response.url.endswith('refund'):
        ads['terms_and_conditions'] = self.get_text(
            response.css("#tabcontent"))
    else:
        ads['description'] = self.get_text(response.css("#tabcontent"))
        yield self.make_request('offer-refund',
                                url=response.url + '/refund',
                                category=ads['category'])
    yield ads
    #=================================================
    ## ===================== IMAGES =====================
    images_url = response.css('img.img-thumbnail::attr(src)').extract()
    for url in images_url:
        if url:
            img_item = items.AdsImage(image_urls=[])
            # Need Scrapy > 1.4.0 for this to work (base64 url encoded data).
            img_item['image_urls'].append(
                self.make_request('image', url=url))
            img_item['ads_id'] = ads['offer_id']
            yield img_item
    ## ============================
    ## ========== Feedbacks =====
    feedback_table = response.xpath(
        './/h3[contains(text(), "Feedback")]/../table')
    for line in feedback_table.css('tbody tr'):
        try:
            rating = items.ProductRating()
            score = self.get_text(line.css('td:nth-child(1) .text-muted'))
            # BUG FIX: the decimal point was an unescaped '.' (any char).
            m = re.search(r'\((\d+(\.\d+)?)\)', score)
            if not m:
                self.logger.warning('Cannot read feedback score %s' % score)
                continue
            rating['rating'] = "%s/5" % m.group(1)
            # IndexError here (no text node) is handled by the except.
            # The former "if comment is None" branch was dead code — it
            # also referenced `url` from the image loop — and was removed.
            comment = line.xpath('./td[2]/text()')[0].extract().strip()
            rating['comment'] = comment
            rating['ads_id'] = ads['offer_id']
            rating['submitted_by'] = self.get_text(
                line.css('td:nth-child(3)'))
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(line.css('td:nth-child(4)')))
            yield rating
        except Exception as e:
            self.logger.warning(
                "Could not get product feedback. Error : %s" % e)
def parse_listing(self, response):
    """Parse a listing tab page: yields the Ads item (with the fields of
    whichever tab is active, including feedback ratings) and the ad
    images when on a non-tab page."""
    try:
        ads_item = items.Ads()
        ads_item["offer_id"] = re.search(r"ls_id=(\d+)", response.url,
                                         re.M | re.I).group(1)
        ads_item["vendor_username"] = self.get_text(
            response.xpath("//small/a[contains(@href,'user.php?u_id=')]"))
        # Drop the "(n)" transaction-count suffix from the username.
        ads_item["vendor_username"] = ads_item["vendor_username"].split(
            "(")[0].strip()
        ads_item["fullurl"] = response.url.split("&")[0]
        ads_item["relativeurl"] = self.get_relative_url(
            ads_item["fullurl"])
        ads_item["title"] = response.xpath(
            ".//div[@class='col-sm-12']/a[contains(@href, 'ls_id')]/text()"
        ).extract_first()
        ads_item["ships_to"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Ship To :')]/ancestor::small")
        ).replace("Ship To :", "").strip()
        if ads_item["ships_to"] == "":
            # Fallback to the sibling element layout.
            ads_item["ships_to"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Ship To :')]/ancestor::small/following-sibling::small[1]"
                ))
        ads_item["ships_from"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Origin Country :')]/ancestor::small"
            )).replace("Origin Country :", "").strip()
        ads_item["ads_class"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Product class :')]/ancestor::small"
            )).replace("Product class :", "").strip()
        ads_item["quantity"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Quantity :')]/ancestor::small"
            )).replace("Quantity :", "").strip()
        accepted_currencies = []
        sale_price = self.get_text(
            response.xpath(
                "//form//span[contains(text(),'Sale Price :')]")).replace(
                    "Sale Price :", "").strip()
        if "USD" in sale_price:
            m = re.search(r"([\d\.]+)\s*USD", sale_price, re.M | re.I)
            ads_item["price_usd"] = m.group(1) if m else None
        if "BTC" in sale_price:
            m = re.search(r"([\d\.]+)\s*BTC", sale_price, re.M | re.I)
            ads_item["price_btc"] = m.group(1) if m else None
            accepted_currencies.append("BTC")
        ads_item["accepted_currencies"] = ",".join(accepted_currencies)
        ads_item["shipping_options"] = self.get_shipping_options(response)
        # new fields
        ads_item["escrow"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Payment :')]/ancestor::small")
        ).replace("Payment :", "").strip()
        active_tab = self.get_text(
            response.xpath(
                "//ul[@class='nav nav-tabs']/li[@class='active']/a"))
        if "Product Description" in active_tab:
            ads_item['description'] = self.get_text(
                response.xpath("//div[@class='tab-content']"))
        elif "Refund Policy" in active_tab:
            ads_item['refund_policy'] = self.get_text(
                response.xpath("//div[@class='tab-content']"))
        elif "Product Tags" in active_tab:
            pass
        elif "Feedback" in active_tab:
            feedbacks = response.xpath(
                "//div[@class='tab-content']//table/tbody/tr")
            if feedbacks:
                for feedback in feedbacks:
                    rating = items.ProductRating()
                    rating["ads_id"] = ads_item["offer_id"]
                    rating["submitted_by"] = self.get_text(
                        feedback.xpath("td[3]/small"))
                    rating["submitted_on_string"] = self.get_text(
                        feedback.xpath("td[5]/small")).replace(
                            "View Item", "").strip()
                    rating["submitted_on"] = self.parse_datetime(
                        rating["submitted_on_string"])
                    rating["comment"] = self.get_text(
                        feedback.xpath("td[2]/small"))
                    rating["price_usd"] = self.get_text(
                        feedback.xpath("td[4]/small"))
                    # new fields: the score cell is a ballot-box glyph.
                    score = self.get_text(feedback.xpath("td[1]"))
                    if score == "\xe2\x98\x91":
                        rating["rating"] = "Positive"
                    elif score == "\xe2\x98\x92":
                        rating["rating"] = "Negative"
                    elif score == "\xe2\x98\x90":
                        rating["rating"] = "Neutral"
                    else:
                        # BUG FIX: previously formatted rating["rating"],
                        # which is unset on this branch — the resulting
                        # KeyError aborted the whole listing through the
                        # outer except.  Log the raw score instead.
                        self.logger.warning(
                            "Unknown rating type '%s' at %s" %
                            (score, response.url))
                    yield rating
        else:
            self.logger.warning("Unknown tab: %s at %s" %
                                (active_tab, response.url))
        yield ads_item
    except Exception as error:
        self.logger.warning("Couldn't yield Ad (Error %s) at %s." %
                            (error, response.url))
    if self.is_listing_tab_page(response) is False:
        # self.requests_from_listing_page(response)
        image_urls = response.xpath(
            "//img[@class='pull-left']/@src").extract()
        if len(image_urls) > 0:
            img_item = items.AdsImage(image_urls=[])
            for img_url in image_urls:
                # e.g. uploads/9bc5f18d5667081890e8972def13da2f_100_100.png
                # -> uploads/9bc5f18d5667081890e8972def13da2f.png
                img_url = re.sub(r"_\d+_\d+\.", ".", img_url)
                img_item['image_urls'].append(
                    self.make_request(reqtype='image', url=img_url))
            # NOTE(review): if the try above failed before offer_id was
            # set, this raises KeyError — confirm desired behavior.
            img_item['ads_id'] = ads_item['offer_id']
            yield img_item
def parse_listing(self, response):
    """Parse an item page: yields the Ads item, its images, then one
    ProductRating per review comment."""
    # The ad.
    ads_item = items.Ads()
    offer_match = re.search(r"/item/([^/]+)", response.url, re.M | re.I)
    if not offer_match:
        self.logger.warning("offer_id is None at %s" % response.url)
        return
    ads_item["offer_id"] = offer_match.group(1)
    vendor_match = re.search(r"/user/([^/]+)", response.url, re.M | re.I)
    # Keep the None when the URL carries no vendor segment.
    ads_item["vendor_username"] = (vendor_match.group(1)
                                   if vendor_match else vendor_match)
    # Truncate the URL right after the offer id.
    ads_item["fullurl"] = (response.url.split(ads_item["offer_id"])[0] +
                           ads_item["offer_id"])
    ads_item["relativeurl"] = self.get_relative_url(ads_item["fullurl"])
    ads_item["title"] = "".join(
        response.xpath(
            ".//div[@class='ui segment inverted t-item-image secondary']/h3/text()"
        ).extract()).strip()
    ads_item["description"] = self.get_text(
        response.xpath(
            ".//div[@class='ui segment']/h3[contains(text(),'About')]/following-sibling::div"
        ))
    ads_item["shipping_options"] = self.get_shipping_options(response)
    ads_item["product_rating"] = response.xpath(
        ".//div[@class='ui segment inverted t-item-image secondary']/h3//i[@class='icon thumbs up']/following-sibling::span/text()"
    ).extract_first(default="").strip()
    yield ads_item
    # The images.
    image_urls = response.xpath(
        ".//div[@class='ui segment inverted t-item-image secondary']/img/@src"
    ).extract()
    if len(image_urls) > 0:
        img_item = items.AdsImage(image_urls=[])
        for img_url in image_urls:
            img_item['image_urls'].append(
                self.make_request(reqtype='image', url=img_url))
        img_item['ads_id'] = ads_item['offer_id']
        yield img_item
    # The reviews.
    comments = response.xpath(
        ".//div[@class='ui segment']/h3[contains(text(),'Reviews')]/following-sibling::div[@class='ui comments']/div[@class='comment']"
    )
    if comments:
        for comment in comments:
            rating = items.ProductRating()
            rating["ads_id"] = ads_item["offer_id"]
            rating["submitted_by"] = comment.xpath(
                ".//a[@class='author']/text()").extract_first(
                    default="").strip().replace("@", "")
            rating["submitted_on_string"] = comment.xpath(
                ".//span[@class='date']/text()").extract_first(
                    default="").strip()
            rating["submitted_on"] = self.parse_datetime(
                rating["submitted_on_string"])
            rating["comment"] = self.get_text(
                comment.xpath(".//pre[@class='text']"))
            rating["rating"] = comment.xpath(
                ".//i[@class='icon thumbs up']/following-sibling::span/text()"
            ).extract_first(default="").strip()
            yield rating