示例#1
0
 def parse_item(self, response):
     """Load the first listing of the page into a PropertiesItem.

     @url https://nj.58.com/ershoufang/pn3/?PGTID=0d30000c-000a-c568-cd81-f02b4ffbea21&ClickID=1
     @returns items 1
     @scrapes title price description address image_urls
     @scrapes url project spider server date
     """
     loader = ItemLoader(item=PropertiesItem(), response=response)

     # Shared XPath prefix and processors for the text-heavy fields
     first_card = '//div[@class="list-info"][1]'
     clean_text = (MapCompose(str.strip), Join())

     # Fields read from the page markup
     loader.add_xpath('title', first_card + '/h2[@class="title"]/a/text()')
     loader.add_xpath('price', '//p[@class="sum"][1]/b/text()')
     loader.add_xpath(
         'description',
         first_card + '/p[@class="baseinfo"][1]//text()',
         *clean_text)
     loader.add_xpath(
         'address',
         first_card + '/p[@class="baseinfo"][2]/span//text()',
         *clean_text)
     loader.add_xpath('image_urls', '//div[@class = "pic"][1]/a/img/@src')

     # Housekeeping fields describing where and when the item was scraped
     housekeeping = (
         ('url', response.url),
         ('project', self.settings.get('BOT_NAME')),
         ('spider', self.name),
         ('server', socket.gethostname()),
         ('date', datetime.datetime.now()),
     )
     for field, value in housekeeping:
         loader.add_value(field, value)

     return loader.load_item()
示例#2
0
    def parse_item(self, selector, response):
        """Build a PropertiesItem from a single listing node.

        Field XPaths are evaluated relative to ``selector``; extracted links
        are joined with ``response.url``.
        """
        def make_url(link):
            return urlparse.urljoin(response.url, link)

        def strip_commas(text):
            return text.replace(',', '')

        loader = ItemLoader(item=PropertiesItem(), selector=selector)

        # Fields extracted from the node
        loader.add_xpath(
            'title',
            './/*[@itemprop="name"][1]/text()',
            MapCompose(str.strip, str.strip))
        loader.add_xpath(
            'price',
            './/*[@itemprop="price"][1]/text()',
            MapCompose(strip_commas, float),
            re='[,.0-9]+')
        loader.add_xpath(
            'description',
            './/*[@itemprop="description"][1]/text()',
            MapCompose(str.strip),
            Join())
        loader.add_xpath(
            'address',
            './/*[@itemtype="http://schema.org/Place"][1]/*/text()',
            MapCompose(str.strip))
        loader.add_xpath(
            'image_urls',
            './/*[@itemprop="image"][1]/@src',
            MapCompose(make_url))

        # Housekeeping fields; the item URL comes from the node's own link
        loader.add_xpath(
            'url',
            './/*[@itemprop="url"][1]/@href',
            MapCompose(make_url))
        housekeeping = (
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()
示例#3
0
文件: basic.py 项目: 73pi/myscrapy
    def parse(self, response):
        """Load headline fields from the page into a single PropertiesItem.

        (Contract check verified to work; note that the count after
        ``items`` is the digit 1, not the letter L.)
        @url https://voice.hupu.com/nba
        @returns items 1
        @scrapes title toptitle topnews news
        @scrapes url project spider server date
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Fields selected with XPath; two of them share the same container
        content_root = '/html/body/div[3]/div[1]'
        loader.add_xpath('title', content_root + '/div[1]/h2/text()')
        loader.add_xpath(
            'news',
            content_root + '/div[2]/ul/li/div[1]/h4/a/text()',
            MapCompose(unicode.strip, unicode.title))
        loader.add_xpath('toptitle', '//*[@class="hd"]/h2/text()')
        loader.add_xpath('topnews', '//*[@class="bd"]//a/text()')

        # Housekeeping fields; 'server' is the address nba.hupu.com resolves to
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostbyname('nba.hupu.com')),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()

        """学习代码
示例#4
0
 def parse_pageItem(self, response):
     """Yield one item per joke node, then follow the pagination links.

     Every ``div.joke-list-item-main`` node produces its own
     ``PropertiesItem`` with ``description``, ``image_urls`` and ``vot``
     fields. Afterwards one request is yielded for each "next page" link,
     handled by this same callback.
     """
     for joke in response.xpath('//div[@class="joke-list-item-main"]'):
         # A fresh item per node: re-using one instance would let later
         # iterations overwrite data that has already been yielded.
         item = PropertiesItem()
         # NOTE(review): these queries start with '//' and therefore search
         # the whole document rather than just this node; switch them to
         # './/' once the page markup has been confirmed.
         item['description'] = joke.xpath(
             '//div[@class="joke-main-content clearfix"]/p/text()').extract()
         item['image_urls'] = joke.xpath(
             '//a/img[@class="joke-main-img"]/@src').extract()
         item['vot'] = joke.xpath(
             '//div/div/div/a[@data="g"]/text()').extract()
         yield item

     # Pagination is a page-level concern, so it is resolved once, after the
     # items, instead of once per joke node.
     next_links = response.xpath(
         '//a[@class="pagination-link pagination-next"]/@href').extract()
     for href in next_links:
         # The request has to be yielded, otherwise it is never scheduled.
         yield Request('https://www.haha.mx' + href,
                       callback=self.parse_pageItem,
                       method='GET')
示例#5
0
    def parse_item(self, response):
        """ Load a property page into a PropertiesItem and yield it.

        The title is taken from ``response.meta['title']`` rather than from
        the page markup.

        @url http://localhost:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        def drop_commas(text):
            return text.replace(',', '')

        def unwrap_lines(text):
            return text.replace('\r\n', ' ')

        def absolute(link):
            return parse.urljoin(response.url, link)

        item_loader = ItemLoader(item=PropertiesItem(), response=response)

        # Fields extracted from the page
        item_loader.add_xpath(
            'price',
            '//*[@itemprop="price"][1]/text()',
            MapCompose(drop_commas, float),
            re='[,.0-9]+')
        item_loader.add_xpath(
            'description',
            '//*[@itemprop="description"][1]/text()',
            MapCompose(str.strip, unwrap_lines))
        item_loader.add_xpath(
            'address',
            '//*[@itemtype="http://schema.org/Place"][1]/text()',
            MapCompose(str.strip))
        item_loader.add_xpath(
            'image_urls',
            '//*[@itemprop="image"][1]/@src',
            MapCompose(absolute))

        # Title handed over by the request, followed by housekeeping fields
        item_loader.add_value(
            'title',
            response.meta['title'],
            MapCompose(str.strip, str.title))
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            item_loader.add_value(field, value)

        yield item_loader.load_item()
示例#6
0
    def parse_item(self, response):
        """ Load a product page into a PropertiesItem.

        @url http://pdrfinessetools.com/index.php?route=extension/list/latest
        @returns items 1
        @scrapes product_title price description product_code image_urls
        @scrapes url project spider server date
        """
        def absolute(link):
            return urllib.parse.urljoin(response.url, link)

        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Fields extracted with XPath; several live under the content node
        content = '//*[@id="content"]'
        loader.add_xpath(
            'product_title',
            content + '//h1/text()',
            MapCompose(str.strip, str.title))
        loader.add_xpath('price', '//*[@class="list-unstyled"]//h2/text()')
        loader.add_xpath(
            'description',
            '//*[@id="tab-description"]//text()',
            MapCompose(str.strip),
            Join())
        loader.add_xpath(
            'product_code',
            content + '//*[@class="list-unstyled"]//li[2]/text()',
            MapCompose(str.strip))
        loader.add_xpath(
            'image_urls',
            content + '//a[@class="thumbnail"]//img/@src',
            MapCompose(absolute))

        # Housekeeping fields
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()
示例#7
0
    def parse(self, response):
        """Load the page title and its price (converted to float) into an item."""
        loader = ItemLoader(item=PropertiesItem(), response=response)

        # (field, XPath, processors) for every value read from the page
        field_specs = (
            ("title", '//*[@class="title"]/text()', ()),
            ("price", '//*[@class="trl-item sty1"]/i/text()',
             (MapCompose(float),)),
        )
        for field, xpath, processors in field_specs:
            loader.add_xpath(field, xpath, *processors)

        return loader.load_item()
示例#8
0
    def parse(self, response):
        """Load a property page into a PropertiesItem and return it."""
        def absolute(link):
            return urllib.parse.urljoin(response.url, link)

        def without_commas(text):
            return text.replace(',', '')

        loader = ItemLoader(item=PropertiesItem(), response=response)

        # (field, XPath, processors, extra keyword options) per page field
        page_fields = (
            ('title', '//*[@itemprop="name"]/text()',
             (MapCompose(str.strip, str.title),), {}),
            ('price', './/*[@itemprop="price"][1]/text()',
             (MapCompose(without_commas, float),), {'re': '[,.0-9]+'}),
            ('description', '//*[@itemprop="description"]/text()',
             (MapCompose(str.strip), Join()), {}),
            ('address', '//*[@itemtype="http://schema.org/Place"]/span/text()',
             (MapCompose(str.strip),), {}),
            ('image_urls', '//*[@itemprop="image"]/@src',
             (MapCompose(absolute),), {}),
        )
        for field, xpath, processors, options in page_fields:
            loader.add_xpath(field, xpath, *processors, **options)

        # Housekeeping fields
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()
示例#9
0
    def parse_item(self, response):
        """Load a property page into a PropertiesItem.

        @url http://localhost:9312/properties/property_000001.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        def absolute(link):
            return urlparse.urljoin(response.url, link)

        def without_commas(text):
            return text.replace(',', '')

        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Fields extracted from the page
        loader.add_xpath(
            'title',
            '//*[@itemprop="name"][1]/text()',
            MapCompose(unicode.strip, unicode.title))
        loader.add_xpath(
            'price',
            './/*[@itemprop="price"][1]/text()',
            MapCompose(without_commas, float),
            re='[,.0-9]+')
        loader.add_xpath(
            'description',
            '//*[@itemprop="description"][1]/text()',
            MapCompose(unicode.strip),
            Join())
        loader.add_xpath(
            'address',
            '//*[@itemtype="http://schema.org/Place"][1]/text()',
            MapCompose(unicode.strip))
        loader.add_xpath(
            'image_urls',
            '//*[@itemprop="image"][1]/@src',
            MapCompose(absolute))

        # Housekeeping fields
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', gethostname()),
            ('date', datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()
示例#10
0
    def parse_item(self, response):
        """ Load a property page into a PropertiesItem.

        A falsy ``response`` is logged, after which loading proceeds anyway.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        def absolute(link):
            return urljoin(response.url, link)

        def without_commas(text):
            return text.replace(',', '')

        if not response:
            self.log("RESPONSE IS NONE")

        loader = ItemLoader(item=PropertiesItem(), response=response)

        # (field, XPath, processors, extra keyword options) per page field
        page_fields = (
            ('title', '//*[@itemprop="name"][1]/text()',
             (MapCompose(str.strip, str.title),), {}),
            ('price', './/*[@itemprop="price"][1]/text()',
             (MapCompose(without_commas, float),), {'re': '[,.0-9]+'}),
            ('description', '//*[@itemprop="description"][1]/text()',
             (MapCompose(str.strip), Join()), {}),
            ('address', '//*[@itemtype="http://schema.org/Place"][1]/text()',
             (MapCompose(str.strip),), {}),
            ('image_urls', '//*[@itemprop="image"][1]/@src',
             (MapCompose(absolute),), {}),
        )
        for field, xpath, processors, options in page_fields:
            loader.add_xpath(field, xpath, *processors, **options)

        # Housekeeping fields
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()
示例#11
0
 def parse_item(self, response):
     """Load title, price (as float) and the main image URL into an item."""
     loader = ItemLoader(item=PropertiesItem(), response=response)

     # (field, XPath, processors) for every value read from the page
     field_specs = (
         ("title", '//*[@class="tab-cont clearfix"]/h1/text()', ()),
         ("price", '//*[@class="trl-item sty1"]/i/text()',
          (MapCompose(float),)),
         ("image_url", '//*[@class="bigImg"]/img[1]/@src', ()),
     )
     for field, xpath, processors in field_specs:
         loader.add_xpath(field, xpath, *processors)

     return loader.load_item()
示例#12
0
    def parse(self, response):
        """ Load a rental listing page into a PropertiesItem.

        @url https://www.gumtree.com/p/property-to-rent/one-bedroom-property-near-chiswick-park-tube-station./1405437559
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project server spider date
        """
        def absolute(link):
            return urljoin(response.url, link)

        def without_commas(text):
            return text.replace(',', '')

        item_loader = ItemLoader(item=PropertiesItem(), response=response)

        # (field, XPath, processors, extra keyword options) per page field
        page_fields = (
            ('title', '//h1[@class="css-1uk1gs8 e1pt9h6u3"]/text()',
             (MapCompose(str.strip),), {}),
            ('price', '//h2[@itemprop="price"]/text()',
             (MapCompose(without_commas, float),), {'re': '[,.0-9]+'}),
            ('description', '//p[@itemprop="description"]/text()',
             (MapCompose(str.strip), Join()), {}),
            ('address', '//h4[@itemprop="addressLocality"]/text()',
             (MapCompose(str.strip),), {}),
            ('image_urls', '//*[@class="carousel-item"]/img/@src',
             (MapCompose(absolute),), {}),
        )
        for field, xpath, processors, options in page_fields:
            item_loader.add_xpath(field, xpath, *processors, **options)

        # Housekeeping fields
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            item_loader.add_value(field, value)

        return item_loader.load_item()
示例#13
0
 def parse_item(self, response):
     """Load title and price from the page plus the URL passed in ``meta``.

     ``nextpage_url2`` is filled from ``response.meta["urlll"]``.
     """
     loader = ItemLoader(item=PropertiesItem(), response=response)

     # (field, XPath, processors) for every value read from the page
     field_specs = (
         ("title", '//*[@class="tab-cont clearfix"]/h1/text()', ()),
         ("price", '//*[@class="trl-item sty1"]/i/text()',
          (MapCompose(float),)),
     )
     for field, xpath, processors in field_specs:
         loader.add_xpath(field, xpath, *processors)

     loader.add_value("nextpage_url2", response.meta["urlll"])
     return loader.load_item()
示例#14
0
class BasicSpider(scrapy.Spider):
    """Spider that loads the title of one blog post into a PropertiesItem."""

    name = "basictest"
    # NOTE(review): "web" does not match the host of the start URL below —
    # confirm this is the intended domain restriction.
    allowed_domains = ["web"]
    start_urls = (
        'https://developers.facebook.com/blog/post/2021/01/26/introducing-instagram-content-publishing-api/?utm_source=email&utm_medium=fb4d-newsletter-february21&utm_campaign=organic&utm_offering=business-tools&utm_product=instagram&utm_content=body-button-instagram-graph-API&utm_location=2',
    )

    def parse(self, response):
        """Load the post title and the housekeeping fields from the response.

        @url https://developers.facebook.com/blog/post/2021/01/26/introducing-instagram-content-publishing-api/?utm_source=email&utm_medium=fb4d-newsletter-february21&utm_campaign=organic&utm_offering=business-tools&utm_product=instagram&utm_content=body-button-instagram-graph-API&utm_location=2
        @returns items 1
        @scrapes title
        @scrapes url project spider server date
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        loader.add_xpath(
            'title',
            '/html/body/div[1]/div[5]/div[2]/div/div/div/div[2]/div[2]/div[2]/div[1]/div/div/div[2]/div/div/p[1]/text()',
            MapCompose(unicode.strip, unicode.title))
        # price and description are not loaded by this spider:
        # loader.add_xpath('price', './/*[@itemprop="price"][1]/text()',
        #                  MapCompose(lambda i: i.replace(',', ''), float),
        #                  re='[,.0-9]+')
        # loader.add_xpath('description',
        #                  '//*[@itemprop="description"][1]/text()',
        #                  MapCompose(unicode.strip), Join())

        # Housekeeping fields
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())
        return loader.load_item()
示例#15
0
    def parse_item(self, response):
        """Load an ad page into a PropertiesItem and return it."""
        def absolute(link):
            return urlparse.urljoin(response.url, link)

        def without_commas(text):
            return text.replace(',', '')

        loader = ItemLoader(item=PropertiesItem(), response=response)

        # (field, XPath, processors, extra keyword options) per page field
        page_fields = (
            ('title', '//h1[@id="ad-title"]/text()',
             (MapCompose(unicode.strip, unicode.title),), {}),
            ('price', '//strong[contains(@class, "ad-price")]/text()',
             (MapCompose(without_commas, float),), {'re': '[,.0-9]+'}),
            ('description', '//p[@class="ad-description"][1]/text()',
             (MapCompose(unicode.strip), Join()), {}),
            ('address', '//span[@itemprop="address"]/text()',
             (MapCompose(unicode.strip),), {}),
            ('image_urls', '/descendant::img[@itemprop="image"][1]/@src',
             (MapCompose(absolute),), {}),
        )
        for field, xpath, processors, options in page_fields:
            loader.add_xpath(field, xpath, *processors, **options)

        # Housekeeping fields
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        return loader.load_item()


#        item = PropertiesItem()
#        item['title'] =  response.xpath('//h1[@id="ad-title"]/text()').extract()
#        item['price'] = response.xpath('//strong[contains(@class, "ad-price")]/text()').extract()[0].strip()
#        item['description'] = response.xpath('//p[@class="ad-description"][1]/text()').extract()
#        item['address'] = response.xpath('//span[@itemprop="address"]/text()').extract()
#        item['image_urls'] = response.xpath('/descendant::img[@itemprop="image"][1]/@src').extract()
#        return item
示例#16
0
    def parse(self, response):
        """Return one item per star link found in the ``star_hotstar`` block.

        For every matching anchor the first run of digits in its ``href`` is
        used as the star id, and the text of the anchor that follows it is
        mapped to a sex code: 1 for '男', 0 for '女', -1 for anything else.

        Returns:
            list: the items that could be built. A link that fails to parse
            is reported with ``print('error')`` and skipped.
        """
        sex_codes = {'男': 1, '女': 0}
        sex_xpath = (
            '//div[@class="star_hotstar"]//a[@class="name_hotstar" and @href="http://www.happyjuzi.com/star-{}/"]/following-sibling::a[1]/text()'
        )

        hrefs = response.xpath(
            '//div[@class="star_hotstar"]//a[@class="name_hotstar"]/@href'
        ).extract()

        return_items = []
        for href in hrefs:
            try:
                item = PropertiesItem()
                # Raw string so "\d" is a regex class, not a string escape.
                star_id = re.findall(r"\d+", href)[0]

                sex = response.xpath(
                    sex_xpath.format(star_id)).extract_first()

                item['id'] = star_id
                # Unknown or missing text falls back to -1.
                item['sex'] = sex_codes.get(sex, -1)

                print('id:{}'.format(star_id))
                return_items.append(item)
            except Exception:
                # Best effort: one malformed link must not abort the page.
                # ``Exception`` (rather than a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                print('error')

        return return_items
示例#17
0
	def parse(self, response):
		"""Yield a PropertiesItem with price, title, address and bookkeeping fields.

		``price`` is selected with a CSS selector, ``title`` and ``address``
		with XPath. ``url``, ``spider``, ``server`` and ``h_date`` are set
		from values computed in Python. The loaded item is printed before it
		is yielded.
		"""
		dl = ItemLoader(item=PropertiesItem(), response=response)

		# CSS selector, locating the price by its class attribute
		dl.add_css('price', '.price strong::text')

		# XPath selectors
		dl.add_xpath('title', '//*[@id="baseinfo_top_layout"]/div[3]/div[2]/div/div[1]/h1/text()')
		dl.add_xpath('address', '//*[@id="baseinfo_top_layout"]/div[3]/div[2]/div/ul[2]/li[1]/text()')

		# add_value() stores values computed in Python rather than the
		# result of an XPath or CSS expression.
		dl.add_value('url', response.url)
		dl.add_value('spider', self.name)
		dl.add_value('server', socket.gethostname())
		dl.add_value('h_date', datetime.datetime.now())

		# Load once and reuse the result for both the print and the yield.
		item = dl.load_item()
		print(item)
		yield item
示例#18
0
    def parse(self, response):
        """Load a product page into a PropertiesItem.

        Extracts title, price (as a float), description, availability and
        image URLs from the page, then adds the housekeeping fields.
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)

        loader.add_xpath('title', '//*[@id="productTitle"][1]/text()',
                         MapCompose(unicode.strip, unicode.title))
        # The pattern has to admit commas so that a price such as "1,299.00"
        # is captured as one match; the processor then removes the commas
        # before the float conversion.
        loader.add_xpath('price', '//*[@id="priceblock_ourprice"][1]/text()',
                         MapCompose(lambda i: i.replace(',', ''), float),
                         re='[,.0-9]+')
        loader.add_xpath('description',
                         '//*[@id="productDescription"]//p/text()',
                         MapCompose(unicode.strip), Join())
        loader.add_xpath('availability',
                         '//*[@id="availability"]//span/text()',
                         MapCompose(unicode.strip))
        loader.add_xpath('image_urls', '//*[@id="imgTagWrapperId"][1]/@src',
                         MapCompose(lambda i: urlparse.urljoin(response.url, i)))

        # Housekeeping fields
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())

        return loader.load_item()
    def parse(self, response):
        """Yield one item per table row, then follow the first "volgende" link.

        Rows are the ``tr`` elements with an ``id`` inside ``tbody``; each
        field XPath is evaluated relative to its row.
        """
        # (field, row-relative XPath, extra keyword options) per column
        column_specs = (
            ('build', './td[3]/text()', {}),
            ('city', './td/text()[following-sibling::br]', {}),
            ('house_number', './td/a/strong/text()', {'re': '[0-9]+.'}),
            ('living_space', './td[4]/text()', {}),
            ('plot_space', './td[5]/text()', {}),
            ('postal_code', './td/text()[following-sibling::br]', {}),
            ('price', './td/strong/text()', {}),
            ('street', './td/a/strong/text()',
             {'re': r'\D+\S[A-Za-z][^- ]'}),
        )

        for table_row in response.xpath('//tbody/tr[@id]'):
            row_loader = ItemLoader(item=PropertiesItem(),
                                    selector=table_row,
                                    response=response)
            for field, xpath, options in column_specs:
                row_loader.add_xpath(field, xpath, **options)
            yield row_loader.load_item()

        # Continue with the next index page, when the page links to one
        next_links = response.xpath(
            '//a[contains(@class, "volgende")]//@href').extract()
        if next_links:
            yield response.follow(next_links[0], callback=self.parse)
示例#20
0
    def parse_item(self, response):
        """Yield an item holding the page URL and the first ``<title>`` text."""
        item = PropertiesItem()
        item['url'] = response.url

        head_titles = response.xpath('/html/head/title/text()').extract()
        item['title'] = head_titles[0]

        yield item
示例#21
0
    def parse(self, response):
        """ Log the basic fields of a property page and load them into an item.

        @url http://web:3912/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        title_xpath = '//*[@itemprop="name"][1]/text()'
        price_xpath = '//*[@itemprop="price"][1]/text()'
        description_xpath = '//*[@itemprop="description"][1]/text()'
        # NOTE(review): matched on @itemtype because the schema.org URL names
        # an item type rather than a property — confirm against the markup.
        address_xpath = '//*[@itemtype="http://schema.org/Place"][1]/text()'
        image_xpath = '//*[@itemprop="image"][1]/@src'
        # Commas are part of the pattern so "1,234.50" is one match; the
        # price processor below strips them before converting to float.
        price_pattern = '[,.0-9]+'

        # Log the raw values through the spider's own log() method
        self.log("title: %s" % response.xpath(title_xpath).extract())
        self.log("price: %s" % response.xpath(price_xpath).re(price_pattern))
        self.log("description: %s"
                 % response.xpath(description_xpath).extract())
        self.log("address: %s" % response.xpath(address_xpath).extract())
        self.log("image_urls: %s" % response.xpath(image_xpath).extract())

        # Each field is added exactly once, with its processors, so the item
        # does not end up with duplicated raw and cleaned values.
        loader = ItemLoader(item=PropertiesItem(), response=response)
        loader.add_xpath('title', title_xpath,
                         MapCompose(unicode.strip, unicode.title))
        loader.add_xpath('price', price_xpath,
                         MapCompose(lambda i: i.replace(',', ''), float),
                         re=price_pattern)
        loader.add_xpath('description', description_xpath,
                         MapCompose(unicode.strip), Join())
        loader.add_xpath('address', address_xpath,
                         MapCompose(unicode.strip))
        loader.add_xpath('image_urls', image_xpath,
                         MapCompose(lambda i: urlparse.urljoin(response.url, i)))

        # Housekeeping fields set with add_value()
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        # gethostname must be called; passing the function object would
        # store the function itself instead of the host name.
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())
        return loader.load_item()
示例#22
0
    def parse(self, response):
        """Load every ``title`` attribute and the matching image URLs into an item.

        Image URLs are the ``src`` values that start with
        ``http://image.woshipm.com``, stripped of surrounding whitespace.
        """
        def trimmed(value):
            return value.strip()

        loader = ItemLoader(item=PropertiesItem(), response=response)
        loader.add_xpath("title", "//*[@title]/@title")
        loader.add_xpath(
            "image_urls",
            "//*[starts-with(@src,'http://image.woshipm.com')]/@src",
            MapCompose(trimmed))
        return loader.load_item()
示例#23
0
 def parse_item(self, selector, response):
     """Build an item from one listing node.

     Title, price and item link are read relative to ``selector``; the link
     is joined with ``response.url``, which is also stored as
     ``nextpage_url``.
     """
     def make_url(link):
         return urllib.parse.urljoin(response.url, link)

     title_link = './/*[@class="title"]/a'

     loader = ItemLoader(item=PropertiesItem(), selector=selector)
     loader.add_xpath("title", title_link + '/text()')
     loader.add_xpath("price", './/*[@class="price"]/text()',
                      MapCompose(float))
     loader.add_xpath("item_url", title_link + '/@href',
                      MapCompose(make_url))
     loader.add_value("nextpage_url", response.url)
     return loader.load_item()
示例#24
0
    def parse_item(self, response):
        """Load the article's lead image URL, joined with the page URL."""
        def absolute(link):
            return urlparse.urljoin(response.url, link)

        loader = ItemLoader(item=PropertiesItem(), response=response)
        loader.add_xpath(
            'image_urls',
            '//*[@class="artical-importantPic"][1]/img/@src',
            MapCompose(absolute))
        return loader.load_item()
示例#25
0
    def parse2(self, response):
        """Print every ``txt-box`` node, then return an item with fixed values.

        The returned item carries constant ``title`` and ``describe`` values;
        nothing from the page is stored in it.
        """
        for markup in response.xpath('//*[@class="txt-box"]').extract():
            print('hi', markup)

        loader = ItemLoader(item=PropertiesItem(), response=response)
        for field, text in (("title", '题目'), ("describe", '介绍')):
            loader.add_value(field, text)

        return loader.load_item()
示例#26
0
    def parse_item(self, response):
        """Load the page URL and a cleaned, joined text field into an item.

        ``link`` is set to ``response.url``. ``text`` values are stripped and
        title-cased one by one, then joined into a single string.
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)

        loader.add_value('link', response.url)

        # NOTE(review): 'xpath表达式' looks like a placeholder for a real
        # XPath expression — confirm before running this spider.
        loader.add_xpath('text', 'xpath表达式',
                         MapCompose(unicode.strip, unicode.title), Join())

        return loader.load_item()
示例#27
0
    def parse(self, response):
        """Log a start message and yield an item with only housekeeping fields."""
        self.log("I am running")

        item = PropertiesItem()
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field, value in housekeeping:
            item[field] = value

        yield item
示例#28
0
    def parse(self, response):
        """Return an item with the star id and, when present, its image URL.

        ``id`` is the first run of digits in the request URL. ``key_src`` is
        the ``src`` of the ``i_starimg_starindex`` image with any query
        string removed; it is left unset when the page has no such image.

        Raises:
            IndexError: if the request URL contains no digits.
        """
        item = PropertiesItem()
        # Raw string so "\d" is a regex class, not a string escape.
        item['id'] = re.findall(r"\d+", response.request.url)[0]

        img_src = response.xpath(
            '//img[@class="i_starimg_starindex"]/@src').extract_first()
        if img_src is None:
            # No image on the page: return the id-only item explicitly
            # instead of relying on a swallowed TypeError.
            return item

        # Keep only the part before the first '?', if there is one.
        item['key_src'] = img_src.partition('?')[0]
        print('success:{}'.format(item['id']))
        return item
示例#29
0
文件: basic.py 项目: YoonaX/A-_Scrapy
    def parse(self, response):
        """ Store the page's ``<title>`` texts on an item and print them.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        page_titles = response.xpath('//title/text()').extract()

        item = PropertiesItem()
        item['title'] = page_titles
        print(item['title'])
示例#30
0
    def parse_item(self, response):
        """Print the page URL, then load it and the card text into an item.

        ``text`` is every text node under the ``vulners-card-text`` div,
        stripped and joined into one string.
        """
        page_url = response.url
        print(page_url)

        item_loader = ItemLoader(item=PropertiesItem(), response=response)
        item_loader.add_value('link', page_url)
        item_loader.add_xpath(
            'text',
            '//div[@class="vulners-card-text"]//text()',
            MapCompose(str.strip),
            Join())

        return item_loader.load_item()