def parse_rc(self, response):
    """Parse a reading-comprehension question page, merge it with its
    stored article file, and write the combined HTML page to disk.

    Returns the loaded item so downstream pipelines can process it.
    """
    loader = XPathItemLoader(item=ParseRcItem(), response=response)
    question_id = self.parse_id_from_url(response.url)
    loader.add_value('questionId', question_id)
    loader.add_xpath('text', '//div[@class="text"]/text()')
    loader.add_xpath('text', '//div[@class="text"]/span/text()')
    loader.add_xpath('answerList', '//div[@class="item clearfix"]/span/text()')
    loader.add_xpath('choiceList', '//div[@class="item clearfix"]/b/text()')
    loader.add_xpath('answer', '//div[@class="answer clearfix hidden QuesHidden"]/b/text()')
    # loader.add_xpath('explanation','//div[@id="DivExplain"]')
    item = loader.load_item()
    if len(item['text']) == 3:
        # Three fragments mean part of the sentence is underlined; the page
        # yields them in the order [head, tail, underlined-part].
        test = (item['text'][0]
                + '<span style="text-decoration:underline;">'
                + item['text'][2] + '</span>' + item['text'][1])
    else:
        test = item['text'][0]
    for filename in self.fileList:
        if filename.find(question_id) == -1:
            continue
        # BUG FIX: the original called "f.close" without parentheses, which
        # never closed the file.  A context manager closes it even on error.
        with open('/home/huwei/origin/rcarticle/' + filename) as f:
            artile = f.read()
        # artile[24:-4] strips the wrapping markup saved around the article.
        content = self.rc_content.format(
            artile[24:len(artile) - 4],
            item['questionId'][0],
            item['questionId'][0], test,
            item['questionId'][0], item['choiceList'][0], item['choiceList'][0], item['answerList'][0],
            item['questionId'][0], item['choiceList'][1], item['choiceList'][1], item['answerList'][1],
            item['questionId'][0], item['choiceList'][2], item['choiceList'][2], item['answerList'][2],
            item['questionId'][0], item['choiceList'][3], item['choiceList'][3], item['answerList'][3],
            item['questionId'][0], item['choiceList'][4], item['choiceList'][4], item['answerList'][4],
            item['questionId'][0], item['answer'][0])
        with open('/home/huwei/gmatclub/rc/' + question_id + '.html', 'w') as wf:
            wf.write(content)
    return item
def get_question(self, selector, response):
    """Load a LazyTweetQuestion item from a post *selector*.

    Relative XPaths are dot-prefixed so both select() and the joined
    expressions search from the selector's own node, not the document root.
    """
    question_loader = XPathItemLoader(item = LazyTweetQuestion(), \
        selector = selector)
    # Question body text: the post body plus any status span inside it.
    question_loader.add_xpath(
        'question_content', ''.join([
            './/span[@class="post-body"]',
            '//span[@class="post-status"]/descendant-or-self::text()'
        ]))
    # not useful -- tags live outside the post node, hence the absolute path
    question_loader.add_xpath(
        'question_tags', ''.join(['//*[@id="post-tags"]/ul/li/a/text()']))
    question_loader.add_xpath(
        'asking_date', ''.join([
            './/span[@class="post-meta"]//span[@class="timestamp"]/text()'
        ]))
    # The asker is parsed out of the post-meta block by get_user().
    question_loader.add_value(
        'asker', self.get_user(
            selector.select(''.join(['.//span[@class="post-meta"]']))))
    question_loader.add_xpath(
        'number_of_answers', ''.join(['.//span[@class="post-meta"]',
                                      '//a[last()]/text()']))
    # The question id is the trailing path segment of the page URL.
    question_loader.add_value('question_id', response.url.split('/')[-1])
    print question_loader.get_output_value('question_tags')  # debug trace
    return question_loader.load_item()
def process_item(self, task_id):
    """Parse the stored full-report text for *task_id* into an
    NrcParsedReport item.

    Generator: yields the parsed item, then marks the task complete.
    """
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return
    text = report["full_report_body"]
    # Clamp every character to 7-bit ASCII (codepoints above 127 collapse
    # to chr(127)) so the regexes below operate on plain ASCII text.
    text = "".join(chr(min(ord(c), 127)) for c in text)
    t = TextResponse(url=report["full_report_url"],
                     body=text.encode("utf-8"))  # must have utf-8 here
    l = XPathItemLoader(NrcParsedReport(), response=t)
    l.add_value("reportnum", task_id)
    patterns = self.compile_patterns()
    # Each pattern is a (field_name, regex) pair applied to the raw text.
    for p in patterns:
        l.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])
    county = l.get_output_value("county")
    pattern = self.get_area_code_pattern(county)
    if pattern:
        l.add_value("areaid", county)
        # Prefer a block number qualified by the area-code pattern ...
        l.add_value("blockid", text, TakeFirst(), unicode.strip,
                    re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
        # ... falling back to any bare "BLOCK nnn" occurrence.
        l.add_value("blockid", text, TakeFirst(), unicode.strip,
                    re="BLOCK[\s]+([\d]+)")
    item = l.load_item()
    yield item
    self.item_completed(task_id)
def parse_five_chose_one(self, response): print 'parse_five_chose_One' loader = XPathItemLoader(item=ParseFiveSelectOneItem(), response=response) id = self.parse_id_from_url(response.url) loader.add_value('questionId', id) loader.add_xpath('text', '//div[@class="text"]/text()') loader.add_xpath('text', '//div[@class="text"]/span/text()') loader.add_xpath('answerList','//div[@class="item clearfix"]/span/text()') loader.add_xpath('choiceList','//div[@class="item clearfix"]/b/text()') loader.add_xpath('answer','//div[@class="answer clearfix hidden QuesHidden"]/b/text()') loader.add_xpath('explanation','//div[@id="DivExplain"]') item = loader.load_item() if len(item['text']) ==3: test = item['text'][0] + '<span style="text-decoration:underline;">' + item['text'][2] + '</span>'+ item['text'][1] else: test = item['text'][0] content = self.five_chose_one_content.format(item['questionId'][0],test, item['questionId'][0],item['choiceList'][0],item['choiceList'][0],item['answerList'][0], item['questionId'][0],item['choiceList'][1],item['choiceList'][1],item['answerList'][1], item['questionId'][0],item['choiceList'][2],item['choiceList'][2],item['answerList'][2], item['questionId'][0],item['choiceList'][3],item['choiceList'][3],item['answerList'][3], item['questionId'][0],item['choiceList'][4],item['choiceList'][4],item['answerList'][4], item['questionId'][0],item['answer'][0],item['explanation'][0][21:len(item['explanation'][0]) - 6]) wf = open('/home/huwei/gmatclub/ir/' + id + '.html','w') wf.write(content) wf.close() return item
def parse_item(self, response, loop, fields):
    """Generic configuration-driven item extractor.

    *loop* is an XPath selecting repeating nodes (defaults to the whole
    document); *fields* maps item field names to extraction specs with
    keys 'value' or 'xpath' plus optional 'parse', 'regex', 'default'
    and 'filter'.  Yields one item per node that passes every filter.
    """
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k, v in fields.iteritems():
            # A spec must provide either a literal 'value' or an 'xpath'.
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contains "value" or "xpath"'.
                        format(k), level=log.WARNING)
                continue
            # Expand macros, apply type conversion, optionally apply regex.
            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break  # any failing filter rejects the whole node
        else:
            # for/else: only yield when no field broke out (all passed).
            yield loader.load_item()
def parse_item(self, response, loop, fields):
    """Configuration-driven extractor: one item per *loop* node.

    Each entry in *fields* describes how to obtain one item field: either
    a literal 'value' or an 'xpath', with optional 'parse' converters,
    'regex', 'default' and 'filter' keys.  A node is yielded only when
    every field passes its filter.
    """
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL':response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k,v in fields.iteritems():
            # A spec must provide either a literal 'value' or an 'xpath'.
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contains "value" or "xpath"'.format(k),
                        level=log.WARNING)
                continue
            # Expand macros, apply converters, optionally apply a regex.
            val = get_v_x(
                self.macro.expand(v_x),
                utils.convert_type(v.get('parse', {})),
                re=v.get('regex')
            )
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break  # a failing filter rejects the whole node
        else:
            # for/else: reached only when the inner loop did not break.
            yield loader.load_item()
def parse_sale(self, response):
    """Populate a SaleItem from a single property-sale detail page."""
    l = XPathItemLoader(item=SaleItem(), response=response)
    l.add_value('url', response.url)
    # (field, xpath, regex) triples; a regex of None means plain extraction.
    field_specs = (
        ('address', '//h1[@class="address"]/text()', None),
        ('price', '//div[@class="price"]/text()', None),
        ('sale_date', '//th[text()="Last sale:"]/../td/div[last()]/text()', r'on (\w+)'),
        ('bedrooms', '//th[text()="Bedrooms:"]/../td/text()', None),
        ('bathrooms', '//th[text()="Bathrooms:"]/../td/text()', r'(\d+)'),
        ('powder_rooms', '//th[text()="Bathrooms:"]/../td/text()', r', (\d+)'),
        ('property_type', '//th[text()="Property type:"]/../td/text()', None),
        ('size', '//th[text()="Size:"]/../td/text()', r'([\d|,]+) sqft'),
        ('lot', '//th[text()="Lot:"]/../td/text()', None),
        ('price_per_sf', '//th[text()="Price/sqft:"]/../td/text()', None),
        ('year_built', '//th[text()="Year built:"]/../td/text()', None),
        ('public_records', 'id("property_public_info_module")/ul/li/span/text()', None),
    )
    for field, xpath, regex in field_specs:
        if regex is None:
            l.add_xpath(field, xpath)
        else:
            l.add_xpath(field, xpath, re=regex)
    return l.load_item()
def parse(self, response):
    """Discover France Info podcast programmes; yield one PodcastItem per
    programme block found on the listing page."""
    x = HtmlXPathSelector(response)
    programs = x.select('//div[@class="item-list"]/ul/li[contains(@class,"views-row")]/div/div/div')
    podcastCount = str(len(programs))
    i = 0
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        # The RSS href doubles as a stable per-brand identifier.
        l.add_xpath('id', 'concat("fri_", .//li/a[@class="rss"]/@href)')
        l.add_value('type', 'disco')
        l.add_xpath('brandId', './/li/a[@class="rss"]/@href')
        l.add_xpath('brandFeed', 'concat("http://www.franceinfo.fr", .//li[contains(@class,"link_rss")]/a[@class="rss"]/@href)')
        l.add_xpath('brandName', './/h3/a/text()')
        l.add_xpath('brandTimes', './/div[@class="views-field-field-emission-texte-diffusion-value"]/text()')
        l.add_xpath('brandDescription', './/div[@class="views-field-field-emission-desc-courte-value"]/p/text()')
        # BUG FIX: 'brandHomepage' was added five times (copy-paste
        # residue); a single add_xpath is sufficient.
        l.add_xpath('brandHomepage', './/h3/a/@href')
        l.add_value('channelId', 'franceinfo')
        l.add_xpath('channelName', '//head/meta[@property="og:site_name"]/@content')
        l.add_xpath('channelDescription', '//head/meta[@property="og:description"]/@content')
        l.add_xpath('channelImage', '//div[@id="header"]/div/span/a/img/@src')
        self.log('Discovering fri [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        yield l.load_item()
def parse(self, response):
    """ Default callback used by Scrapy to process downloaded responses """
    selector = HtmlXPathSelector(response)
    # Split the request's query string into a {param: value} dict.
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: (x.split('=')[1]) for x in details.query.split("&")}
    # Progress trace: the decoded 'p[]' filter value and the page offset.
    print "\n", (urllib.unquote(queryStr['p%5B%5D']).split("=")[1]), queryStr['start']
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(flipkartData(), selector=deal)
        # define processors: strip every extracted string, join the results
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()
def get_question(self, selector, response): # both select function and selector's join function need to add dot to search from relative based directory question_loader = XPathItemLoader(item = LazyTweetQuestion(), \ selector = selector) question_loader.add_xpath('question_content', ''.join([ './/span[@class="post-body"]', '//span[@class="post-status"]/descendant-or-self::text()' ])) # not useful question_loader.add_xpath('question_tags', ''.join([ '//*[@id="post-tags"]/ul/li/a/text()' ])) question_loader.add_xpath('asking_date', ''.join([ './/span[@class="post-meta"]//span[@class="timestamp"]/text()' ])) question_loader.add_value('asker', self.get_user(selector.select(''.join([ './/span[@class="post-meta"]' ])))) question_loader.add_xpath('number_of_answers', ''.join([ './/span[@class="post-meta"]', '//a[last()]/text()' ])) question_loader.add_value('question_id', response.url.split('/')[-1]) print question_loader.get_output_value('question_tags') return question_loader.load_item()
def parse_materials(self, response):
    """Scrape the materials table of an NRC report page.

    Yields one NrcScrapedMaterial item per data row, then marks the bot
    task for this report as DONE.
    """
    reportnum = response.request.meta['reportnum']
    text = unicode (response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select ('//table[@class="t16Standard"]/tr')
    if (len(materials) == 0):
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
    else:
        # Skip the first report record because this is the header row
        materials.pop (0)
        if (len(materials) == 0):
            self.log('No materials reports found in response {0}'
                     .format(reportnum), log.INFO)
        else:
            self.log('Retrieved {0} materials records in report {1}'
                     .format(len(materials), reportnum), log.INFO)
            for material in materials:
                l = XPathItemLoader(NrcScrapedMaterial(), material)
                # Truncate every 'name' value to 32 characters
                # (NOTE(review): limit source not visible here -- presumably
                # a DB column width; confirm).
                l.name_in = lambda slist: [s[:32] for s in slist]
                l.add_value('reportnum', reportnum)
                # Any item field declaring an 'xpath' meta key is extracted
                # directly from the table row.
                for name, params in NrcScrapedMaterial.fields.items():
                    if 'xpath' in params:
                        l.add_xpath(name, params['xpath'])
                item = l.load_item()
                yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def parse(self, response):
    """Assemble a SearchResultItem from an auction search results page."""
    hxs = HtmlXPathSelector(response)
    name_xpath = "//input[contains(concat(' ', @class, ' '), ' search-within ')]/@value"
    item_name = hxs.select(name_xpath).extract()
    # Stable id: digest of auction id, extracted name and site name.
    item_hash = hashlib.md5(
        '%s::%s::%s' % (self.auction_id, item_name, self.name)).hexdigest()
    # Keep only the non-empty price strings, stripped of whitespace.
    raw_prices = hxs.select("//div[2]//div[2]/text()").extract()
    item_price = [p for p in (s.strip() for s in raw_prices) if p]
    loader = XPathItemLoader(item=SearchResultItem(), response=response)
    loader.add_value("id", item_hash)
    loader.add_value("auction_id", self.auction_id)
    loader.add_value("site", self.name)
    loader.add_xpath("name", name_xpath)
    loader.add_value("link", response.url)
    loader.add_value("price", item_price)
    return loader.load_item()
def parse(self, response):
    """ Default callback used by Scrapy to process downloaded responses """
    selector = HtmlXPathSelector(response)
    # Split the request's query string into a {param: value} dict.
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: (x.split('=')[1]) for x in details.query.split("&")}
    # Progress trace: the current page number.
    print "\n", queryStr['page']
    # iterate over deals
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(JabongData(), selector=deal)
        # define processors: strip every extracted string, join the results
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()
def parse(self, response):
    """Yield one LeboncoinItem per classified ad on a listing page.

    (Removed a large block of commented-out legacy code that built items
    by hand and wrote HTML snippets to /tmp -- superseded by the loader.)
    """
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select('//div[@class="list-ads"]/a'):
        loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
        # Title text, trimmed of surrounding whitespace by the regex.
        loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
        loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
        loader.add_xpath('url', '@href')
        # Category is the 4th-from-last URL path segment.
        loader.add_value('category', response.url.split("/")[-4])
        yield loader.load_item()
def process_item(self, task_id):
    """Parse the stored full report for *task_id*; yield the parsed item
    and then flag the task as completed."""
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return
    # Force 7-bit ASCII: any higher code point collapses to chr(127).
    ascii_text = ''.join(chr(min(ord(ch), 127)) for ch in report['full_report_body'])
    fake_response = TextResponse(url=report['full_report_url'],
                                 body=ascii_text.encode('utf-8'))  # must have utf-8 here
    loader = XPathItemLoader(NrcParsedReport(), response=fake_response)
    loader.add_value('reportnum', task_id)
    # Each compiled pattern is a (field_name, regex) pair.
    for spec in self.compile_patterns():
        loader.add_value(spec[0], ascii_text, TakeFirst(), unicode.strip, re=spec[1])
    county = loader.get_output_value('county')
    area_pattern = self.get_area_code_pattern(county)
    if area_pattern:
        loader.add_value('areaid', county)
        # Prefer an area-qualified block number, fall back to bare "BLOCK nnn".
        loader.add_value('blockid', ascii_text, TakeFirst(), unicode.strip,
                         re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % area_pattern)
        loader.add_value('blockid', ascii_text, TakeFirst(), unicode.strip,
                         re="BLOCK[\s]+([\d]+)")
    yield loader.load_item()
    self.item_completed(task_id)
def get_answer(self, selector, question_loader):
    """Build a YahooAnswer item from an answer *selector*.

    *question_loader* supplies the parent question's id.
    """
    answer_loader = XPathItemLoader(item = YahooAnswer(), selector = selector)
    answer_loader.add_xpath('answer_id', './@id')
    answer_loader.add_xpath('answer_content',
                            './/div[@class="qa-container"]//div[@class="content"]//text()')
    answer_loader.add_value('answerer', self.get_user(selector))
    answer_loader.add_value('question_id',
                            question_loader.get_output_value('question_id'))
    answer_loader.add_xpath('answering_date', ''.join([
        './/div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    # Two alternative page layouts for the rating counter.
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]/text()'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]//strong/text()'
    ]))
    # get the good number or bad number
    marks = answer_loader.get_output_value('marks')
    # BUG FIX: str.find() returns -1 when absent (truthy) and 0 when the
    # match is at position 0 (falsy), so `if marks.find('good'):` was
    # inverted in exactly those cases.  Compare against -1 explicitly.
    if marks.find('good') != -1:
        answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])
    #bad numbers
    # is best answer
    answer_class = selector.select('./@class').extract()[0]
    if answer_class.find('best') != -1:
        answer_loader.add_value('is_best_answer', 1)
    else:
        answer_loader.add_value('is_best_answer', 0)
    return answer_loader.load_item()
def parse(self, response):
    """Yield a PodcastItem for every programme outline in the OPML feed."""
    xml = XmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    programs = xml.select('//body/outline/outline')
    total = str(len(programs))
    count = 0
    allitems = []
    for program in programs:
        count = count + 1
        loader = XPathItemLoader(PodcastItem(), selector=program)
        # The feed URL doubles as a stable per-brand identifier.
        loader.add_xpath('id', 'concat("dpc_", ./@xmlUrl)')
        loader.add_value('audioType', 'disco')
        loader.add_xpath('brandId', './@xmlUrl')
        loader.add_xpath('brandFeed', './@xmlUrl')
        loader.add_xpath('brandName', './@title')
        loader.add_xpath('brandDescription', './@description')
        loader.add_xpath('brandHomepage', './@htmlUrl')
        self.log('Discovering dpc [%s of %s] feeds' % (count, total), level=log.INFO)
        yield loader.load_item()
def parse(self, response):
    """Yield GroupUserItems from a group member page and follow the
    'next' pagination link, with a regex fallback when XPath misses it."""
    page_url = response.url
    # The group name is the path segment right after ".../group/".
    group_name = page_url[page_url.find("group"):].split("/")[1]
    hxs = HtmlXPathSelector(response)
    for dl in hxs.select('//dl[@class="obu"]'):
        loader = XPathItemLoader(item=GroupUserItem(), selector=dl)
        loader.add_xpath("homepage", "dt/a/@href")
        loader.add_xpath("image", "dt/a/img/@src")
        loader.add_xpath("name", "dd/a/text()")
        loader.add_value("group", group_name)
        yield loader.load_item()
    links = hxs.select('//span[@class="next"]/a/@href').extract()
    for next_url in links:
        yield Request(next_url, callback=self.parse)
    if not links:
        # Fallback: grab the hidden pagination link with a regex.
        pattern = re.compile('<span class="next">.*?<a href="(.+?)">', re.S)
        match = pattern.search(response.body_as_unicode())
        if match:
            yield Request(match.group(1), callback=self.parse)
def parse_listing(self, response):
    """Load a ListingItem from a property listing detail page."""
    l = XPathItemLoader(item=ListingItem(), response=response)
    l.add_value("url", response.url)
    # (field, xpath, regex) triples; a regex of None means plain extraction.
    specs = (
        ("address", '//h1[@class="address"]/text()', None),
        ("price", '//div[@class="price"]/text()', None),
        ("bedrooms", '//th[text()="Bedrooms:"]/../td/text()', None),
        ("bathrooms", '//th[text()="Bathrooms:"]/../td/text()', r"(\d+)"),
        ("powder_rooms", '//th[text()="Bathrooms:"]/../td/text()', r", (\d+)"),
        ("property_type", '//th[text()="Property type:"]/../td/text()', None),
        ("size", '//th[text()="Size:"]/../td/text()', r"([\d|,]+) sqft"),
        ("lot", '//th[text()="Lot:"]/../td/text()', None),
        ("price_per_sf", '//th[text()="Price/sqft:"]/../td/text()', None),
        ("year_built", '//th[text()="Year built:"]/../td/text()', None),
        ("date_listed", '//th[text()="Added on Trulia:"]/../td/text()', None),
        ("mls_id", '//th[text()="MLS/ID:"]/../td/text()', None),
        ("descriptive_title", '//h2[@class="descriptive_title"]/text()', None),
        ("description", '//div[@class="listing_description_module"]/text()', None),
        ("additional_fields", 'id("property_listing_details_module")/ul/li/span/text()', None),
        ("public_records", 'id("property_public_info_module")/ul/li/span/text()', None),
    )
    for field, xpath, regex in specs:
        if regex is None:
            l.add_xpath(field, xpath)
        else:
            l.add_xpath(field, xpath, re=regex)
    return l.load_item()
def parse(self, response):
    """Return the Uruguay Bond Index as a one-element item list."""
    loader = XPathItemLoader(item=FinanceIndex(), response=response)
    loader.add_value("name", "Uruguay Bond Index")
    loader.add_value("unit", "bps")
    loader.add_xpath("value", "//span/text()")
    return [loader.load_item()]
def parse(self, response):
    """Return the London spot gold close as a one-element item list."""
    loader = XPathItemLoader(item=FinanceIndex(), response=response)
    loader.add_value("name", "Oro Spot Cierre Londres")
    loader.add_value("unit", "USD")
    loader.add_xpath("value", "//td[@bgcolor='#cccc99'][1]//text()")
    return [loader.load_item()]
def get_user(self, selector):
    """Load the LazyTweetUser whose link is the first anchor in *selector*."""
    loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    loader.add_xpath('twitter_username', './a[1]/text()')
    # Profile URL = Twitter base URL + extracted username.
    username = loader.get_output_value('twitter_username')
    loader.add_value('twitter_url', r'http://twitter.com/' + username)
    return loader.load_item()
def parse_doctor_detail(self, response): """ This function parses a sample response. Some contracts are mingled with this docstring. @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969 @returns items 1 1 @returns requests 0 0 """ hxs = HtmlXPathSelector(response) l = XPathItemLoader(CYDoctorItem(), hxs) l.add_xpath('_name', ("//div[@class='bdHd']/h1/text()")) shortdesc = hxs.select( "//div[@id='mainColumn']//p[@class='bdFt']/text()").extract() if len(shortdesc) == 1: shortdescStr = shortdesc[0].strip() words = shortdescStr.split() if len(words) == 3: l.add_value('title', words[0]) l.add_value('hospital', words[1]) l.add_value('specialty', words[2]) else: print("title/hostpital/special error.") l.add_xpath( 'specialtyDesc', "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()") l.add_xpath( 'personalInfo', "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()") l.add_xpath('stars', "//p[@class='right starTxt']/text()") answer = hxs.select( "//div[@id='resolvedData']/p[1]/a/text()").extract() if len(answer) == 1: answerStr = answer[0].strip().replace(u"\xa0", "") m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr) if m.groupdict()["answer_cnt"] is not None: l.add_value('answers', m.groupdict()["answer_cnt"]) review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract() if len(review) == 1: reviewStr = review[0].strip().replace(u"\xa0", "") m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr) if m.groupdict()["review_cnt"] is not None: l.add_value('reviews', m.groupdict()["review_cnt"]) # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()") # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()") ret = l.load_item() print ret yield ret
def parse_argument(self, response):
    """Load an Argument item; the id falls back to -1 when the URL
    carries none."""
    loader = XPathItemLoader(item=Argument(), response=response)
    arg_id = self.parse_id_from_url(response.url)
    loader.add_value('id', arg_id if arg_id else -1)
    loader.add_xpath('rating', '//b[@id="QuestionRateValue"]/text()')
    loader.add_xpath('essay', '//div[@class="essay"]')
    return loader.load_item()
def parse(self, response):
    """Yield one FinanceIndex item per entry in the module-level
    `rates` table of (name, anchor-pattern, column-offset) tuples."""
    xpath_tmpl = "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()"
    items = []
    for name, pattern, pos in rates:
        loader = XPathItemLoader(item=FinanceIndex(), response=response)
        loader.add_value("name", name)
        loader.add_value("unit", "%")
        # The value cell sits `pos` columns right of the matching anchor.
        loader.add_xpath("value", xpath_tmpl % (pattern, pos))
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build a BookItem from a product page (store id 3)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//span[@class='kitapismi']/text()")
    loader.add_xpath('isbn', "//span[@class='normalkucuk']/text()", u'ISBN:([0-9]+)')
    loader.add_xpath('author', '//span/a[contains(@href, "/yazar/")]/text()')
    loader.add_xpath('publisher', '//span/a[contains(@href, "/yayinevi/")]/text()')
    loader.add_xpath('price', '//td/text()', u'Kitapyurdu Fiyatı:(.*) TL\.')
    loader.add_value('link', response.url)
    loader.add_value('store', 3)
    return loader.load_item()
def parse(self, response):
    """Emit the BCU target rate as a single FinanceIndex item."""
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Tasa Objetivo BCU")
    rate.add_value("unit", "%")
    # NOTE(review): "8.75" is passed as an XPath *expression* (a numeric
    # literal), not a value -- looks like a hard-coded placeholder;
    # add_value() would be the usual call.  Confirm intent.
    rate.add_xpath("value", "8.75")
    #rate.update_only_if_change = True
    return [rate.load_item()]
def parse_item(self, response):
    """Build a BookItem from a product page (store id 5)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//h1[@class='kitapad14pnt']/b/text()")
    loader.add_xpath('isbn', "//span[@class='kunye']/text()", u'ISBN: ([0-9\-X]+)')
    loader.add_xpath('author', "//span[@class='yazarad12pnt']/a/span[@class='yazarad12pnt']/text()")
    loader.add_xpath('publisher', "//h3[@class='kapakyazisi']/b/font/a/text()")
    loader.add_xpath('price', '//span[@class="kapakyazisi"]/font/b/text()', u'(.*) TL')
    loader.add_value('link', response.url)
    loader.add_value('store', 5)
    return loader.load_item()
def get_user(self, selector):
    """Load a LazyTweetUser from *selector*.

    The username comes from the first anchor's text; the profile URL is
    derived by prefixing it with the Twitter base URL.
    """
    user_loader = XPathItemLoader(item = LazyTweetUser(), selector = selector)
    user_loader.add_xpath('twitter_username', ''.join([
        './a[1]/text()'
    ]))
    # join() here is just string concatenation of base URL + username.
    user_loader.add_value('twitter_url', ''.join([
        r'http://twitter.com/',
        user_loader.get_output_value('twitter_username')
    ]))
    return user_loader.load_item()
def parse_item(self, response):
    """Build a BookItem from a product page (store id 2)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//div[@class='boxTanimisim']/div/text()")
    loader.add_xpath('isbn', "//div[@id='tanitimbox']/text()", u'.*ISBN : ([0-9]+)')
    loader.add_xpath('author', "//div[@class='boxTanimVideo']/a/text()")
    loader.add_xpath('publisher', "//h3[@class='boxTanimyayinevi']/a/b/text()")
    loader.add_xpath('price', "//b[@class='pricerange']/text()", u'\s*([0-9,]*) TL \(KDV Dahil\)')
    loader.add_value('link', response.url)
    loader.add_value('store', 2)
    return loader.load_item()
def get_UT_item(self, sel, user_url):
    """Build a ZhiHuU_T (user-topic) relation item from a topic selector."""
    loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    loader.add_value('crawled_from', user_url)
    # Normalize the absolute user URL down to its trailing relative path.
    relative_user = '/' + '/'.join(user_url.split('/')[-3:-1])
    loader.add_value('user_url', relative_user)
    loader.add_xpath('topic_url',
                     './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return loader.load_item()
def parse_item(self, response):
    """Build a BookItem from a product page (store id 6)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//font[@class='baslikt']/strong/text()")
    loader.add_xpath('isbn', '//td/text()', u'.*ISBN: ([0-9\-]+)')
    loader.add_xpath('author', "//td[@class='yazart']/a/text()")
    loader.add_xpath('publisher', "//a[@class='yayineviU']/text()")
    loader.add_xpath('price', "//font[@class='fiyat']/text()", u'([0-9,]+) TL')
    loader.add_value('link', response.url)
    loader.add_value('store', 6)
    return loader.load_item()
def parse_item(self, response):
    """Build a BookItem from an ASP.NET product page (store id 4)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//span[@id='ctl00_ContentPlaceHolderMainOrta_LabelAdi']/text()")
    loader.add_xpath('isbn', "//span[@id='ctl00_ContentPlaceHolderMainOrta_LabelIsbn']/text()")
    loader.add_xpath('author', "//span[@id='ctl00_ContentPlaceHolderMainOrta_LabelYazar']/a/text()")
    loader.add_xpath('publisher', "//a[@id='ctl00_ContentPlaceHolderMainOrta_HyperLinkYayinci']/text()")
    loader.add_xpath('price', "//span[@class='fiyat']/text()", u'(.*) TL')
    loader.add_value('link', response.url)
    loader.add_value('store', 4)
    return loader.load_item()
def parse_doctor_detail(self, response):
    """
    This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969
    @returns items 1 1
    @returns requests 0 0
    """
    hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(CYDoctorItem(), hxs)
    l.add_xpath('_name', ("//div[@class='bdHd']/h1/text()"))
    shortdesc = hxs.select("//div[@id='mainColumn']//p[@class='bdFt']/text()").extract()
    if len(shortdesc) == 1:
        # The one-line summary is "<title> <hospital> <specialty>".
        shortdescStr = shortdesc[0].strip()
        words = shortdescStr.split()
        if len(words) == 3:
            l.add_value('title', words[0])
            l.add_value('hospital', words[1])
            l.add_value('specialty', words[2])
        else:
            print ("title/hostpital/special error.")
    l.add_xpath('specialtyDesc', "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()")
    l.add_xpath('personalInfo', "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()")
    l.add_xpath('stars', "//p[@class='right starTxt']/text()")
    answer = hxs.select("//div[@id='resolvedData']/p[1]/a/text()").extract()
    if len(answer) == 1:
        answerStr = answer[0].strip().replace(u"\xa0", "")
        # NOTE(review): re.match() returns None when the text does not
        # match; m.groupdict() would then raise AttributeError -- confirm
        # whether the page can ever lack this counter.
        m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr)
        if m.groupdict()["answer_cnt"]is not None:
            l.add_value('answers', m.groupdict()["answer_cnt"])
    review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract()
    if len(review) == 1:
        reviewStr = review[0].strip().replace(u"\xa0", "")
        m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr)
        if m.groupdict()["review_cnt"]is not None:
            l.add_value('reviews', m.groupdict()["review_cnt"])
    # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()")
    # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()")
    ret = l.load_item()
    print ret
    yield ret
def load_compra_items(self, response, orden_compra):
    """Yield one CompraLineaItem per row of the purchase-detail table,
    tagged with *orden_compra*."""
    hxs = HtmlXPathSelector(response)
    for row in hxs.select('//table[contains(@width, "760")][2]/tr'):
        loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        loader.add_xpath('cantidad', 'td[1]/text()')
        loader.add_xpath('importe', 'td[2]/text()')
        loader.add_xpath('detalle', 'td[3]/text()')
        loader.add_value('orden_compra', orden_compra)
        yield loader.load_item()
def parse_articles(self, response):
    """Extract an Article item from a news article detail page."""
    hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(item=Article(), response=response)
    l.add_xpath("title", "//h1[contains(@class,'detail-title')]/text()")
    l.add_xpath("content", "//div[contains(@class,'article-text')]//p[contains(@class,'body')]")
    l.add_xpath("date", "//span[contains(@class,'dateline')]/text()")
    # NOTE(review): " " is not a valid XPath expression -- this looks like
    # a placeholder for a missing location selector; confirm.
    l.add_xpath("location", " ")
    l.add_xpath("keywords", "//div[@id='articleKeywords']/p/a/text()")
    l.add_value("link", response.url)
    l.add_value("author", 'Sainath')
    return l.load_item()
def get_UT_item(self, sel, user_url):
    '''
    given the selector of topic and user url, generate the u_t relationship
    '''
    ut_loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    ut_loader.add_value('crawled_from', user_url)
    # Keep only the trailing relative portion of the absolute user URL.
    ut_loader.add_value('user_url',
                        '/' + '/'.join(user_url.split('/')[-3:-1]))
    ut_loader.add_xpath(
        'topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return ut_loader.load_item()
def get_user(self, selector):
    """Load the YahooUser who authored the node under *selector*.

    Returns the loaded item, or None when no user name was collected.
    """
    user_loader = XPathItemLoader(item = YahooUser(), selector = selector)
    user_loader.add_xpath('user_name', './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
    user_loader.add_xpath('user_url', './/span[@class="user"]//a[@class="url"]/@href')
    # The user id is the profile URL's "show" query parameter.
    # NOTE(review): re.match(...).group(1) raises AttributeError when
    # user_url does not match the profile-URL pattern -- confirm whether
    # that can happen (e.g. anonymous answers).
    user_loader.add_value('user_id', re.match(r'http://answers\.yahoo\.com/my/profile\?show=(.*)',
                                              user_loader.get_output_value('user_url')
                                              ).group(1))
    if user_loader.get_collected_values('user_name'):
        return user_loader.load_item()
    else:
        return None
def parse_faculty_detail(self, response):
    """
    This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.haodf.com/faculty/DE4rO-XCoLU0Jq1rbc1P6dS2aO.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _name specialty title shortDesc
    """
    hxs = HtmlXPathSelector(response)
    # Follow every clinic-schedule ordering link back into this callback.
    linkExtractor = SgmlLinkExtractor(
        allow=(r"/faculty/\S+/menzhen.htm\?orderby", ), unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        yield Request(link.url, callback=self.parse_faculty_detail)
    # Breadcrumb anchors: a[2] is the hospital, a[3] the specialty.
    specialty = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[3]/text()").extract()
    hospital = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[2]/text()").extract()
    # Only table rows containing a 'tda' cell hold doctor data.
    docLinks = hxs.select(
        "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]"
    )
    #docLinks = hxs.select("//table[@id='doc_list_index']/tr")
    for doc in docLinks:
        l = XPathItemLoader(DoctorItem(), doc)
        docNames = doc.select(
            "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
        ).extract()
        if len(docNames) != 0:
            print docNames[0]
            l.add_xpath(
                '_name',
                "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
            )
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_xpath('title', "./td[@class='tda']/li/p[1]/text()")
            l.add_xpath('acadamicDegree', "./td[@class='tda']/li/p[2]/text()")
            l.add_xpath('shortDesc', "./td[@class='tdb']/text()")
            #clinic time todo
            ret = l.load_item()
            #print ret
            yield ret
def parse(self, response):
    """Return one FinanceIndex item (a percentage rate) per entry in `rates`."""
    def build(entry):
        # Each entry is (display name, anchor text to find, column offset).
        label, anchor_text, offset = entry
        loader = XPathItemLoader(item=FinanceIndex(), response=response)
        loader.add_value("name", label)
        loader.add_value("unit", "%")
        loader.add_xpath(
            "value",
            "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()"
            % (anchor_text, offset))
        return loader.load_item()

    return [build(entry) for entry in rates]
def parse_lineas(self, response):
    """Yield one CompraLineaItem per data row of the purchase table."""
    page = HtmlXPathSelector(response)
    # position() > 1 skips the header row.
    for row in page.select('//table//tr[position() > 1]'):
        loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        for field, xpath in (
                ('cantidad', 'td[4]/text()'),
                ('unidad_medida', 'td[5]/text()'),
                ('importe', 'td[3]/text()'),
                ('importe_total', 'td[6]/text()'),
                ('detalle', 'td[2]/text()')):
            loader.add_xpath(field, xpath)
        # hack, ver ../items.py:50 (TakeFirst())
        loader.add_value('orden_compra',
                         [response.request.meta['orden_de_compra']])
        yield loader.load_item()
def parse_links(self, response):
    """Build an AuctionsItem for one listing page.

    The listing id comes from the `lid` query parameter; all field
    XPaths are configured in the project settings.
    """
    listing_ids = re.findall(r"lid=(\d+)", response.url)
    loader = XPathItemLoader(item=AuctionsItem(), response=response)
    loader.add_value("id", listing_ids[0])
    for field, setting_key in (
            ("auctioneer", 'AUCTION_AUCTIONEER'),
            ("contact_number", 'AUCTION_CONTACT_NUMBER'),
            ("date", 'AUCTION_DATE'),
            ("time", 'AUCTION_TIME'),
            ("location", 'AUCTION_LOCATION')):
        loader.add_xpath(field, settings[setting_key])
    loader.add_value("link", response.url)
    loader.add_xpath("listing", settings['AUCTION_LISTING'])
    return loader.load_item()
def parse(self, response):
    """Yield CompraLineaItems from a purchase-order detail table.

    orden_compra and anio are parsed out of the page URL's query string.
    """
    ids = re.search(r'wOCabc=(\d+)&wEjercicio=(\d+)',
                    urlparse(response.url).query).groups()
    orden_compra, anio = (int(value) for value in ids)
    page = HtmlXPathSelector(response)
    for row in page.select('//table[contains(@width, "760")][2]/tr'):
        loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        loader.add_xpath('cantidad', 'td[1]/text()')
        loader.add_xpath('importe', 'td[2]/text()')
        loader.add_xpath('detalle', 'td[3]/text()')
        loader.add_value('orden_compra', orden_compra)
        loader.add_value('anio', anio)
        yield loader.load_item()
def parse_articles(self, response):
    """Scrape a single article page into an Article item.

    Title, content, date and keywords come from the page markup; link
    and author are constants for this spider.
    """
    # Removed an unused `hxs = HtmlXPathSelector(response)` local -- the
    # loader works directly off the response.
    l = XPathItemLoader(item=Article(), response=response)
    l.add_xpath("title", "//h1[contains(@class,'detail-title')]/text()")
    l.add_xpath(
        "content",
        "//div[contains(@class,'article-text')]//p[contains(@class,'body')]"
    )
    l.add_xpath("date", "//span[contains(@class,'dateline')]/text()")
    # NOTE(review): " " is not a meaningful XPath expression; kept as-is
    # to preserve behavior, but it looks like a placeholder that never
    # matches -- confirm whether `location` should have a real selector.
    l.add_xpath("location", " ")
    l.add_xpath("keywords", "//div[@id='articleKeywords']/p/a/text()")
    l.add_value("link", response.url)
    l.add_value("author", 'Sainath')
    return l.load_item()
def get_answer(self, selector, response):
    """Load a LazyTweetAnswer item from one answer block.

    `selector` points at a single answer node; the question id is taken
    from the last path segment of the page URL.
    """
    answer_loader = XPathItemLoader(item=LazyTweetAnswer(),
                                    selector=selector)
    answer_loader.add_value('question_id', response.url.split('/')[-1])
    answer_loader.add_value(
        'answerer',
        self.get_user(
            selector.select(''.join(['.//span[@class="answer-meta"]']))))
    answer_loader.add_xpath(
        'answer_content',
        ''.join([
            './/span[@class="answer-body"]',
            '//span[@class="answer-status"]//descendant-or-self::text()'
        ]))
    # BUG FIX: removed leftover debugging -- a `print` of the parsed
    # content and a blocking `a = input()` that froze the whole crawl
    # waiting for console input on every answer.
    return answer_loader.load_item()
def parse_full_report(self, response):
    """Extract the full report body into a NrcScrapedFullReport item.

    Yields the item, then marks the corresponding bot task DONE in the
    local database (runs after the item is consumed, per generator
    semantics).
    """
    # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
    # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
    # since XPathItemLoader requires a Response object
    text = unicode (response.body, response.encoding)
    t = TextResponse (url=response.url, body=text.encode('utf-8'), encoding='utf-8')
    l= XPathItemLoader(NrcScrapedFullReport(), response=t)
    url_parts = urlsplit(response.url)
    # NOTE(review): 'standard_web inc_seq' (containing a space) is used
    # verbatim as the query-string key holding the report number --
    # confirm against the site's actual URLs.
    l.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
    l.add_xpath('full_report_body', '//body')
    l.add_value('full_report_url', response.url)
    item = l.load_item()
    reportnum = item['reportnum']
    yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def get_user(self, selector, response, label):
    """Load a StackOverflowUser from a post's user-details block.

    `response` and `label` are accepted for interface compatibility but
    are not used by this implementation.
    """
    user_loader = XPathItemLoader(item=StackOverflowUser(),
                                  selector=selector)
    user_loader.add_xpath(
        'user_name',
        ''.join(['.//div[contains(@class, "user-details")]', '/a/text()']))
    user_loader.add_xpath(
        'user_link',
        ''.join(['.//div[contains(@class, "user-details")]', '/a/@href']))
    # CLEANUP: the original fetched get_output_value('user_link') three
    # times and bound one result to an unused local; fetch once.
    user_link = user_loader.get_output_value('user_link')
    if user_link:
        # NOTE(review): the raw profile URL is stored as the user id;
        # the numeric id could be parsed out of the URL instead.
        user_loader.add_value('user_id', user_link)
    return user_loader.load_item()
def parse(self, response):
    """Parse the bus-lines listing table into LinhaItem items."""
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select(self.lista_linhas_xpath):
        loader = XPathItemLoader(LinhaItem(), selector=qxs)
        loader.add_xpath('linha', './td[1]/p//text()')
        loader.add_xpath('nome', './td[3]/p//text()')
        link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
        #TODO: (translated) should keep the context and return the data
        # from the next page, but it seems it is not returning.
        request = Request(link, callback=self.parse_item)
        #pdb.set_trace()
        # NOTE(review): `request` is freshly constructed and never
        # scheduled, so its meta dict is empty -- these lookups raise
        # KeyError unless something populates meta elsewhere.  The
        # author's TODO above acknowledges this is not working; fixing
        # it needs callback chaining (pass the loader via request.meta
        # and finish the item in parse_item).
        loader.add_value('ida', request.meta['ida'])
        loader.add_value('volta', request.meta['volta'])
        yield loader.load_item()
def parse_rental(self, response):
    """Scrape a Trulia rental-details page into a RentalItem."""
    loader = XPathItemLoader(item=RentalItem(), response=response)
    loader.add_value('url', response.url)
    # (field, xpath, regex) triples; a regex of None means plain extraction.
    plan = [
        ('address', '//th[text()="Address:"]/../td/text()', None),
        ('price', '//th[text()="Price:"]/../td/div/text()', None),
        ('price_period', '//th[text()="Price:"]/../td/div/span/text()', None),
        ('bedrooms', '//th[text()="Bedrooms:"]/../td/text()', None),
        ('bathrooms', '//th[text()="Bathrooms:"]/../td/text()', r'(\d+)'),
        ('powder_rooms', '//th[text()="Bathrooms:"]/../td/text()', r', (\d+)'),
        ('property_type', '//th[text()="Property type:"]/../td/text()', None),
        ('size', '//th[text()="Size:"]/../td/text()', r'([\d|,]+) sqft'),
        ('lot', '//th[text()="Lot:"]/../td/text()', None),
        ('year_built', '//th[text()="Year built:"]/../td/text()', None),
        ('lease_term', '//th[text()="Terms of lease:"]/../td/text()', None),
        ('pets_allowed', '//th[text()="Pets:"]/../td/text()', None),
        ('date_listed', '//th[text()="Added on Trulia:"]/../td/text()', None),
        ('mls_id', '//th[text()="MLS/ID:"]/../td/text()', None),
        ('descriptive_title', '//h2[@class="descriptive_title"]/text()', None),
        ('description', '//div[@class="listing_description_module"]/text()', None),
        ('additional_fields',
         'id("property_listing_details_module")/ul/li/span/text()', None),
        ('public_records',
         'id("property_public_info_module")/ul/li/span/text()', None),
    ]
    for field, xpath, pattern in plan:
        if pattern is None:
            loader.add_xpath(field, xpath)
        else:
            loader.add_xpath(field, xpath, re=pattern)
    return loader.load_item()
def parse_item(self, response): sel = Selector(response) print response.url app_loader = XPathItemLoader(item=AppItem(), selector=sel) # init the item loader # set app id app_loader.add_value('app_id', parse_id(response.url)) # composite the title app_loader.add_xpath( 'title', '//div[contains(@class, "document-title")]//text()') app_loader.add_xpath( 'description', '//div[contains(@class, "id-app-orig-desc")]//text()') app_loader.add_xpath('score', '//meta[@itemprop="ratingValue"]//@content') app_loader.add_xpath( 'icon_url', '//div[contains(@class, "details-info")]//img[contains(@class, "cover-image")]/@src' ) app_loader.add_xpath( 'author', '//div[@itemprop="author"]//span[@itemprop="name"]//text()') app_loader.add_xpath( 'app_type', '//div[contains(@class, "details-info")]//span[@itemprop="genre"]/text()' ) # get the similarities and the more from developers app_loader.add_xpath( 'similarity', '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=1]//div[contains(@class, "card")]/@data-docid' ) app_loader.add_xpath( 'more_from_devs', '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=2]//div[contains(@class, "card")]/@data-docid' ) # print app_loader.load_item() # print app_loader.get_output_value('app_id') return app_loader.load_item()
def parse_materials(self, response):
    """Yield one NrcScrapedMaterial per data row of the t16Standard table.

    Field XPaths come from the item class itself: every field whose
    metadata declares an 'xpath' entry is extracted automatically.
    """
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')
    # Guard clauses replace the original nested if/else pyramid.
    if not materials:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        return
    # Skip the first report record because this is the header row.
    materials.pop(0)
    if not materials:
        self.log('No incident reports found in response', log.INFO)
        return
    self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)
    for material in materials:
        l = XPathItemLoader(NrcScrapedMaterial(), material)
        # The report number is embedded in the page URL.
        l.add_value('reportnum', response.url, TakeFirst(), re='P3_SEQNOS:(\d+)')
        for name, params in NrcScrapedMaterial.fields.items():
            if 'xpath' in params:
                l.add_xpath(name, params['xpath'])
        yield l.load_item()
def parse_full_report(self, response):
    """Load the full report body for a previously scraped report.

    Bails out (leaving the task pending, so it is retried on the next
    run) when the response looks empty; otherwise yields the item and
    then marks the bot task DONE in the local database.
    """
    reportnum = response.request.meta['reportnum']
    # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
    # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
    # since XPathItemLoader requires a Response object
    text = unicode(response.body, response.encoding)
    if len(text) < 1000:
        # check for an empty response - if so then bail out; we'll try
        # again next time around
        return
    t = TextResponse(url=response.url,
                     body=text.encode('utf-8'),
                     encoding='utf-8')
    l = XPathItemLoader(NrcScrapedFullReport(), response=t)
    # CLEANUP: removed unused `url_parts = urlsplit(response.url)` --
    # a leftover from the sibling version of this callback that parses
    # the report number out of the query string; here it comes from
    # request.meta instead.
    l.add_value('reportnum', reportnum)
    l.add_xpath('full_report_body', '//body')
    l.add_value('full_report_url', response.url)
    item = l.load_item()
    yield item
    # Runs only after the item is consumed (generator semantics).
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def parse(self, response):
    """Return the Merval index value as a single-item list.

    The unit is deliberately empty: the Merval is an index level, not a
    percentage.
    """
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Merval")
    rate.add_value("unit", "")
    hxs = HtmlXPathSelector(response)
    # ROBUSTNESS: the original indexed [0] on the selector list, raising
    # IndexError whenever the page layout changes; take the first match
    # only when one exists.
    values = hxs.select(
        "//span[contains(@id,'UltimoMerval')]/text()").extract()
    if values:
        rate.add_value("value", values[0])
    return [rate.load_item()]