Python ItemLoader.add_xpath示例，scrapy.loader.ItemLoader.add_xpath Python示例

示例#1

0

显示文件

文件： imgspider.py 项目： donnki/ScrapyImage

	def parse_page(self, response):
		#爬取图片
		# print u'~~~~', unicode(response.body, "gbk").encode("utf8")
		# print(self.config["xpathImagesPath"])
		# print(response.xpath(self.config["xpathImagesPath"]))
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('name', self.config["id"])
		l.add_value('url', response.url)
		if self.config.has_key("imageUrlReplacement"):
			l.add_value('replace', self.config["imageUrlReplacement"])
			
		if self.config.has_key("xpathImagesPath"):
			l.add_xpath('image_urls', self.config["xpathImagesPath"])
		if self.config.has_key("xpathFilesPath"):
			l.add_xpath('file_urls', self.config["xpathFilesPath"])
		yield l.load_item()
		
		#TODO：获取下一页地址，递归调用自parse_page
		if self.config.has_key("xpathNextImageUrl"):
			nextUrls = response.xpath(self.config["xpathNextImageUrl"])
			if len(nextUrls) > 0:
				nextPage = nextUrls.extract()[0]
				if not nextPage.startswith("http"):
					if nextPage.startswith("/"):
						nextPage = response.url[0:response.url.index("/",10)+1]+nextPage 
					else:
						nextPage = response.url[0:response.url.rfind("/")+1]+nextPage 
				request = scrapy.Request(nextPage, callback=self.parse_page, cookies={'title': response.request.cookies['title']})
				yield request

示例#2

0

显示文件

文件： tuba77.py 项目： donnki/ScrapyImage

	def parse_item(self, response):
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('name', self.name)
		l.add_value('url', response.url)
		l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
		return l.load_item()

示例#3

0

显示文件

文件： nfl_team_rosters.py 项目： AncillaryStats/AS-Scrapers

    def get_player_info(self, response):
        loader = ItemLoader(item=NFL_Player_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()[0]
        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()

        if type(number_and_position) is list:
            number_and_position = number_and_position[0]
            number = number_and_position.split()[0]
            position = number_and_position.split()[1]
        else:
            number = ''
            position = ''

        loader.add_value('number', number)
        loader.add_value('position', position)
        loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        # loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        # loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        yield loader.load_item()

示例#4

0

显示文件

文件： beauty.py 项目： donnki/ScrapyImage

	def parse_item(self, response):
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('url', response.url)
		l.add_value('name', self.name)
		l.add_xpath('image_urls', '//div[@class="l_effect_img_mid"]/a/img/@src')
		return l.load_item()

示例#5

0

显示文件

文件： basic.py 项目： parkchul72/scrapybook

    def parse(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        
        """
        #1. First method
        item = PropertiesItem()
        item['title'] = response.xpath('//*[@itemprop="name"][1]/text()').extract()
        item['price'] = response.xpath('//*[@itemprop="price"][1]/text()').re('[.0-9]+')
        item['description'] = response.xpath('//*[@itemprop="description"][1]/text()').extract()
        item['address'] = response.xpath(
            '//*[@itemtype="http://schema.org/'
            'Place"][1]/text()').extract()
        item['image_urls'] = response.xpath('//*[@itemprop="image"][1]/@src').extract()
        return item
        """
        
        #2. Secode method
        l = ItemLoader(item=PropertiesItem(), response=response)
        
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()')
        l.add_xpath('price', '//*[@itemprop="price"][1]/text()', re('[.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()')
        l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()')
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')
        
        return l.load_item()
        
        
        """

示例#6

0

显示文件

文件： easy.py 项目： Fighting-Toghter/scrapybook

    def parse_item(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(unicode.strip, unicode.title))
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(unicode.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(unicode.strip))
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(lambda i: urlparse.urljoin(response.url, i)))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()

示例#7

0

显示文件

文件： movie_spider.py 项目： A-Horse/douban_movie_scrapy

    def parse_movie(self,response):
        
        loader = ItemLoader(item=DoubanItem(),response=response)
        
        for attr,xpath in self.settings.getdict('INFO_XPATH').items():
            loader.add_xpath(attr,xpath)

        s = response.xpath('//div[@id="info"]').extract_first()
        for attr,regex in self.settings.getdict('RE').items():
            loader.add_value(attr,re.findall(regex,s))
            
        loader.add_value('rate',self.parse_rate(response))
        loader.add_value('url',response.url)
  
        if self.settings.get('ALLOW_COVER') == True:
            image_urls = self._get_urls(
                self.image_base_url,
                urljoin,
                response.xpath('//div[@id="mainpic"]/a/img/@src').extract(),
                lambda s:s.split('/')[-1],
            )

            loader.add_value('image_urls',image_urls)
        
        return loader.load_item()

示例#8

0

显示文件

文件： idealista.py 项目： vtisza/MilanRentalSpyder

    def parse(self, response):
        l=ItemLoader(item=RentalItem(),response=response)
        l.add_xpath('price','//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_xpath('adress','//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_value('url', response.url)

        return l.load_item()

示例#9

0

显示文件

文件： nytSpider.py 项目： yongjianmu/DM_Similarity

    def parse(self,response):
        l = ItemLoader(item = NytimesItem(),response = response)
        l.add_xpath('topnews','//*[contains(@id,"topnews-100")]/h2/a/text()')
        l.add_xpath('sectionnews','//h3[contains(@class,"story-heading")]/text()')
        #print(type(l.load_item()))
        x = l.load_item()
        #print(len(x['date']),len(x['topnews']),len(x['sectionnews']))
        nytdict = dict()
        datelist = []
        datalist = datetime.date.today()
        topnewslist = []
        sectionnewslist = []
        nytdict['date'] = str(datalist)

        for t in x['topnews']:
            topnewslist.append(str(t.encode('ascii','ignore')))
        nytdict['topnews']=topnewslist

        for t in x['sectionnews']:
            sectionnewslist.append(str(t.encode('ascii','ignore')).strip())
        nytdict['sectionnews']=sectionnewslist

        filename = datetime.date.today()
        f=open('{}.json'.format(filename),'w')
        json.dump(nytdict,f)
        return l.load_item()

示例#10

0

显示文件

文件： allforyou.py 项目： angelosuinan/save22-productspiders

	def get_item(self,response):

		loader = ItemLoader(item=expansys_item(), response=response)
		loader.add_xpath('url', str('http//www.allforyou.sg'+response.url))
		    	
   		loader.add_xpath('title', '//span[contains(@itemprop, "name")]/text()')
   		return loader.load_items()

示例#11

0

显示文件

文件： meizitu_spider.py 项目： 29littlemonk/Scrapy

    def parse_item(self, response):
        l = ItemLoader(item=MeizituItem(), response=response)
        l.add_xpath('name', '//h2/a/text()')
        #l.add_xpath('tag', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
        l.add_xpath('image_url', "//div[@id='picture']/p/img/@src", Identity())
 
        l.add_value('url', response.url)
        return l.load_item()

示例#12

0

显示文件

文件： movie_spider.py 项目： A-Horse/douban_movie_scrapy

    def parse_rate(self,response):

        loader = ItemLoader(item = RateItem(),response=response)
        
        for attr,xpath in self.settings.getdict('RATE_XPATH').items():
            loader.add_xpath(attr,xpath)

        return loader.load_item()

示例#13

0

显示文件

文件： fromcsv.py 项目： Fighting-Toghter/scrapybook

    def parse(self, response):
        item = Item()
        l = ItemLoader(item=item, response=response)
        for name, xpath in response.meta['fields'].iteritems():
            if xpath:
                item.fields[name] = Field()
                l.add_xpath(name, xpath)

        return l.load_item()

示例#14

0

显示文件

文件： org_name.py 项目： enplotz/webir2015

    def parse(self, response):
        item = ItemLoader(item=OrgItem(), response=response)
        item.add_value('id', self.curr)
        item.add_xpath('name', '//h2[@class="gsc_authors_header"]/text()')
        yield item.load_item()
        next_url = self.next_label_from_db()

        if next_url:
            yield Request(url=next_url,dont_filter=True)

示例#15

0

显示文件

文件： meizitu.py 项目： fengkaiwhu/Scrapy_meizitu

    def parse_content(self, response):
        logger.info('Dealing with images: %s', response.url)
        item_load = ItemLoader(item=ScrapyMeizituItem(), response=response)
        item_load.add_value('url', response.url)
        item_load.add_xpath('name', self._x_query['name'])
        item_load.add_xpath('tags', self._x_query['tags'])
        item_load.add_xpath('image_urls', self._x_query['image_urls'])

        return item_load.load_item()

示例#16

0

显示文件

文件： nfl_team_info_spider.py 项目： AncillaryStats/AS-Scrapers

    def parse_depth_chart(self, response):
        loader = ItemLoader(item=NFL_Team_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
        loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')

        yield loader.load_item()

示例#17

0

显示文件

文件： login.py 项目： xiaokangzailushang/code

    def parse_item(self, response):
        
	l=ItemLoader(item=PropertiesItem(),response=response)
	l.add_xpath('title','//*[@itemprop="name"][1]/text()')
	l.add_xpath('price','//*[@itemprop="price"][1]/text()',re='[.0-9]+')
	l.add_xpath('description','//*[@itemprop="description"][1]/text()')
	l.add_xpath('address','//*[@itemtype="http://schema.org/Place"][1]/text()')
	l.add_xpath('image_urls','//*[@itemprop="image"][1]/@src')

	return l.load_item()

示例#18

0

显示文件

文件： MySpider.py 项目： niklasben/BAThesis

    def parse_stuff(self, response):
        hxs = Selector(response)
        sites = hxs.xpath('//body')
        items_main = []

        for site in sites:
            loader = ItemLoader(item = Items_Main(), response = response)
            loader.add_xpath('fragment', '//*[not(self::script)]/text()')
            items_main.append(loader.load_item())
            return items_main

示例#19

0

显示文件

文件： seedb_spider.py 项目： adriennefranke/SeedDB-Crawler

 def parse_accelerator(self, response):
     for sel in response.xpath('//table/tbody/tr'):
         l = ItemLoader(item=SeedDB2Item(), selector=sel)
         l.add_xpath('accelerator', 'td/a/strong/text()')
         l.add_xpath('accelerator_website', 'td/a/@href')
         l.add_xpath('num_cohorts', 'td[3]/span/text()')
         l.add_xpath('num_exits', 'td[4]/span/text()')
         l.add_xpath('num_funding', 'td[5]/span/text()')
         l.add_xpath('num_avg_funding', 'td[6]/span/text()')
         yield l.load_item()

示例#20

0

显示文件

文件： XCSP.py 项目： joyceloo/learn_Notes

	def parse_item(self,response):
		l=ItemLoader(item=XcspiderItem(),response=response)
		m=response.xpath("//span[@class='ellipsis']/a/@title")
		# print m
		l.add_xpath('dp_content',"//ul/li[@class='main_con']/text()",MapCompose(unicode.strip),Join())
		l.add_xpath('dp_user',"//span[@class='ellipsis']/a/@title",MapCompose(unicode.strip))
		l.add_value('dp_link',response.url)
		l.add_xpath('dp_scence',"//div[@class='f_left']/h1/text()")
		l.add_xpath('dp_provice',"//div[@class='breadbar_v1 cf']/ul/li[4]/a/text()",MapCompose(lambda i:i.replace("景点",'')))
		l.add_xpath('dp_time',"//span[@class='youcate']/text()",MapCompose(unicode.strip))
		return l.load_item()

示例#21

0

显示文件

文件： idealista_list.py 项目： vtisza/MilanRentalSpyder

    def parse_item(self, selector,response):

        # Create the loader using the response
        l = ItemLoader(item=RentalItem(), selector=selector)
        l.add_xpath('price','(.//span[contains(@class, "item-price")]/text())[1]')
        l.add_xpath('size','.//small/text()[. = "m2"]/../../text()')
        l.add_xpath('rooms','.//small/text()[. = "locali"]/../../text()')
        l.add_xpath('address','.//a[contains(@class, "item-link")]/@title')
        l.add_xpath('elevator','.//span[text()="piano"]/../text()')
        #l.add_xpath('floor','(.//span[text()="piano"]/../../text())[1]')
        return l.load_item()

示例#22

0

显示文件

文件： auction_item_spider.py 项目： fredriksoderberg/auction-analysis

    def parse_auction_item(self, response):
        
        loader = ItemLoader(AuctionItems(), response=response)

        loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
        loader.default_output_processor = Join()
       
        for field, xpath in auction_item_fields.iteritems():
            loader.add_xpath(field, xpath)        

              
        yield loader.load_item()

示例#23

0

显示文件

文件： uniprot.py 项目： tt6746690/EdgoDB

    def parse(self, response):
        l = ItemLoader(item=UniprotItem(), response=response)
        l.add_xpath('proteinName', "//*[@id='page-header']/h2/span/text()")
        l.add_value('uniprotAccession', response.url)
        l.add_xpath('uniprotProteinLength', "//*[@id='sequences-section']/div[1]/div[2]/div[1]/span[2]/text()")
        listing = response.xpath("//*[@id='subcellular_location']/div[1]/ul")
        subcellular_location = []
        for li in listing:
            subcellular_location.append(li.xpath("./li/a/text()").extract())
        l.add_value('uniprotLocalization', subcellular_location)

        yield l.load_item()

示例#24

0

显示文件

文件： manual.py 项目： choupijiang/scrapylearning

    def parse_item(self, response):

        l = ItemLoader(item=StartupItem(), response=response)
        l.add_xpath('title', '//*[@id="C-Main-Article-QQ"]//h1/text()')
        l.add_xpath('abstract', '//*[@id="C-Main-Article-QQ"]//p[@class="Introduction"]/text()',)

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('scrapy_test'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()

示例#25

0

显示文件

文件： tech.py 项目： pyorc/pyorcnews

 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     content = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
     article_time = content.xpath('//span[@class="pubTime"]/text()').extract()
     date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
     if not date_time:
         return
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_value('date_time', date_time)
     item.add_xpath('title', '//div[@class="hd"]/h1/text()')
     item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
     item.add_xpath('author', '//span[@class="auth"]/text()')
     item.add_value('original_link', response.url)
     elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('content', content)
     item.add_value('image_urls', images)
     item.add_value('source', u'腾讯科技')
     item.add_value('category', CATEGORY.TECHNOLOGY)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()

示例#26

0

显示文件

文件： salefinder.py 项目： janusle/SpecialFinderCrawler

    def parse_item(self, response):
        # FIXME: fix array issue
        i = ItemLoader(item=SalefinderItem(), response=response)
        title = r'//div[@id="product-details-container"]//h1/text()'
        price = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
        per = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
        image_url = r'//a[@id="product-image-container"]//img/@src'

        i.add_xpath('title', title, MapCompose(unicode.lower))
        i.add_xpath('price', price, re=r'[,.0-9]+')
        i.add_xpath('per', per, re=r'pk|each|kg')
        i.add_xpath('image_url', image_url)

        i.add_value('url', response.url)
        i.add_value('date', date.today().isoformat())

        product_buy = response.xpath("//div[@class='product-container']//div[@id='product-buy']")
        product_buy_text = product_buy.extract_first().lower()

        # Detect the vendor from a product-buy div
        if 'coles' in product_buy_text:
            i.add_value('vendor', 'coles')
        elif 'woolworths' in product_buy_text:
            i.add_value('vendor', 'woolworths')
        else:
            i.add_value('vendor', 'unknown')
        return i.load_item()

示例#27

0

显示文件

文件： unica.py 项目： carlosp420/lunch_in_turku_scraper

    def parse(self, response):
        today = datetime.date.today()
        today_long_date = datetime.datetime.strftime(today, '%A, %d %b %Y')
        today = datetime.datetime.strftime(today, '%A')

        sel = response.xpath
        restaurant = self.get_title(sel)

        l = ItemLoader(item=LunchItem(), response=response)
        l.add_value('restaurant', restaurant)
        l.add_xpath('dishes', "//h4[text()='" + today + "']/following-sibling::table//td[@class='lunch']")
        l.add_value('day', today_long_date)

        yield l.load_item()

示例#28

0

显示文件

文件： pfam.py 项目： tt6746690/EdgoDB

    def parse(self, response):

        sel = Selector(response)
        table = sel.xpath("//*[@id='imageKey']/tbody/tr")

        for tr in table:
            l = ItemLoader(item=PfamItem(), selector=tr)
            l.add_value('proteinName', response.url)
            l.add_xpath('pfamAccession', "./td[position() = 1 and text() = 'Pfam']/@class")
            l.add_xpath('pfamID', "./td[2]/a/text()")
            l.add_xpath('sequenceStart', "./td[3]/text()")
            l.add_xpath('sequenceEnd', "./td[4]/text()")
            l.add_xpath('proteinLength', '//*[@id="proteinSummaryBlock"]/div[2]/table[1]/tbody/tr[3]/td[2]/text()')
            yield l.load_item()

示例#29

0

显示文件

文件： user.py 项目： nosiar/algo-crawl

    def parse(self, response):
        sel = Selector(response)
        last_page = sel.xpath('//span[@class="step-links"]/a/text()')[-1].extract()
        self.num_page = int(last_page)

        loader = ItemLoader(item=User(), response=response)
        loader.add_value('uid', self.uid)
        loader.add_xpath('name', '//a[@class="username"]/text()')

        for i in range(1, self.num_page + 1):
            url = self.start_urls[0] + '/' + str(i)
            yield Request(url,
                          callback=self.parse_list,
                          meta={'loader': loader})

示例#30

0

显示文件

文件： wendaSpider.py 项目： joyceloo/learn_Notes

 def parse_item(self,response):
 	l=ItemLoader(item=AskspiderItem())
 	l.add_xpath('q_title',"//h1[@class='ask_title']/text()",MapCompose(unicode.strip),Join())
 	l.add_xpath('q_time',"//span[@class='ask_time']/text()",MapCompose(unicode.strip))
 	l.add_xpath('q_province',"//div[@class='abouttdd']/ul/li[1]/h3/span/text()",MapCompose(unicode.strip))
 	l.add_value('q_link',response.url)
 	l.add_xpath('q_user',"//a[@class='ask_username']/text()")
 	return l.load_item()

示例#31

0

显示文件

文件： Spiders.py 项目： Vianyraj/callspiderproj

    def parse(self, response):

        if response.xpath(self.TitleXpath).get() is None:
            raise ValueError("the TitleXpath of Bmbf webpage has changed")
        elif response.xpath(self.DateXpath).get() is None:
            raise ValueError("the Bmbf webpage xpath has changed")
        elif response.xpath(self.UrlXpath).get() is None:
            raise ValueError("the URl of Bmbf webpage xpath has changed")

        loader = ItemLoader(item=EventItemBmbf(), response=response)
        loader.add_xpath("TitleBMBF", self.TitleXpath)
        loader.add_xpath("DateBMBF", self.DateXpath)
        loader.add_xpath(
            "UrlBMBF", self.UrlXpath
        )  #//div[@class="main"]//div[@class="content"]//div[@class="article-section"]//p/strong
        item = loader.load_item()
        #
        if "TitleBMBF" not in item:
            raise ValueError("TitleBMBF item is not loaded")
        elif "DateBMBF" not in item:
            raise ValueError("DateBMBF item is not loaded")
        elif "UrlBMBF" not in item:
            raise ValueError("UrlBMBF item is not loaded")

        #store crawled data in a dict, then yield it to pipeline
        mydict = {
            "title": [],
            "date": [],
            "url": [],
            "paperType": [],
            "where": [],
            "deadline": []
        }

        for k in range(int(len(item["UrlBMBF"]))):

            mydict["title"].append(item["TitleBMBF"][k])
            mydict["date"].append(
                datetime.strptime(
                    item["DateBMBF"][k].replace(" ", "").split("-")[0],
                    '%d.%m.%Y'))
            #print("date",datetime.strptime( item["DateBMBF"][k].replace(" ","").split("-")[0], '%d.%m.%Y'))
            #print("----------------------")
            mydict["url"].append('https://www.bmbf.de/' + item["UrlBMBF"][k])

            if (len(item["DateBMBF"][k].split("-")) > 1):
                #if formatchecker!=datetime.strptime( (item["DateBMBF"][k].replace(" ","")).split("-")[1], '%d.%m.%Y')
                #raise ValueError("the webpage has changed or the date format has changed")
                #print(datetime.strptime( (item["DateBMBF"][k].replace(" ","")).split("-")[1], '%d.%m.%Y'))
                mydict["deadline"].append(
                    datetime.strptime(
                        (item["DateBMBF"][k].replace(" ", "")).split("-")[1],
                        '%d.%m.%Y'))

            else:
                mydict["deadline"].append(None)

        myPipeline = ScrapyProjectPipeline()
        myPipeline.process_item(mydict, SpiderBmbf)

        yield mydict  #if we put this outside for loop calling pipelines 1 times

示例#32

0

显示文件

文件： spider.py 项目： liao961120/collegeSNA

    def parse_item(self, response):
        l = ItemLoader(item=CollegeNetworkItem(), response=response)

        l.add_xpath('college',
                    "//div[@class='row school-title-wrapper']/p/b/a/text()")
        l.add_xpath('department',
                    "//div[@class='row school-title-wrapper']/p/b/text()[2]")

        l.add_xpath('department_attr',
                    "//div[@class='card-block']/p/b[position() <= 6]/text()")
        l.add_xpath('department_attr_val',
                    "//div[@class='card-block']/p/text()[position() <= 6]")

        l.add_xpath('overlap_college',
                    "//div[@class='card-block']//small/a/text()")
        l.add_xpath('overlap_college_num',
                    "//div[@class='card-block']//small/text()")
        l.add_xpath(
            'applied_region',
            "//div[@class='card-block']/text()[re:test(., 'x\d{1,3}')]")

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('rtrv_date', datetime.datetime.now())

        return l.load_item()

示例#33

0

显示文件

文件： comments_spider.py 项目： pombredanne/data-repeatability-in-web-science

 def parse_comment(self, comment_scope):
     # Extract the data for a single comment
     selector = Selector(text=comment_scope)
     comment_loader = ItemLoader(item=Comment(), selector=selector)
     comment_loader.default_output_processor = TakeFirst()
     comment_loader.add_xpath('comment_id', '//li/@id')
     comment_loader.add_xpath('comment_author',
                              '//div/div[1]/div/span[1]/a/span/text()')
     comment_loader.add_xpath(
         'comment_text',
         '//div/div[2]/div[2]/div[@class="d-comment__body"]')
     comment_loader.add_xpath('timestamp',
                              '//div/div[1]/div/div/a/time/@datetime')
     comment_loader.add_xpath(
         'parent_comment_id',
         '//a[@class="js-discussion-author-link"]/@href')
     comment_loader.add_xpath('upvotes',
                              '//div/div[2]/div[1]/span/span[1]/text()')
     return comment_loader.load_item()

示例#34

0

显示文件

    def parse_item(self, response):
        url = response.url
        item_list = item_code(url, self.web_name, '/loan/(.*?)$')
        item = ItemLoader(item=YzmSx5170Item(), response=response)
        item.add_value('web_name', self.web_name)
        item.add_value('web_code', self.name)
        item.add_value('url', url)
        item.add_value('item_code', item_list.get('item_code'))
        item.add_xpath('title', '//title/text()')
        item.add_xpath('amount', "//ul[@class='left-1-ul']//li[1]//p[1]")
        item.add_xpath('rate', "//ul[@class='left-1-ul']//li[2]//p[1]")
        item.add_xpath('period', "//ul[@class='left-1-ul']//li[3]//p[1]")
        item.add_xpath(
            'loan_using',
            '//*[contains(text(),"资金用途")]/following-sibling::div[1]/p/text()')
        # item.add_xpath('loaner_info', '//*[@id="userName"]')
        item.add_xpath('pay_type', '//*[contains(text(),"还款方式")]/text()')
        item.add_xpath('progress',
                       "//ol[@class='left-1-ol']//li[2]/span/text()")

        # invest records
        i_v = []
        invest_records_temp = '{{username={lst[0]}|rate=-1|postmoney={lst[2]}|money={lst[2]}|postdate={lst[1]}|status=全部通过}}'
        invest_records_format = ""
        tr = response.css('.invest-table').css('tr')
        if not tr:
            tr = response.css('.tou.info-tab-main').css('tr')
        try:
            for i in tr:
                lst = i.css('td::text').extract()
                i_v.append(lst)
            for n in i_v:
                invest_records_format += invest_records_temp.format(lst=n)
            item.add_value('invest_records', invest_records_format)
            item.add_value('start', i_v[1][1])
            item.add_value('end', i_v[-1][1])
        except Exception:
            print(url, 'invest records is error')
        yield item.load_item()

示例#35

0

显示文件

文件： QuarterlyBalanceSheet.py 项目： Shroudless/backup

    def parse(self, response):
        l = ItemLoader(item=FinanceItem(), response=response)
        l.add_xpath(
            "Currency",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/thead/tr/th[1]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TimePeriod",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/thead/tr/th[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "CashAndEquivalents",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[1]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "ShortTermInvestments",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[2]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "CashAndShortTermInvestments",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[3]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "AccountsReceivableTradeNet",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[4]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "ReceivablesOther",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[5]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalReceivablesNet",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[6]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalInventory",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[7]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "PrepaidExpenses",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[8]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "OtherCurrentAssetsTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[9]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalCurrentAssets",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[10]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "PropertyPlantEquipmentTotalGross",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[11]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "AccumulatedDepreciationTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[12]/td[2]/span/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "GoodwillNet",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[13]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "IntangiblesNet",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[14]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "LongTermInvestments",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[15]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "OtherLongTermAssetsTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[16]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalAssets",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[17]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "AccountsPayable",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[18]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "AccruedExpenses",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[19]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "NotesPayableShortTermDebt",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[20]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "CurrentPortofLTDebtCapitalLeases",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[21]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "OtherCurrentliabilitiesTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[22]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalCurrentLiabilities",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[23]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "LongTermDebt",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[24]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "CapitalLeaseObligations",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[25]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalLongTermDebt",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[26]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalDebt",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[27]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "DeferredIncomeTax",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[28]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "MinorityInterest",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[29]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "OtherLiabilitiesTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[30]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalLiabilities",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[31]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "RedeemablePreferredStockTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[32]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "PreferredStockNonRedeemableNet",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[33]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "CommonStockTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[34]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "AdditionalPaidInCapital",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[35]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "RetainedEarningsAccumulatedDeficit",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[36]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TreasuryStockCommon",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[37]/td[2]/span/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "OtherEquityTotal",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[38]/td[2]/span/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalEquity",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[39]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalLiabilitiesShareholdersEquity",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[40]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "SharesOutsCommonStockPrimaryIssue",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[41]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "TotalCommonSharesOutstanding",
            '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[42]/td[2]/text()',
            MapCompose(unicode.strip, unicode.title))

        return l.load_item()

示例#36

0

显示文件

    def parse_match(self, response, **kwargs):
        """
        Fetches data about particular event from returned content.
        Creates match item and fills with fetched data.
        """
        html_event_part = HtmlResponse(url=response.url,
                                       body=json.loads(
                                           response.body)['content1'].encode())

        match_loader = ItemLoader(item=MatchItem(), response=html_event_part)
        match_loader.add_value('id', uuid4())
        match_loader.add_value('league_id', kwargs.get('league_id'))
        match_loader.add_xpath('timestamp', '//p[@data-time]/@data-time')
        match_loader.add_xpath(
            'home_team',
            '//span[@itemprop="homeTeam"]/span[@itemprop="name"]/@content')
        match_loader.add_xpath(
            'away_team',
            '//span[@itemprop="awayTeam"]/span[@itemprop="name"]/@content')
        match_loader.add_xpath('stadium',
                               '//small/span[@itemprop="name"]/text()')
        match_loader.add_xpath(
            'home_result', '//div[contains(@class, "h2h-final-score")]/'
            'div[@class="widget-content"]/h2/text()')
        match_loader.add_xpath(
            'away_result', '//div[contains(@class, "h2h-final-score")]/'
            'div[@class="widget-content"]/h2/text()')
        match = match_loader.load_item()
        yield match

        html_post_match = HtmlResponse(url=response.url,
                                       body=json.loads(
                                           response.body)['content2'].encode())

        if html_post_match.xpath('//div[@class="w100 cf ac"]'):
            # if post match statistics data exists
            statistics_loader = ItemLoader(item=PostMatchStatisticsItem(),
                                           response=html_post_match)
            statistics_loader.add_value('id', uuid4())
            statistics_loader.add_value('match_id', match['id'])
            statistics_loader.add_xpath(
                'possession_home',
                '//span[contains(@class, "possession")]/text()')
            statistics_loader.add_xpath(
                'possession_away',
                '//span[contains(@class, "possession")]/text()')
            statistics_loader.add_xpath(
                'shots_home',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Shots")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'shots_away',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Shots")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'cards_home',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Cards")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'cards_away',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Cards")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'corners_home',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Corners")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'corners_away',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Corners")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'fouls_home',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Fouls")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'fouls_away',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Fouls")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'offsides_home',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Offsides")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            statistics_loader.add_xpath(
                'offsides_away',
                '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Offsides")]'
                '/following-sibling::div[contains(@class, "bbox")]/span/text()'
            )
            yield statistics_loader.load_item()

示例#37

0

显示文件

文件： prueba-checkpoint.py 项目： arrpak/MasterDataScience-1

 def parse_items(self, response):
     
     item = ItemLoader(AirbnbItem(), response)
     item.add_xpath('tipo', '//*[@id-"summary"]/div/div/div[1]/div/div/div/div[2]/div[2]/div/div[1]/text()')
     item.add_xpath('capacidad', '//*[@id-"summary"]/div/div/div[1]/div/div/div/div[2]/div[2]/div/div[2]/text()', MapCompose(lambda i: i[0]))
     yield item.load_item()

示例#38

0

显示文件

    def parse_reply(self, response):
        '''
        parse reply to comments, root comment is added if flag
        '''
        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_value('reply_to', 'ROOT')
                new.add_xpath('text', './/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                yield new.load_item()
            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                yield new.load_item()

            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=100,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index']
                                     })
            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'
                    .format(response.meta['url']))
                yield scrapy.Request(
                    next_reply,
                    callback=self.parse_page,
                    meta={'index': response.meta['index'] + 1})

        elif response.meta['flag'] == 'back':
            #parse all comments
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                yield new.load_item()
            #keep going backwards
            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            self.logger.info('Back found, more nested comments')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=100,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index']
                                     })
            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to home page: {}'.
                    format(response.meta['url']))
                yield scrapy.Request(
                    next_reply,
                    callback=self.parse_page,
                    meta={'index': response.meta['index'] + 1})

示例#39

0

显示文件

 def parse_video(self, response):
     item = ItemLoader(Video(), response)
     item.add_xpath('titulo', '//h1/text()')
     item.add_xpath('fecha_de_publicacion',
                    '//span[@class="publish-date"]/text()')
     yield item.load_item()

示例#40

0

显示文件

文件： helpers.py 项目： uk-gov-mirror/nationalarchives.Discovery-Elasticsearch

def parse_title(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('TITLE', '//h1[contains(@class, "parchment")]//text()')
    return il.load_item()

示例#41

0

显示文件

文件： helpers.py 项目： uk-gov-mirror/nationalarchives.Discovery-Elasticsearch

def parse_keywords(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('KEYWORDS', '//td[contains(@class, "tabbody")]//ul/li/a/text()')
    return il.load_item()

示例#42

0

显示文件

    def parse(self, response):

        self.crawler.stats.set_value('pages_to_visit', len(self.urls))

        loader = ItemLoader(item=SofifaItem(), response=response)
        col_4_loader = loader.nested_xpath(
            ".//div[@class='column col-4 text-center']")

        # GENERAL PLAYER INFORMATION

        loader.add_xpath('id', ".//div[@class='info']/h1/text()")
        loader.add_xpath('name', ".//div[@class='info']/h1/text()")
        loader.add_xpath('full_name', ".//div[@class='meta']/text()")
        loader.add_xpath(
            'age',
            ".//div[@class='meta']/text()/following-sibling::text()[last()]")
        loader.add_xpath(
            'dob',
            ".//div[@class='meta']/text()/following-sibling::text()[last()]")
        loader.add_xpath(
            'height',
            ".//div[@class='meta']/text()/following-sibling::text()[last()]")
        loader.add_xpath(
            'weight',
            ".//div[@class='meta']/text()/following-sibling::text()[last()]")
        loader.add_xpath('nationality', ".//div[@class='meta']/a/@title")

        # GENERAL PLAYER STATS

        loader.add_xpath(
            'preferred_foot',
            "(.//label[text()='Preferred Foot']/following::text())[1]")
        loader.add_xpath(
            'international_reputation',
            "(.//label[text()='International Reputation']/following::text())[1]"
        )
        loader.add_xpath(
            'weak_foot', "(.//label[text()='Weak Foot']/following::text())[1]")
        loader.add_xpath(
            'skill_moves',
            "(.//label[text()='Skill Moves']/following::text())[1]")
        loader.add_xpath(
            'work_rate',
            "(.//label[text()='Work Rate']/following::span/text())[1]")
        loader.add_xpath(
            'body_type',
            "(.//label[text()='Body Type']/following::span/text())[1]")
        loader.add_xpath(
            'real_face',
            "(.//label[text()='Real Face']/following::span/text())[1]")

        # CLUB/TEAM INFORMATION

        col_4_loader.add_xpath(
            'value',
            "following::text()[contains(., 'Value')]/following::span[1]/text()"
        )
        col_4_loader.add_xpath(
            'wage',
            "following::text()[contains(., 'Wage')]/following::span[1]/text()")
        loader.add_xpath(
            'release_clause',
            "(.//label[text()='Release Clause']/following::span/text())[1]")
        loader.add_xpath('club_name', "(.//ul[@class='pl']//a/text())[1]")
        loader.add_xpath(
            'club_rating',
            ".//div[@class='column col-4'][3]/ul/li[2]/span/text()")
        loader.add_xpath(
            'club_position',
            "(.//label[text()='Position']/following::text()[1])[1]")
        loader.add_xpath(
            'club_jersey_number',
            "(.//label[text()='Jersey Number']/following::text()[1])[1]")
        loader.add_xpath('club_join_date',
                         ".//label[text()='Joined']/following::text()[1]")
        loader.add_xpath(
            'loaned_from',
            ".//label[text()='Loaned From']/following::a[1]/text()")
        loader.add_xpath(
            'club_contract_end_date',
            ".//label[text()='Contract Valid Until']/following::text()[1]")
        loader.add_xpath('team_name', "(.//ul[@class='pl']//a/text())[2]")
        loader.add_xpath(
            'team_rating',
            ".//div[@class='column col-4'][4]/ul/li[2]/span/text()")
        loader.add_xpath(
            'team_position',
            "(.//label[text()='Position']/following::text()[1])[2]")
        loader.add_xpath(
            'team_jersey_number',
            "(.//label[text()='Jersey Number']/following::text()[1])[2]")

        # PLAYER GAME STATS

        loader.add_xpath(
            'overall_rating', "(.//div[@class='column col-4 text-center']"
            "/preceding::text()[contains(.,'Overall Rating')])[2]/following::span[1]/text()"
        )
        col_4_loader.add_xpath(
            'potential_rating',
            "following::text()[contains(., 'Potential')]/following::span[1]"
            "/text()")
        loader.add_xpath('positions', ".//div[@class='meta']/span/text()")
        loader.add_xpath('unique_attributes', ".//div[@class='mt-2']/a/text()")

        if 'GK' in response.xpath(
                ".//div[@class='meta']/span/text()").getall():

            loader.add_xpath(
                'DIV',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'HAN',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'KIC',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'REF',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'SPD',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'POS',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )

        else:

            loader.add_xpath(
                'PAC',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'SHO',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'PAS',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'DRI',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'DEF',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )
            loader.add_xpath(
                'PHY',
                ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()"
            )

        # PLAYER DETAILED STATS

        loader.add_xpath('crossing', "(.//span[../span='Crossing']/text())[1]")
        loader.add_xpath('finishing',
                         "(.//span[../span='Finishing']/text())[1]")
        loader.add_xpath('heading_accuracy',
                         "(.//span[../span='Heading Accuracy']/text())[1]")
        loader.add_xpath('short_passing',
                         "(.//span[../span='Short Passing']/text())[1]")
        loader.add_xpath('volleys', "(.//span[../span='Volleys']/text())[1]")
        loader.add_xpath('aggression',
                         "(.//span[../span='Aggression']/text())[1]")
        loader.add_xpath('interceptions',
                         "(.//span[../span='Interceptions']/text())[1]")
        loader.add_xpath('positioning',
                         "(.//span[../span='Positioning']/text())[1]")
        loader.add_xpath('vision', "(.//span[../span='Vision']/text())[1]")
        loader.add_xpath('penalties',
                         "(.//span[../span='Penalties']/text())[1]")
        loader.add_xpath('composure',
                         ".//li[contains(text(), 'Composure')]/span/text()")
        loader.add_xpath('dribbling',
                         "(.//span[../span='Dribbling']/text())[1]")
        loader.add_xpath('curve', "(.//span[../span='Curve']/text())[1]")
        loader.add_xpath('fk_accuracy',
                         "(.//span[../span='FK Accuracy']/text())[1]")
        loader.add_xpath('long_passing',
                         "(.//span[../span='Long Passing']/text())[1]")
        loader.add_xpath('ball_control',
                         "(.//span[../span='Ball Control']/text())[1]")
        loader.add_xpath('marking', "(.//span[../span='Marking']/text())[1]")
        loader.add_xpath('standing_tackle',
                         "(.//span[../span='Standing Tackle']/text())[1]")
        loader.add_xpath('sliding_tackle',
                         "(.//span[../span='Sliding Tackle']/text())[1]")
        loader.add_xpath('acceleration',
                         "(.//span[../span='Acceleration']/text())[1]")
        loader.add_xpath('sprint_speed',
                         "(.//span[../span='Sprint Speed']/text())[1]")
        loader.add_xpath('agility', "(.//span[../span='Agility']/text())[1]")
        loader.add_xpath('reactions',
                         "(.//span[../span='Reactions']/text())[1]")
        loader.add_xpath('balance', "(.//span[../span='Balance']/text())[1]")
        loader.add_xpath('gk_diving',
                         ".//li[contains(text(), 'GK Diving')]/span/text()")
        loader.add_xpath('gk_handling',
                         ".//li[contains(text(), 'GK Handling')]/span/text()")
        loader.add_xpath('gk_kicking',
                         ".//li[contains(text(), 'GK Kicking')]/span/text()")
        loader.add_xpath(
            'gk_positioning',
            ".//li[contains(text(), 'GK Positioning')]/span/text()")
        loader.add_xpath('gk_reflexes',
                         ".//li[contains(text(), 'GK Reflexes')]/span/text()")
        loader.add_xpath('shot_power',
                         "(.//span[../span='Shot Power']/text())[1]")
        loader.add_xpath('jumping', "(.//span[../span='Jumping']/text())[1]")
        loader.add_xpath('stamina', "(.//span[../span='Stamina']/text())[1]")
        loader.add_xpath('strength', "(.//span[../span='Strength']/text())[1]")
        loader.add_xpath('long_shots',
                         "(.//span[../span='Long Shots']/text())[1]")
        loader.add_xpath(
            'traits',
            ".//h5[text()='Traits']/following-sibling::ul/li/span/text()")

        # PLAYER REAL OVERALL RATING (POSITIONAL STATS)

        loader.add_xpath('LS', "(.//div[../div='LS']/following::text())[1]")
        loader.add_xpath('ST', "(.//div[../div='ST']/following::text())[1]")
        loader.add_xpath('RS', "(.//div[../div='RS']/following::text())[1]")
        loader.add_xpath('LW', "(.//div[../div='LW']/following::text())[1]")
        loader.add_xpath('LF', "(.//div[../div='LF']/following::text())[1]")
        loader.add_xpath('CF', "(.//div[../div='CF']/following::text())[1]")
        loader.add_xpath('RF', "(.//div[../div='RF']/following::text())[1]")
        loader.add_xpath('RW', "(.//div[../div='RW']/following::text())[1]")
        loader.add_xpath('LAM', "(.//div[../div='LAM']/following::text())[1]")
        loader.add_xpath('CAM', "(.//div[../div='CAM']/following::text())[1]")
        loader.add_xpath('RAM', "(.//div[../div='RAM']/following::text())[1]")
        loader.add_xpath('LM', "(.//div[../div='LM']/following::text())[1]")
        loader.add_xpath('LCM', "(.//div[../div='LCM']/following::text())[1]")
        loader.add_xpath('CM', "(.//div[../div='CM']/following::text())[1]")
        loader.add_xpath('RCM', "(.//div[../div='RCM']/following::text())[1]")
        loader.add_xpath('RM', "(.//div[../div='RM']/following::text())[1]")
        loader.add_xpath('LWB', "(.//div[../div='LWB']/following::text())[1]")
        loader.add_xpath('LDM', "(.//div[../div='LDM']/following::text())[1]")
        loader.add_xpath('CDM', "(.//div[../div='CDM']/following::text())[1]")
        loader.add_xpath('RDM', "(.//div[../div='RDM']/following::text())[1]")
        loader.add_xpath('RWB', "(.//div[../div='RWB']/following::text())[1]")
        loader.add_xpath('LB', "(.//div[../div='LB']/following::text())[1]")
        loader.add_xpath('LCB', "(.//div[../div='LCB']/following::text())[1]")
        loader.add_xpath('CB', "(.//div[../div='CB']/following::text())[1]")
        loader.add_xpath('RCB', "(.//div[../div='RCB']/following::text())[1]")
        loader.add_xpath('RB', "(.//div[../div='RB']/following::text())[1]")

        # COMMUNITY INFORMATION

        loader.add_xpath(
            'followers',
            "(.//div[@class='operation mt-2']/a/text()[contains(.,'Follow')]"
            "/following::span)[1]/text()")
        loader.add_xpath(
            'likes',
            "(.//div[@class='operation mt-2']/a/text()[contains(.,'Like')]"
            "/following::span)[1]/text()")
        loader.add_xpath(
            'dislikes',
            "(.//div[@class='operation mt-2']/a/text()[contains(.,'Dislike')]"
            "/following::span)[1]/text()")

        # MEDIA

        loader.add_xpath('face_img', ".//div/div/article/div/img//@data-src")
        loader.add_xpath('flag_img', ".//div[@class='meta']/a/img/@data-src")
        loader.add_xpath('club_logo_img',
                         "(.//div/ul/li/figure/img/@data-src)[1]")
        loader.add_xpath('team_logo_img',
                         "(.//div/ul/li/figure/img/@data-src)[2]")

        self.logger.info(f'Parse function called on {response.url}')

        self.logger.info(
            f"Currently on page {self.crawler.stats.get_value('page_counter')} out of "
            f"{self.crawler.stats.get_value('pages_to_visit')}")

        # TODO: enable continued logging of page_counter after a pause/resume.
        self.crawler.stats.inc_value(key='page_counter', count=1, start=0)

        print(response.request.headers['User-Agent'])
        print(
            f"{self.crawler.stats.get_value('page_counter')} out of {self.crawler.stats.get_value('pages_to_visit')}"
        )

        yield loader.load_item()

示例#43

0

显示文件

文件： lianjia.py 项目： fan-qiang/real_estate

    def parse_content(self, response):
        item = ItemLoader(item=RealEstateItem(), response=response)

        item.add_value("id", str(uuid1()))

        item.add_value("domain", 'lianjia')

        # 简单的描述信息
        item.add_xpath("title", '//*[@class="header-title"]/text()')

        # 小区名称
        item.add_xpath("housing_estate",
                       "//*[@class='maininfo-estate-name']/a[1]/text()")

        # 房产总价万元
        item.add_xpath("price_num", '//*[@class="price-num"]/text()')

        # 小区地址
        item.add_xpath(
            "address",
            "//*[@class='item-cell maininfo-estate-address']/text()")

        # 房型介绍

        item.add_xpath(
            "rooms",
            '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[2]/ul/li[1]/span[2]/text()'
        )

        # 房源编号

        item.add_xpath(
            "house_code",
            '//*[@class="maininfo-minor maininfo-item"]/li[4]/span[2]/text()[1]'
        )

        # 抓取的url
        item.add_value("url", response.url)

        # 建筑面积

        item.add_xpath(
            "floorage",
            '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[2]/ul/li[3]/span[2]/text()'
        )

        # 装修 0-毛坯 1-简装 2 中等装修 3 精装

        item.add_xpath(
            "decoration_situation",
            '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[3]/ul/li[2]/span[2]/text()'
        )

        # 每平方米单价

        item.add_xpath("price_unit_num",
                       '//*[@class="price-unit-num"]/span/text()')

        # 楼层
        item.add_xpath(
            "floor",
            '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[3]/ul/li[1]/span[2]/text()'
        )

        # 房本年限
        item.add_xpath(
            "term",
            '//*[@id="js-baseinfo-header"]/div[1]/div[2]/div[2]/ul/li[2]/span[2]/text()'
        )

        # 建成时间

        item.add_xpath("year", '//*[@class="main-item u-tr"]/p[2]/text()')

        # 朝向

        item.add_xpath(
            "orientation",
            '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[3]/ul/li[3]/span[2]/text()[1]'
        )

        # 标签

        item.add_xpath(
            "tags",
            '//*[@id="js-baseinfo-header"]/div[1]/div[4]/div[2]/ul/li/span/text()'
        )

        # 城市名称

        item.add_value("city", "苏州")

        # 所处的区域

        item.add_value("district", "工业园区")
        # 数据创建时间

        item.add_value("create_time",
                       datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        return item.load_item()

示例#44

0

显示文件

文件： Spiders.py 项目： Vianyraj/callspiderproj

    def parse(self, response):
        print("typeeeeeeeeeeeeeeeeeeeeeee", response)
        print("typeeeeeeeeeeeeeeeeeeeeeee", type(response))
        # if response.xpath(self.DateXpath).get() is None \
        #     or response.xpath(self.UrlXpath).get() is None\
        #     or response.xpath(self.DeadlineXpath).get() is None\
        #     or response.xpath(self.PlaceXpath).get()  is None \
        #     or response.xpath(self.TitleXpath).get() is None    :
        #
        #     raise ValueError("Wiki webpage has changed")
        if response.xpath(self.DateXpath).get() is None:
            raise ValueError("the DateXpath of Wiki webpage has changed")
        if response.xpath(self.UrlXpath).get() is None:
            raise ValueError("the UrlXpath of Wiki webpage has changed")
        if response.xpath(self.DeadlineXpath).get() is None:
            raise ValueError("the DeadlineXpath of Wiki webpage has changed")
        if response.xpath(self.PlaceXpath).get() is None:
            raise ValueError("the PlaceXpath of Wiki webpage has changed")
        if response.xpath(self.TitleXpath).get() is None:
            raise ValueError("the TitleXpath of Wiki webpage has changed")

        loader = ItemLoader(item=EventItemWiki(),
                            response=response)  #loader of type EventItemWiki
        loader.add_xpath("DateWiki", self.DateXpath)
        loader.add_xpath("UrlWiki", self.UrlXpath)
        loader.add_xpath("DeadlineWiki", self.DeadlineXpath)
        loader.add_xpath("PlaceWiki", self.PlaceXpath)
        j = 0
        for i in range(int(len(loader.load_item()["PlaceWiki"]))):
            title = response.xpath(self.TitleXpath)[j + 1].extract()
            loader.add_value("TitleWiki", title)
            j = j + 5

        item = loader.load_item()  # item is loaded with loader data dictionary

        # if "TitleWiki" not in item or "DateWiki" not in item or "UrlWiki" not in item or "DeadlineWiki" not in item or "PlaceWiki" not in item: #you do it
        #     print("item is not loaded properly")
        #     raise ValueError
        if "TitleWiki" not in item:
            raise ValueError("TitleWiki item is not loaded")
        if "DateWiki" not in item:
            raise ValueError("DateWiki item is not loaded")
        if "UrlWiki" not in item:
            raise ValueError("UrlWiki item is not loaded")
        if "DeadlineWiki" not in item:
            raise ValueError("DeadlineWiki item is not loaded")
        if "PlaceWiki" not in item:
            raise ValueError("PlaceWiki item is not loaded")

        #store crawled data to a dict, then yield to pipeline #calling pipelines 1 times
        mydict = {
            "title": [],
            "date": [],
            "url": [],
            "paperType": [],
            "where": [],
            "deadline": []
        }

        for k in range(int(len(item["TitleWiki"]))):
            mydict["title"].append(item["TitleWiki"][k])
            if item['DateWiki'][k] == "N/A" or item['DateWiki'][
                    k] == "TBD" or item["DateWiki"][k] == "Online" or item[
                        "DateWiki"][k] == "ONLINE":
                mydict["date"].append(None)
                #mydict["date"].append(item['DateWiki'][k].split("-")[0][0:12].rstrip())
            else:
                mydict["date"].append(
                    datetime.strptime(
                        item['DateWiki'][k].split("-")[0][0:12].rstrip(),
                        '%b %d, %Y'))

            mydict["url"].append('http://www.wikicfp.com' + item["UrlWiki"][k])
            mydict["paperType"].append("Wiki")
            mydict["where"].append(item["PlaceWiki"][k])

            if item["DeadlineWiki"][k] == "TBD" or item["DeadlineWiki"][
                    k] == "N/A":  #TBD
                mydict["deadline"].append(None)
            elif (len(item["DeadlineWiki"][k]) >= 12):  #with brackets
                mydict["deadline"].append(
                    datetime.strptime(
                        item['DeadlineWiki'][k].split("(")[0][0:12].rstrip(),
                        '%b %d, %Y'))

            else:  # normal case
                mydict["deadline"].append(
                    datetime.strptime(item['DeadlineWiki'][k], '%b %d, %Y'))

        #Calling pipeline
        myPipeline = ScrapyProjectPipeline()
        myPipeline.process_item(mydict, SpiderWiki)
        yield mydict  #calling pipelines 1 time

示例#45

0

显示文件

文件： despesas.py 项目： social-scrapy/cms

    def parse(self, response):

        try:

            if response.status == 404:

                self.append(self.bad_log_file, response.url)

            elif response.status == 200:

                selectors = response.xpath(
                    '//*[@id="ContentPlaceHolder1_UpdatePanel1"]/div')
                del selectors[:2]
                del selectors[-1]

                for divs in selectors:

                    #Parse despesas
                    l = ItemLoader(item=Despesa(), selector=divs)
                    l.add_xpath(
                        'data',
                        './b[contains(text(),"Data")]/following-sibling::text()[1]'
                        .encode('utf-8'), MapCompose(str.strip))
                    l.add_xpath(
                        'tipo',
                        './b[contains(text(),"Tipo")]/following-sibling::text()[1]'
                        .encode('utf-8'), MapCompose(str.strip))
                    l.add_xpath(
                        'responsavel',
                        u'./b[contains(text(),"Responsável")]/following-sibling::text()[1]',
                        MapCompose(str.strip))
                    l.add_xpath(
                        'usuario',
                        './b[contains(text(),"Usuário")]/following-sibling::text()[1]',
                        MapCompose(str.strip))
                    l.add_xpath(
                        'valor',
                        './b[contains(text(),"Valor")]/following-sibling::text()[1]'
                        .encode('utf-8'), MapCompose(str.strip))
                    l.add_xpath(
                        'localidade',
                        './b[contains(text(),"Localidade")]/following-sibling::text()[1]'
                        .encode('utf-8'), MapCompose(str.strip))
                    l.add_xpath(
                        'justificativa',
                        './b[contains(text(),"Justificativa")]/following-sibling::text()[1]'
                        .encode('utf-8'),
                        MapCompose(str.strip, remove_tags,
                                   replace_escape_chars, remove_comments))

                    yield l.load_item()

            else:

                self.append(self.bad_log_file, response.url)

        except Exception as e:

            self.log('[exception] : %s' % e)

        #Post request pagination
        yield scrapy.FormRequest.from_response(
            response,
            url="http://www.cms.ba.gov.br/despesa.aspx/",
            formdata={
                '__EVENTTARGET':
                'ctl00$ContentPlaceHolder1$dpNoticia$ctl02$ctl00',
                'ctl00$ContentPlaceHolder1$dpNoticia$ctl02$ctl00':
                'ctl00$ContentPlaceHolder1$UpdatePanel1|ctl00$ContentPlaceHolder1$dpNoticia$ctl02$ctl00'
            },
            callback=self.parse)

示例#46

0

显示文件

文件： yahoo_finance.py 项目： jaysuan/yahoo-finance-crawler

    def parse_statistics(self, response):
        driver = response.meta['driver']
        nav_urls = response.meta['nav_urls']
        parent_loader = response.meta['loader']
        loader = ItemLoader(parent=parent_loader, response=response)
        fiftytwo_week_high = response.xpath(
            "//tr/td/span[text()='52 Week High']/parent::td/following-sibling::td[1]/text()"
        ).get()
        loader.add_value('fiftytwo_week_high', fiftytwo_week_high)
        previous_close = locale.atof(loader.get_output_value('previous_close'))
        one_year_target_est = locale.atof(
            loader.get_output_value('one_year_target_est'))
        diff_to_52_week_high = 1 - (previous_close -
                                    locale.atof(fiftytwo_week_high))
        diff_to_1y_target_est = 1 - (one_year_target_est - previous_close)
        loader.add_value(
            'diff_to_52_week_high',
            f"{self._round_off_2_decimal(diff_to_52_week_high)}%")
        loader.add_value(
            'diff_to_1y_target_est',
            f"{self._round_off_2_decimal(diff_to_1y_target_est)}%")

        forward_pe = self._wait_and_find_elem(
            driver,
            "//tr/td/span[text()='Forward P/E']/parent::td/following-sibling::td[1]"
        ).text
        loader.add_xpath('forward_pe', forward_pe)
        market_cap = response.xpath(
            "//tr/td/span[contains(text(), 'Market Cap')]/parent::td/following-sibling::td[1]/text()"
        ).get()
        unit = market_cap[-1]
        if unit == 'B':
            multiplier = 1000
        elif unit == 'T':
            multiplier = 1000000
        else:
            multiplier = 1
        market_cap = float(market_cap[0:-1]) * multiplier
        loader.add_value('market_cap', market_cap)
        peg_ratio = response.xpath("//tr/td/span[contains(text(), 'PEG Ratio')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'PEG Ratio')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('peg_ratio', peg_ratio)
        loader.add_xpath(
            'price_over_sales',
            "//tr/td/span[contains(text(), 'Price/Sales')]/parent::td/following-sibling::td[1]/text()"
        )
        price_over_book = response.xpath("//tr/td/span[contains(text(), 'Price/Book')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Price/Book')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('price_over_book', price_over_book)
        return_on_assets = response.xpath("//tr/td/span[contains(text(), 'Return on Assets')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Return on Assets')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('return_on_assets', return_on_assets)
        return_on_equity = response.xpath("//tr/td/span[contains(text(), 'Return on Equity')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Return on Equity')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('return_on_equity', return_on_equity)
        loader.add_xpath(
            'diluted_eps',
            "//tr/td/span[contains(text(), 'Diluted EPS')]/parent::td/following-sibling::td[1]/text()"
        )
        quarterly_earnings_growth = response.xpath("//tr/td/span[contains(text(), 'Quarterly Earnings Growth')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Quarterly Earnings Growth')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('quarterly_earnings_growth',
                         quarterly_earnings_growth)
        fwd_annual_dividend_rate = response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Rate')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Rate')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('fwd_annual_dividend_rate', fwd_annual_dividend_rate)
        fwd_annual_dividend_yield = response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Yield')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Yield')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('fwd_annual_dividend_yield',
                         fwd_annual_dividend_yield)
        ex_dividend_date = response.xpath("//tr/td/span[contains(text(), 'Ex-Dividend Date')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Ex-Dividend Date')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('ex_dividend_date', ex_dividend_date)

        yield SeleniumRequest(url=nav_urls['profile_url'],
                              callback=self.parse_profile,
                              previous_response=response,
                              meta={
                                  "loader": loader,
                                  "nav_urls": nav_urls
                              })

示例#47

0

显示文件

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        i = 0
        while (1):
            i += 1
            if len(response.xpath(
                    '(//*[@class="team-name"])[%d]/text()' % i)) < 1:
                break
            else:
                print("name: %s" % response.xpath(
                    '(//*[@class="team-name"])[%d]/text()' % i).extract())
                # print("web: %s" % response.xpath('//html').extract())
                l = ItemLoader(item=GameRecord(), response=response)
                l.add_xpath('name1',
                            '(//*[@class="team-name"])[%d]/text()' % i)
                l.add_xpath('name2',
                            '(//*[@class="team-name f-toe"])[%d]/text()' % i)
                l.add_xpath('time', '(//*[@class="td time"])[%d]/text()' % i)
                l.add_xpath('series',
                            '(//*[@class="f-toe f-csp"])[%d]/text()' % i)
                l.add_xpath(
                    'score1',
                    '(//*[@class="vs-data f-csp"])[%d]/@data-matchhomescore' %
                    i, MapCompose(int))
                # l.add_xpath('score1', '(//*[@class="td vs"])[%d]/a/@data-matchhomescore' % i)
                l.add_xpath(
                    'score2',
                    '(//*[@class="vs-data f-csp"])[%d]/@data-matchawayscore' %
                    i, MapCompose(int))

                l.add_value('last_updated',
                            'today')  # you can also use literal values
                item.append(l.load_item())
                l.add_value('url', response.url)
                l.add_value('spider', self.name)
                l.add_value('server', socket.gethostname())
                l.add_value('date', datetime.datetime.now())
        return item

示例#48

0

显示文件

    def parse_item(self, response):
        url = response.url
        item_list = item_code(url, self.web_name, 'id=(.*?)$')
        print(item_list)
        item = ItemLoader(item=GdSz6652Item(), response=response)
        item.add_value('web_name', self.web_name)
        item.add_value('web_code', self.name)
        item.add_value('url', url)
        item.add_value('item_code', item_list.get('item_code'))
        item.add_css('title', '.title.border-bottom-light::text')
        item.add_xpath(
            'amount',
            '//*[contains(text(),"借款金额")]/following-sibling::td[1]/text()')
        item.add_xpath('rate',
                       '//*[contains(text(),"历史年化结算利率")]/../span[1]/text()')
        item.add_xpath('period',
                       '//*[contains(text(),"借款期限（天）")]/../span[1]/text()')
        item.add_xpath(
            'loan_using',
            '//*[contains(text(),"借款用途")]/following-sibling::td[1]/text()')
        # item.add_xpath('loaner_info', '//*[@id="userName"]')
        item.add_xpath('pay_type', '//*[contains(text(),"回款方式")]/text()')
        item.add_xpath('progress',
                       '//*[contains(text(),"剩余可出借金额（元）")]/../span[1]/text()')

        # invest records
        i_v = []
        invest_records_temp = '{{username={lst[0]}|rate=-1|postmoney={lst[1]}|money={lst[1]}|postdate={lst[2]}|status=全部通过}}'
        invest_records_format = ""
        tr = response.css('#investRecordWrap').css('tr')
        try:
            for i in tr:
                lst = i.css('td::text').extract()
                if lst:
                    i_v.append(lst)
            for n in i_v:
                invest_records_format += invest_records_temp.format(lst=n)
            item.add_value('invest_records', invest_records_format)
            item.add_value('start', i_v[-1][2])
            item.add_value('end', i_v[0][2])
        except Exception:
            print(url, 'invest records is error')
        yield item.load_item()

示例#49

0

显示文件

文件： basic.py 项目： Markovicc/Serbian_parl_tw_conn

    def parse(self, response):

        l = ItemLoader(item=ParlamentItem(), response=response)
        l.add_xpath(
            'ime', '//h2/text()',
            MapCompose(lambda i: i.replace('\n', ''), str.strip,
                       str.capitalize))
        l.add_xpath('prezime', '//h2/span/text()', MapCompose(str.capitalize))
        l.add_xpath(
            'stranka',
            '//h4[contains(text(), "stranka")]/following::p[1]/text()',
            MapCompose(
                lambda i: i.replace('\n', ''),
                str.strip,
            ))
        l.add_xpath(
            'posl_grupa',
            '//h4[contains(text(), "grupa")]/following::p[1]/a/text()')
        l.add_xpath('mesto',
                    '//h4[contains(text(), "Mesto")]/following::p[1]/text()')
        l.add_xpath(
            'zanimanje',
            '//h4[contains(text(), "Zanimanje")]/following::p[1]/text()')
        l.add_xpath('godina',
                    '//h4[contains(text(), "Godina")]/following::p[1]/text()')
        l.add_xpath('foto', '//div[@class = "image_holder left"]/img/@src')
        l.add_xpath('twitter', '//ul[@class = "social-list"]/li[1]/a/@href')
        l.add_xpath('facebook', '//ul[@class = "social-list"]/li[2]/a/@href')

        return l.load_item()

示例#50

0

显示文件

文件： zhihucrawl.py 项目： 1097150187/zhihu_crawler

    def parse_question(self, response):
        item_loader = ItemLoader(item=ZhihuItemQuestion(), response=response)
        item_loader.add_value("zhihu_id", response.meta.get("question_id"))
        item_loader.add_value("url", response.url)
        item_loader.add_xpath("title",
                              "//h1[@class='QuestionHeader-title']//text()")
        item_loader.add_xpath("main_content",
                              "//div[@class='QuestionHeader-detail']//text()")
        item_loader.add_xpath("tag",
                              "//div[@class='QuestionHeader-topics']//text()")
        item_loader.add_xpath(
            "focus_num",
            "//button[@class='Button NumberBoard-item Button--plain']//strong//text()"
        )
        item_loader.add_xpath(
            "click_num", "//div[@class='NumberBoard-item']//strong//text()")
        item_loader.add_xpath(
            "comment_num",
            "normalize-space(//div[@class='QuestionHeader-Comment']/button/text()[1])"
        )
        item_loader.add_xpath(
            "answer_num",
            "normalize-space(//div[@class='List-header']//span//text()[1])")

        question_item = item_loader.load_item()

        yield scrapy.Request(url=self.start_answer_url.format(
            response.meta.get("question_id"), 20, 0),
                             callback=self.parse_answer)
        yield question_item

示例#51

0

显示文件

 def parse_article(self, response):
     loader = ItemLoader(item=ClaimsItem(), response=response)
     loader.add_xpath("text", '//div[@class="claim"]/p')
     loader.add_xpath("rating", '//h5[starts-with(@class,"rating-label")]')
     loader.add_value("fact_check", response.url)
     yield loader.load_item()

示例#52

0

显示文件

文件： Frio.py 项目： camgrafiman/decathlonbot

    def parse(self, response):

        sel = Selector(response)
        productos = sel.xpath('//div[@id="js-product-wrapper"]/article')

        # sel.css también puede ser usado.
        # iterar sobre todos los productos:
        for i, elem in enumerate(productos):
            item = ItemLoader(Producto(), elem)
            item.add_xpath(
                # 'imagen', './div[@class="dkt-product__gallery"]/div/div/div/div/picture/source[5]/@srcset')
                # 'imagen', './div/div/div/div/div/picture/source[position()=4]/@srcset')
                'imagen',
                './div[@class="dkt-product__gallery"]/div/div/div[position()=1]/div/picture/source/source/source/source/source/@srcset'
            )
            item.add_xpath(
                'titulo',
                'normalize-space(div[@class="dkt-product__infos-wrapper"]/div[@class="dkt-product__infos__link"]/div/div/a/h2/text())'
            )
            item.add_xpath(
                # 'precio', './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/div/@data-price')
                'precio',
                './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/div[@class="dkt-price__cartridge"]/@data-price'
                or
                './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/div/@data-price'
            )
            item.add_xpath(
                'precio_a',
                'normalize-space(.//div[@class="dkt-price__cartridge"]/text())'
            )
            item.add_xpath(
                'precio_b',
                'normalize-space(.//div[@class="dkt-price__cartridge"]/sup/text())'
            )
            item.add_xpath(
                'precio_previo',
                './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/span/span[position()=1]/text()'
            )
            item.add_xpath(
                'reduccion',
                './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/span/span[position()=2]/text()'
            )
            item.add_xpath(
                'marca',
                './div[@class="dkt-product__infos-wrapper"]/div/div/div/span/span/text()'
            )
            item.add_xpath(
                'url',
                './div[@class="dkt-product__infos-wrapper"]/div[@class="dkt-product__infos__link"]/div/div/a/@href'
            )
            item.add_xpath(
                'rating',
                './div[@class="dkt-product__infos-wrapper"]/div/div/span[@itemprop="ratingValue"]/text()'
            )
            item.add_xpath(
                'review',
                './div[@class="dkt-product__infos-wrapper"]/div/div/span[@itemprop="reviewCount"]/text()'
            )
            item.add_xpath(
                'modelId',
                './div[@class="dkt-product__gallery"]/div/div[position()=1]/div[position()=1]/@data-modelid'
            )
            item.add_value('id', i + random.randrange(10, 4000000))
            item.add_value('control_type', 'A')

            yield item.load_item()
        # Paginacion con el botón más productos:
        boton_next = response.css('button #more_product_a').extract_first()
        if boton_next:
            boton_next = response.urljoin(boton_next)
            # ahora repetir el proceso en la nueva url con la funcion parse
            yield scrapy.Request(url=boton_next, callback=self.parse)

示例#53

0

显示文件

    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})

        #load following page
        #tries to click on "more", otherwise it looks for the appropriate
        #year for 1-click only and proceeds to click on others
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()
        if not new_page:
            if response.meta['flag'] == self.k and self.k >= self.year:
                self.logger.info('There are no more, flag set at = {}'.format(
                    self.k))
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Everything OK, new flag: {}'.format(
                        self.k))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:  #sometimes the years are skipped
                        self.logger.info(
                            'XPATH not found for year {}'.format(self.k - 1))
                        self.k -= 1
                        self.logger.info(
                            'Trying with previous year, flag={}'.format(
                                self.k))
                        if self.k < self.year:
                            self.logger.info(
                                'The previous year to crawl is less than the parameter year: {} < {}'
                                .format(self.k, self.year))
                            self.logger.info(
                                'This is not handled well, please re-run with -a year="{}" or less'
                                .format(self.k))
                            break
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('New page found with flag {}'.format(
                        self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, click on more! flag = {}'.format(
                        response.meta['flag']))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
                self.logger.info(
                    'First page scraped, click on more! Flag not set, default flag = {}'
                    .format(self.k))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})

示例#54

0

显示文件

文件： fb.py 项目： gwrxuk/20181113

    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #        #open page in browser for debug
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)

            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date
            print(post)
            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month,
                                        date.day) if date is not None else date
                date = str(date)
            print(current_date)
            #if 'date' argument is reached stop crawling
            #if self.date > current_date:
            #    raise CloseSpider('Reached date: {}'.format(self.date))
            print("stop2")
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_value('date', date)
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            #page_url #new.add_value('url',response.url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])

            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})

        #load following page, try to click on "more"
        #after few pages have been scraped, the "more" link might disappears
        #if not present look for the highest year not parsed yet
        #click once on the year and go back to clicking "more"

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()
            #this is why lang is needed                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^

        if not new_page:

            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if 'flag' in response.meta and response.meta[
                    'flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:  #sometimes the years are skipped this handles small year gaps
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.
                    format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})

示例#55

0

显示文件

    def parse_course(self, response):
        course_details = response.xpath('//form[@id="formCourseSearchDetails"]/div[contains(@id, "courseProfilePanel_")]')
        if not course_details:
            return False

        l = ItemLoader(WcsuwocaCourseItem(), response=response)
        l.default_output_processor = TakeFirst()

        l.add_value('institution_name', 'Western Continuing Studies')
        l.add_value('course_code', response.meta['course_code'])
        l.add_value('course_name', response.meta['course_name'])
        l.add_xpath('delivery_types', '//div[@class="courseProfileInstructionMethods"]/span[not(@class)]/span/text()')
        l.add_value('url', response.url)
        l.add_xpath('description', 'string(//div[@id="courseProfileOfficialCourseDescription"])')
        # l.add_value('subject', response.meta['subject'])
        l.add_value('subject', response.meta['program'])

        course_section = course_details.xpath('.//div[contains(@id, "courseSectionPanel_")]')
        if not course_section:
            return False
        course_data = course_section[0]

        # price = course_data.xpath('.//td[@class="tuitionProfileFees"]/text()').get()
        # price = course_data.xpath('.//tr[descendant::a[contains(., "Course")] and td[span[contains(., "") and @class="creditType" and contains(., "non-credit")]]]/td[@class="tuitionProfileFees"]/text()').get()
        price = course_data.xpath('.//td[@class="tuitionProfileFees"]/text()')
        if price:
            # prices = [p.strip().lstrip('$') for p in price.getall()]
            prices = list(map(lambda x: ', '.join(x), [re.findall(r'\d*\,?\d+\.\d{2}', p) for p in price.getall()]))
            prices = ', '.join(prices)
            prices = prices.split(', ')

            price = '0.0'
            for price_val in prices:
                try:
                    check_zerro_price = float(price_val.replace(',', ''))
                    if check_zerro_price:
                        price = price_val
                        break
                except ValueError:
                    continue

            # price = price.strip().lstrip('$')
        else:
            return False
            # price = '0.0'

        # # Skip courses with price $0.00
        # try:
        #     check_zerro_price = float(price.replace(',', ''))
        # except ValueError:
        #     check_zerro_price = False
        # if not check_zerro_price:
        #     return False
        l.add_value('price', [price])

        weekdays = course_data.xpath('string(.//div[contains(@class, "sectionScheduleMeetingDays")]//div[contains(@class, "content")])').get()
        if weekdays:
            weekdays = weekdays.strip()
            weekdays = re.sub(r'\s+', '', weekdays)
            weekdays = weekdays.split(',')
        else:
            weekdays = []
        l.add_value('days', [weekdays])
        # l.add_value('program', 'Continuing Education')
        # l.add_value('program', response.meta['program'])
        l.add_xpath('program', '//div[@id="courseProfileCertificates"]//li/a/text()')

        duration_hours_list = course_data.xpath('string(.//div[contains(@class, "section sectionScheduleMeetingTime")]//div[contains(@class, "content")])').get()
        if duration_hours_list:
            duration_hours_list = re.findall(r'\d{1,2}:\d{1,2}\w{2}', duration_hours_list)
            duration_hours_list = [t.lower() for t in duration_hours_list]
        else:
            duration_hours_list = []
        l.add_value('duration_hours', [duration_hours_list])
        l.add_value('duration_days_week', l.get_collected_values('days'))

        duration_month_list = course_data.xpath('string(.//div[contains(@class, "section sectionScheduleMeetingDates")]//div[contains(@class, "content")])').get()
        if duration_month_list:
            duration_month_list = re.findall(r'\w+\s\d{1,2},\s\d{4}', duration_month_list)
            if len(duration_month_list) == 2:
                duration_month_list = [datetime.strptime(d, '%b %d, %Y') for d in duration_month_list]

            if len(duration_month_list) == 1:
                duration_month_list = [datetime.strptime(duration_month_list[0], '%b %d, %Y')]
        else:
            duration_month_list = [None]
        l.add_value('duration_months', [duration_month_list])
        l.add_value('duration_as_string', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            l.get_collected_values('duration_months'),
        ])

        hours_site = course_data.xpath('string(.//div[contains(@class, "sectionContactHours")]//div[contains(@class, "content")])').get()
        if hours_site:
            hours_site = hours_site.strip()

        l.add_value('total_hours', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            hours_site,
        ])

        yield l.load_item()

示例#56

0

显示文件

文件： fb.py 项目： gwrxuk/20181113

 def parse_reactions(self, response):
     new = ItemLoader(item=FbcrawlItem(),
                      response=response,
                      parent=response.meta['item'])
     new.context['lang'] = self.lang
     new.add_xpath('likes',
                   "//a[contains(@href,'reaction_type=1')]/span/text()")
     new.add_xpath('ahah',
                   "//a[contains(@href,'reaction_type=4')]/span/text()")
     new.add_xpath('love',
                   "//a[contains(@href,'reaction_type=2')]/span/text()")
     new.add_xpath('wow',
                   "//a[contains(@href,'reaction_type=3')]/span/text()")
     new.add_xpath('sigh',
                   "//a[contains(@href,'reaction_type=7')]/span/text()")
     new.add_xpath('grrr',
                   "//a[contains(@href,'reaction_type=8')]/span/text()")
     print(new.load_item())
     yield (new.load_item())

示例#57

0

显示文件

    def parse_season(self, response, **kwargs):
        """
        Parses page with particular league season. Creates `league` item and
        fills it with data parsed from returned content.
        """
        loader = ItemLoader(item=LeagueItem(), response=response)
        loader.add_value('id', uuid4())
        loader.add_xpath(
            'title',
            '//div[@id="teamSummary"]/h1[contains(@class, "teamName")]/text()')
        loader.add_value('country_id', kwargs.get('country_id'))
        loader.add_xpath(
            'teams_count',
            ('//div[@class="league-details"]/div[@class="detail"]'
             '/div[contains(., "Teams")]/following-sibling::div/text()'))
        loader.add_xpath(
            'season_start',
            ('//div[@class="league-details"]/div[@class="detail season"]'
             '/div[contains(., "Season")]/following-sibling::div/text()'))
        loader.add_xpath(
            'season_end',
            ('//div[@class="league-details"]/div[@class="detail season"]'
             '/div[contains(., "Season")]/following-sibling::div/text()'))
        loader.add_xpath('all_matches_count',
                         ('//div[@class="league-details"]/div[@class="detail"]'
                          '/following-sibling::div[contains(., "Matches")]'
                          '/div[@class="w65 fl boldFont"]/text()'))
        loader.add_xpath('image_url', '//div[@id="teamSummary"]/img/@src')
        league = loader.load_item()

        matches = response.xpath(
            '//div[@id="teamSummary"]/ul[contains(@class, "secondary-nav")]/'
            'li[contains(@class, "middle")]/a')  # selector
        href = matches.xpath('@href').get()

        # matches for this league available for premium account
        league['blocked'] = True if href else False
        yield league

        if href == '#':
            # parameters for urls query string
            params = {
                'hash': matches.attrib['data-hash'],
                'zzz': matches.attrib['data-zzz'],
                'cur': matches.attrib['data-z']
            }

            yield FormRequest(url=self.make_url('ajax_league.php'),
                              method='POST',
                              formdata=params,
                              callback=self.parse_matches,
                              cb_kwargs={'league_id': league['id']})
        elif href and href != '#':
            yield response.follow(url=href,
                                  callback=self.parse_matches,
                                  cb_kwargs={'league_id': league['id']})

示例#58

0

显示文件

 def parse_news(self, response):
     item = ItemLoader(Articulo(), response)
     item.add_xpath('titulo', '//h1/text()')
     item.add_xpath('contenido', '//div[@id="id_text"]//*/text()')
     yield item.load_item()

示例#59

0

显示文件

    def parse(self, response):

        l = ItemLoader(item=EbookItem(), response=response)

        # Primary Fields
        l.add_xpath("title", "//header/h1/text()",
                    MapCompose(lambda i: i.strip()))
        # TODO add custom pipeline to append subtitle if key doesn't
        # l.add_xpath('subtitle',
        #        '//header/h4/text()',
        #        MapCompose(lambda i: i.strip()), default=' ')

        l.add_value("subtitle", "N/A")  # not all books have subtitles

        l.add_xpath(
            "image",
            '//img[contains(@class,"attachment-post-thumbnail")]/@src',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "author",
            '//div[contains(@class, "book-detail")]//dd[1]/a/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "isbin",
            '//div[contains(@class, "book-detail")]//dd[2]/text()',
            MapCompose(lambda i: i.strip(), lambda i: i.replace("-", "")),
        )
        l.add_xpath(
            "year",
            '//div[contains(@class, "book-detail")]//dd[3]/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "pages",
            '//div[contains(@class, "book-detail")]//dd[4]/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "language",
            '//div[contains(@class, "book-detail")]//dd[5]/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "file_size",
            '//div[contains(@class, "book-detail")]//dd[6]/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "file_format",
            '//div[contains(@class, "book-detail")]//dd[7]/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "category",
            '//div[contains(@class, "book-detail")]//dd[8]//a/text()',
            MapCompose(lambda i: i.strip()),
        )
        l.add_xpath(
            "description",
            '//div[contains(@class,"entry-content")]',
            MapCompose(
                lambda s: s.replace("\n", ""),
                lambda s: s.replace("\b", ""),
                lambda s: s.replace("\f", ""),
                lambda s: s.replace("\r", ""),
                lambda s: s.replace("\t", ""),
                lambda s: s.replace("\v", ""),
                lambda s: s.replace("\x00", ""),
                lambda i: i.strip(),
                # TODO check for other stray characters
            ),
        )
        l.add_xpath(
            "download_link",
            '//a[contains(@href,"file")]/@href',
            MapCompose(lambda s: s.replace(" ", "%20"), lambda i: i.strip()),
        )

        # TODO where to add Housekeeping Fields
        # l.add_value('url', response.url)
        # l.add_value('project', self.settings.get('BOT_NAME'))
        # l.add_value('spider', self.name)
        # l.add_value('server', socket.gethostname())
        # l.add_value('date', date.today())

        return l.load_item()

示例#60

0

显示文件

文件： fast.py 项目： December1208/scrapy_properties

    def parse_item(self, selector, response):
        """ This function parses a property page

        @url http://localhost:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        loader = ItemLoader(item=PropertiesItem(), selector=selector)
        loader.add_xpath('title', './/*[@itemprop="name"][1]/text()',
                         MapCompose(str.strip, str.title))
        loader.add_xpath('price',
                         './/*[@itemprop="price"][1]/text()',
                         MapCompose(lambda i: i.replace(',', ''), float),
                         re='[,.0-9]+')
        loader.add_xpath(
            'description', './/*[@itemprop="description"][1]/text()',
            MapCompose(str.strip, lambda i: i.replace('\r\n', ' ')))
        loader.add_xpath(
            'address', './/*[@itemtype="http://schema.org/Place"][1]/*/text()',
            MapCompose(str.strip))
        loader.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src',
                         MapCompose(lambda i: parse.urljoin(response.url, i)))

        loader.add_xpath('url', './/*[@itemprop="url"]/@href',
                         MapCompose(lambda i: parse.urljoin(response.url, i)))
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())
        yield loader.load_item()