def parse_titles(self, response):
    loader = ItemLoader(item=BlogCategory(), response=response)
    loader.add_value('hub', response.meta['hname'])
    loader.add_css('title', 'div.company_post h1 span::text')
    loader.add_css('date', 'div.published::text')
    loader.add_css('article', 'div.content::text')
    yield loader.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('url', response.url)
    l.add_value('name', self.name)
    l.add_xpath('image_urls', '//div[@class="l_effect_img_mid"]/a/img/@src')
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def _parse(self, response):
    l = ItemLoader(item=BookmarksItem(), response=response)
    l.add_xpath(u"name", u"/html/head/title")
    l.add_xpath(u"anchors", u"//a/@href")
    l.add_xpath(u"description", u"/html/body/text()")
    l.add_value(u"last_updated", datetime.datetime.now())  # you can also use literal values
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return l.load_item()
def parse_rate(self, response):
    loader = ItemLoader(item=RateItem(), response=response)
    for attr, xpath in self.settings.getdict('RATE_XPATH').items():
        loader.add_xpath(attr, xpath)
    return loader.load_item()
def parse(self, response):
    for item in self.find_items(response):
        loader = ItemLoader(item=self.item_class())
        for target in self.get_targets():
            loader.add_value(target.name, target.get_value(item, response))
        val = self.Meta.detail_path.get_value(item, response)
        yield gen_request(val, self.parse_details, loader.load_item())
def parse_content(self, response):
    bbsItem_loader = ItemLoader(item=BbsDmozItem(), response=response)
    url = str(response.url)
    bbsItem_loader.add_value('url', url)
    bbsItem_loader.add_xpath('forum', self._x_query['forum'])
    bbsItem_loader.add_xpath('poster', self._x_query['poster'])
    bbsItem_loader.add_xpath('content', self._x_query['page_content'])
    return bbsItem_loader.load_item()
def test_load_item_using_default_loader(self):
    i = TestItem()
    i['summary'] = u'lala'
    il = ItemLoader(item=i)
    il.add_value('name', u'marta')
    item = il.load_item()
    assert item is i
    self.assertEqual(item['summary'], u'lala')
    self.assertEqual(item['name'], [u'marta'])
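Since neither the loader nor the item above declares any processors, load_item() stores each collected value as a list, which is why item['name'] compares equal to [u'marta'] while the directly assigned summary passes through unchanged. A minimal sketch of an item definition this test could run against (the real TestItem may declare more fields):

from scrapy import Item, Field

class TestItem(Item):
    name = Field()     # populated through the loader, so it loads as a list
    summary = Field()  # set directly on the item, so it stays a plain string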
def parse(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].iteritems():
        if xpath:
            item.fields[name] = Field()
            l.add_xpath(name, xpath)
    return l.load_item()
def parse_detail(self, response):
    il = ItemLoader(NewsItem(), response=response)
    il.add_css("title", "%s::text" % self.title)
    il.add_css("date", "%s::text" % self.date)
    il.add_css("auth", "%s::text" % self.auth)
    il.add_css("content", "%s > p::text" % self.content)
    il.add_value("cate", response.meta["cate"])
    return il.load_item()
def parse(self, response):
    l = ItemLoader(item=PlantItem(), response=response)
    l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
    l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
    l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
    l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
    # l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/a/text()")
    return l.load_item()
def parse_stuff(self, response):
    hxs = Selector(response)
    sites = hxs.xpath('//body')
    items_main = []
    for site in sites:
        loader = ItemLoader(item=Items_Main(), response=response)
        loader.add_xpath('fragment', '//*[not(self::script)]/text()')
        items_main.append(loader.load_item())
    return items_main
def parse(self, response):
    l = ItemLoader(item=UniprotItem(), response=response)
    l.add_xpath('proteinName', "//*[@id='page-header']/h2/span/text()")
    l.add_value('uniprotAccession', response.url)
    l.add_xpath('uniprotProteinLength', "//*[@id='sequences-section']/div[1]/div[2]/div[1]/span[2]/text()")
    listing = response.xpath("//*[@id='subcellular_location']/div[1]/ul")
    subcellular_location = []
    for li in listing:
        subcellular_location.append(li.xpath("./li/a/text()").extract())
    l.add_value('uniprotLocalization', subcellular_location)
    yield l.load_item()
def parse_event_detail(self, response):
    event = response.meta['event']
    events = response.meta['events']
    players = response.xpath('//table[@class="sticky-enabled"]/tbody/tr')
    event_loader = ItemLoader(event)
    for player in players:
        event_loader.add_value('players', player.xpath('td/text()').extract())
    events.append(event_loader.load_item())
    return events
def print_url(self, response):
    """
    @url http://www.ura.org.hk/en/schemes-and-policies/redevelopment/ura-implemented-projects/reimbursement.aspx
    @returns items 1 1
    @returns requests 0 0
    @scrapes title link html text last_updated file_urls
    """
    l = ItemLoader(item=UrbanRenewalItem(), response=response)
    l.add_xpath('title', '//title')
    l.add_value('link', response.url)
    l.add_xpath('text', '//div[@id="content"]')
    l.add_xpath('last_updated', '//div[@class="lastUpdated"]')
    return l.load_item()
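The docstring above is a Scrapy contract: running `scrapy check` fetches the @url, executes this callback, and verifies the annotations. A sketch of the invocation, with a placeholder spider name:

# Assuming this callback lives on a spider registered in the project:
#   $ scrapy check urban_renewal_spider
# @returns items 1 1    -> exactly one item must be yielded
# @returns requests 0 0 -> no follow-up requests may be yielded
# @scrapes ...          -> the listed fields must be populated on the item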
def parse(self, response):
    l = ItemLoader(item=RentalItem(), response=response)
    l.add_xpath('price', '//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_xpath('adress', '//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    sel = Selector(response)
    last_page = sel.xpath('//span[@class="step-links"]/a/text()')[-1].extract()
    self.num_page = int(last_page)
    loader = ItemLoader(item=User(), response=response)
    loader.add_value('uid', self.uid)
    loader.add_xpath('name', '//a[@class="username"]/text()')
    for i in range(1, self.num_page + 1):
        url = self.start_urls[0] + '/' + str(i)
        yield Request(url, callback=self.parse_list, meta={'loader': loader})
def parse(self, response):
    if not self.fields:
        # init database fields from saved state
        self.fields = self.next_window()
    search_fos = urlparse.parse_qs(urlparse.urlparse(response.url).query)['mauthors'][0].split(':')[1]
    self.logger.debug('Search fos: %s' % search_fos)
    # get the first 10 author divs
    for divs in response.xpath('//div[@class="gsc_1usr gs_scl"]')[0:10]:
        user = divs.extract()
        # Content in the img's alt tag is the actual name, shown on the profile.
        # However, the name in the actual link sometimes differs slightly:
        # EH Roberts (link) instead of E H Roberts (on profile + alt)
        id = re.search('citations\?user=([^&]+)(&|)', user)
        name = re.search('alt="([^"]+)"', user)
        citecount = re.search('<div class="gsc_1usr_cby">.*([0-9]+)</div>', user)
        fostmp = re.findall('label:([^"]+)("|)', user)
        fos = [i[0] for i in fostmp]
        if id and name:
            item = ItemLoader(item=AuthorItem(), response=response)
            item.add_value('fos', fos)
            item.add_value('id', id.group(1))
            item.add_value('name', name.group(1))
            # unknown citation count:
            cited = citecount.group(1) if citecount else None
            item.add_value('cited', cited)
            yield item.load_item()
        # Also scrape fields of study while we are at it
        for f in fos:
            if f != search_fos:
                fos_item = FOSItem()
                fos_item['field_name'] = f
                yield fos_item
    # generate next url
    new1 = response.xpath('//*[@id="gsc_authors_bottom_pag"]/span/button[2]').extract_first()
    if new1:
        new2 = re.search('mauthors(.*)\'"', new1)
        if new2:
            newUrl = str(new2.group(1)).replace('\\x3d', '=').replace('\\x26', '&')
            newUrl = 'https://scholar.google.de/citations?view_op=search_authors&hl=de&mauthors' + newUrl
            self.container.append(newUrl)
    # proceed with another random url or label to randomize access pattern to gscholar
    next_url = self.choose_next()
    if next_url:
        yield Request(url=next_url)
def parse_first_page(self, response):
    count = int(response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".html", '')
    for x in xrange(1, count + 1):
        suffix = ".html"
        if x > 1:
            suffix = "_" + str(x) + ".html"
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    yield l.load_item()
def parse_content(self, response):
    logger.info('Dealing with images: %s', response.url)
    item_load = ItemLoader(item=ScrapyMeizituItem(), response=response)
    item_load.add_value('url', response.url)
    item_load.add_xpath('name', self._x_query['name'])
    item_load.add_xpath('tags', self._x_query['tags'])
    item_load.add_xpath('image_urls', self._x_query['image_urls'])
    return item_load.load_item()
def parse_depth_chart(self, response):
    loader = ItemLoader(item=NFL_Team_2015(), response=response)
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
    loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')
    yield loader.load_item()
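Assigning default_input_processor/default_output_processor per loader instance, as above, works, but the same defaults can be declared once on an ItemLoader subclass and reused across callbacks. A minimal sketch with an illustrative class name (the snippet above is Python 2, hence unicode.strip; on Python 3 with current Scrapy the processors live in the itemloaders package):

from itemloaders.processors import MapCompose, Join
from scrapy.loader import ItemLoader

class TeamLoader(ItemLoader):
    # applied to every field that does not declare its own processors
    default_input_processor = MapCompose(str.strip)
    default_output_processor = Join()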
def _set_loader(self, response, from_page, xs, item):
    self.from_page = from_page
    rpt = self.scraper.get_rpt(from_page)
    if not self.from_page == 'MP':
        item = response.request.meta['item']
        if rpt.content_type == 'J':
            json_resp = json.loads(response.body_as_unicode())
            self.loader = JsonItemLoader(item=item, selector=json_resp)
        else:
            self.loader = ItemLoader(item=item, response=response)
    else:
        if rpt.content_type == 'J':
            self.loader = JsonItemLoader(item=item, selector=xs)
        else:
            self.loader = ItemLoader(item=item, selector=xs)
    self.loader.default_output_processor = TakeFirst()
    self.loader.log = self.log
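JsonItemLoader is not a Scrapy built-in, so it must be defined elsewhere in this project. Purely as a hypothetical sketch, such a loader might resolve dotted key paths against the parsed-JSON object handed in as selector:

from scrapy.loader import ItemLoader

class JsonItemLoader(ItemLoader):
    # Hypothetical sketch only; the project's real class may differ substantially.
    def add_jpath(self, field_name, path, *processors, **kw):
        value = self.selector  # the json.loads() result passed as selector=
        for key in path.split('.'):
            value = value.get(key) if isinstance(value, dict) else None
        if value is not None:
            self.add_value(field_name, value, *processors, **kw)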
def parse(self, response):
    l = ItemLoader(item=NytimesItem(), response=response)
    l.add_xpath('topnews', '//*[contains(@id,"topnews-100")]/h2/a/text()')
    l.add_xpath('sectionnews', '//h3[contains(@class,"story-heading")]/text()')
    # print(type(l.load_item()))
    x = l.load_item()
    # print(len(x['date']), len(x['topnews']), len(x['sectionnews']))
    nytdict = dict()
    topnewslist = []
    sectionnewslist = []
    nytdict['date'] = str(datetime.date.today())
    for t in x['topnews']:
        topnewslist.append(str(t.encode('ascii', 'ignore')))
    nytdict['topnews'] = topnewslist
    for t in x['sectionnews']:
        sectionnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['sectionnews'] = sectionnewslist
    filename = datetime.date.today()
    with open('{}.json'.format(filename), 'w') as f:
        json.dump(nytdict, f)
    return l.load_item()
def parse(self, response):
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
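Note that the loader above is constructed with selector=inner rather than response=response, so every add_css/add_xpath is evaluated relative to the current cell instead of the whole page; this is the standard pattern for loading one item per repeated page element, and it recurs in several snippets below. A minimal sketch of the idea (item class and selectors are hypothetical):

def parse(self, response):
    for row in response.css('table tr'):
        loader = ItemLoader(item=MyItem(), selector=row)  # scoped to this row
        loader.add_css('name', 'td.name::text')           # relative CSS
        loader.add_xpath('price', './td[2]/text()')       # relative XPath
        yield loader.load_item()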
def parse_CatalogRecord(self, response):
    CatalogRecord = ItemLoader(item=catalogscraperItem(), response=response)
    CatalogRecord.default_output_processor = TakeFirst()
    keywords = '|'.join(r"\b" + re.escape(word.strip()) + r"\b"
                        for word in open('Catalog_Scraper/spiders/keys.txt'))
    r = re.compile('.*(%s).*' % keywords, re.IGNORECASE | re.MULTILINE | re.UNICODE)
    if r.search(response.body_as_unicode()):
        # The following lines tell the spider how to populate the fields defined in
        # "items.py". The first argument of "CatalogRecord.add_xpath" indicates which
        # field the spider should fill, while the second provides an XPath directing
        # the spider to where the relevant information is found on a given webpage.
        CatalogRecord.add_xpath('title', './/div[@id="dublin-core-title"]/div[@class="element-text"]/text()')
        # CatalogRecord.add_xpath('subject', '')
        # CatalogRecord.add_xpath('description', '')
        # CatalogRecord.add_xpath('creator', '')
        # CatalogRecord.add_xpath('source', '')
        # CatalogRecord.add_xpath('published', '')
        # CatalogRecord.add_xpath('rights', '')
        # CatalogRecord.add_xpath('citation', '')
        # CatalogRecord.add_xpath('url', '')
        return CatalogRecord.load_item()
def parse_details(self, response):
    item = response.meta["item"]
    urlLast = response.meta["urlLast"]
    loader = ItemLoader(item, response=response)
    loader.add_xpath("Description", "//*[@id='body']/p[3]/text()")
    loader.add_xpath("Education", "//td[. = 'Education Level (Highest Grade Completed)']/following-sibling::td[1]/text()")
    if urlLast.endswith("no_last_statement.html"):
        loader.add_value('Message', u'')
        return loader.load_item()
    else:
        request = scrapy.Request(urlLast, meta={"item": loader.load_item()}, callback=self.parse_details2)
        return request
def parse_first_page(self, response):
    count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".shtml", '')
    # print u'', count, title, albumURL
    for x in xrange(1, count + 1):
        suffix = ".shtml"
        if x > 1:
            suffix = "_" + str(x) + ".shtml"
        # print u'', albumURL + suffix
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield l.load_item()
def parse(self, response):
    def strip_dollar(x):
        return x.strip('$')

    self.driver.get(response.url)
    try:
        WebDriverWait(self.driver, 15).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="depart-container"]/div[2]/div[1]/div[@style="width: 0%;"]')))
    except TimeoutException:
        print 'Page load time out'
    while True:
        try:
            try:
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="depart-container"]/div/div/div/button')))
            except TimeoutException:
                break
            next = self.driver.find_element_by_xpath('//*[@id="depart-container"]/div/div/div/button')
            next.click()
        except ElementNotVisibleException:
            break
    for trips in Selector(text=self.driver.page_source).xpath(self.trips_list_xpath):
        loader = ItemLoader(BusTrip(), selector=trips)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.price_in = MapCompose(strip_dollar)
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        dateoftrip = str(response.url).split("/")[-1]
        loader.add_value('dateoftrip', dateoftrip.decode('unicode-escape'))
        yield loader.load_item()
def parse_item(self, response):
    l = ItemLoader(item=GetEmailsItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    emails = response.xpath('//text()').re(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
    l.add_value('email', emails)
    l.add_value('url', response.url)
    return l.load_item()
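replace_escape_chars in the default output processor above is defined not in Scrapy itself but in w3lib, a Scrapy dependency. The imports this method presumably relies on (older Scrapy exposed MapCompose under scrapy.loader.processors instead):

from itemloaders.processors import MapCompose
from scrapy.loader import ItemLoader
from w3lib.html import replace_escape_chars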
def parse(self, response):
    productos = response.css('div.product-tile-inner')
    promedio = 0.0
    num_items = 0
    for prod in productos:
        text_price = prod.css('.price::attr(data-bind)')
        # strip the Selector repr down to the bare number embedded in the data-bind attribute
        precio = (str(text_price)
                  .replace(").formatMoney(2, '.', '\">]", "")
                  .replace(").formatMoney(2, '.', ',\">]", "")
                  .replace("[<Selector xpath=\"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' price ')]/@data-bind\" data=\"text:'$' + (", ""))
        try:
            promedio = promedio + float(precio)
            num_items = num_items + 1
        except:
            print(precio)
    for producto in productos:
        existe_producto = len(producto.css('div.detail'))
        if existe_producto > 0:
            # titulo = producto.css('a.name::text')
            # url = producto.xpath('//div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src')
            producto_loader = ItemLoader(item=ProductoFybeca(), selector=producto)
            producto_loader.default_output_processor = TakeFirst()
            producto_loader.add_css('titulo', 'a.name::text')
            producto_loader.add_xpath('imagen', 'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src')
            producto_loader.add_value('promedio', promedio / num_items)
            producto_loader.add_css('precio', '.price::attr(data-bind)')
            # producto_imprimir = producto_loader.load_item()
            # print(producto_imprimir)
            yield producto_loader.load_item()
def parse(self, response):
    for i in range(1, 11):  # range to 10 because bing results are 10 per page
        for row in response.xpath("//li[@class='b_algo'][%s]" % i):
            l = ItemLoader(item=CandcrawlerItem(), selector=row)
            l.add_xpath("headline", "h2//text()")
            l.add_xpath("metadata", "div[@class='b_caption']/div[@class='b_factrow b_twofr']/div[@class='b_vlist2col']/ul/li/div//text()")
            l.add_xpath("li_url", "div[@class='b_caption']/div[@class='b_attribution']/cite/text()")
            l.add_xpath("summary", "div[@class='b_caption']/p//text()")
            l.add_xpath("search", "//div[@class='b_searchboxForm']/input/@value")
            l.add_value("link", response.request.url)
            # this is to get only the LinkedIn results
            if 'linkedin.com/in' in response.xpath("//li[@class='b_algo'][%s]/div[@class='b_caption']/div[@class='b_attribution']/cite/text()" % i).get():
                yield l.load_item()
            else:
                pass
    next_page = response.xpath("//li[@class='b_pag']/nav/ul/li/a[@aria-label='Page 2']/@href").get()
    if next_page is not None:
        next_page = "http://www.bing.com" + next_page
        yield response.follow(next_page, callback=self.parse)
def parse(self, response, **kwargs):
    loader = ItemLoader(item=YelpItem(), response=response)
    for script in response.css('script').getall():
        if '{"gaConfig' in script:
            detail_json = json.loads(re.search(r'({"gaConfig.*?)-->', script).group(1))
            loader.add_value('direct_url', detail_json['staticUrl'])
            loader.add_value('business_id', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessId'])
            loader.add_value('categories', detail_json['gaConfig']['dimensions']['www']['second_level_categories'][1])
            if detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']:
                loader.add_value('site', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']['linkText'])
            loader.add_value('title', detail_json['bizDetailsPageProps']['businessName'])
            loader.add_value('review_count', detail_json['bizDetailsPageProps']['ratingDetailsProps']['numReviews'])
    # TODO: find way to not use hardcoded documentIds
    post_data = [{"operationName": "getLocalBusinessJsonLinkedData",
                  "variables": {"BizEncId": "".join(loader.get_output_value('business_id'))},
                  "extensions": {"documentId": "1cf362b8e8f9b3dae26d9f55e7204acd8355c916348a038f913845670139f60a"}}]
    yield scrapy.Request('https://www.yelp.com/gql/batch', method='POST', body=json.dumps(post_data),
                         headers={'Content-Type': 'application/json'}, callback=self.linkedData,
                         meta={'item': loader.load_item()})
def parse_item(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    content = ''
    try:
        title = response.xpath(r'//*[@class="dbt"]//text()').extract()
        date = response.xpath(r'//*[@class="lf"]//text()').extract_first()
        if date is not None:
            date = date.split(" ")[0]
        else:
            date = '1970-01-01'
        content = response.xpath(r'//*[@class="nra"]//text() | //*[@class="bzzx_xjnr"]//text()').extract()
        loader.add_value('date', date)
        loader.add_value('title', title)
        loader.add_value('content', content)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('date', '1970-01-01')
        loader.add_value('title', 'unknown')
        loader.add_value('content', '')
    finally:
        self.logger.info("crawled url: %s" % response.url)
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value("website", self.website)
        if content == '':
            self.logger.warning(' url: %s msg: %s' % (response.url, ' content is None'))
    yield loader.load_item()
def populate_item(self, selector, url):
    item_loader = ItemLoader(item=MediumScraperItem(), selector=selector)
    item_loader.default_output_processor = TakeFirst()
    item_loader.add_xpath('author', './/a[@data-action="show-user-card"]/text()')
    item_loader.add_xpath('title', './/*[contains(@class, "title")]/text()')
    item_loader.add_xpath('title', './/h3[contains(@class, "title")]/*/text()')
    item_loader.add_xpath('subtitle_preview', './/*[@name="previewSubtitle"]/text()')
    item_loader.add_xpath('collection', './/a[@data-action="show-collection-card"]/text()')
    item_loader.add_xpath('read_time', './/*[@class="readingTime"]/@title')
    item_loader.add_xpath('claps', './/button[@data-action="show-recommends"]/text()')
    item_loader.add_xpath('responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
    item_loader.add_xpath('published_date', './/time/text()')
    item_loader.add_xpath('article_url', './/a[contains(@class, "button--smaller")]/@href')
    item_loader.add_value('scraped_date', datetime.now())
    return item_loader.load_item()
def parse(self, response):
    sites = response.xpath('//table/tbody/tr')
    for site in sites:
        url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
        urlLast = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
        item = DeathItem()
        loader = ItemLoader(item, selector=site)
        loader.add_xpath('Mid', 'td[1]/text()')
        loader.add_xpath('firstName', 'td[5]/text()')
        loader.add_xpath('lastName', 'td[4]/text()')
        loader.add_xpath('Date', 'td[8]/text()')
        loader.add_xpath('Race', 'td[9]/text()')
        loader.add_xpath('County', 'td[10]/text()')
        loader.add_xpath('Age', 'td[7]/text()')
        loader.add_value('OILink', url)
        loader.add_value('OLastStatement', urlLast)
        if url.endswith(("jpg", "no_info_available.html")):
            loader.add_value('Description', u'')
            loader.add_value('Education', u'')
            if urlLast.endswith("no_last_statement.html"):
                loader.add_value('Message', u'')
                yield loader.load_item()
            else:
                request = scrapy.Request(urlLast, meta={"item": loader.load_item()}, callback=self.parse_details2)
                yield request
        else:
            request = scrapy.Request(url, meta={"item": loader.load_item(), "urlLast": urlLast}, callback=self.parse_details)
            yield request
def parse_profile(self, response: Response):
    query = urllib.parse.urlparse(response.url).query
    id = int(urllib.parse.parse_qs(query)['id'][0])
    l = ItemLoader(item=UserItem(), response=response)
    l.add_value('id', id)
    l.add_xpath('name', '//div[@id="viewprofile"]//td[@id="profile-left"]/li[@id="profile-name"]/strong/text()')
    l.add_xpath('avatar_url', '//div[@id="viewprofile"]//td[@id="profile-left"]//img//@src')
    # the Cyrillic label text below means "Registered:"
    l.add_xpath('registration_date', '//div[@id="viewprofile"]//td[@id="profile-right"]//li/span[text()="Зарегистрирован:"]/following-sibling::strong/text()')
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        for attr in ['title', 'date', 'content']:
            function = getattr(self, 'get' + attr, None)
            if function:
                l.add_value(attr, function(response))
            else:
                self.logger.error('no method for %s' % attr)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
    return l.load_item()
def parse(self, response):
    curr_url = response.url
    key = get_schemenetloc(curr_url)
    if key in self.websites:
        rule = self.websites[key]
        item = ItemLoader(item=GovernwebcrawlerItem(), response=response)
        root = rule['root_div']
        title = rule['title']
        content = rule['content']
        time = rule['time']
        desc = rule['desc']
        item.add_xpath('title', root + title)
        item.add_xpath('time', root + time)
        item.add_xpath('content', root + content)
        item.add_value('url', curr_url)
        item.add_value('desc', desc)
        yield item.load_item()
    body = response.body
    content = body.decode('utf8', errors='ignore')
    results = Selector(text=content).xpath('//a').extract()
    for res in results:
        sel = Selector(text=res)
        url = sel.xpath('//a/@href').extract()
        name = sel.xpath('//a/text()').extract()
        if len(url) != 0:
            url = urljoin(curr_url, url[0])
            req = Request(url=url, callback=self.parse)
            if not url.endswith('.html'):
                req.meta['PhantomJS'] = True
            yield req
def parse_goods(self, response: HtmlResponse):
    loader = ItemLoader(item=LeroyItem(), response=response)
    loader.add_xpath('name', '//h1[@class="header-2"]/text()')
    loader.add_xpath('photos', '//uc-pdp-media-carousel//img[@slot="thumbs"]/@src')
    loader.add_xpath('params', '//dl[@class="def-list"]/div')
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//span[@slot="price"]/text()')
    yield loader.load_item()
def item_parse(self, response: HtmlResponse):
    loader = ItemLoader(item=LeroymerlinItem(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('parameters', '//div[@class="def-list__group"]')
    loader.add_xpath('photos', '//img[@itemprop="image"]/@src')
    loader.add_xpath('price', '//span[@slot="price"]/text()')
    loader.add_value('link', response.url)
    yield loader.load_item()
def parse_author(self, response):
    quote_item = response.meta['quote_item']
    loader = ItemLoader(item=quote_item, response=response)
    loader.add_css('author_name', '.author-title::text')
    loader.add_css('author_birthday', '.author-born-date::text')
    loader.add_css('author_bornlocation', '.author-born-location::text')
    loader.add_css('author_bio', '.author-description::text')
    yield loader.load_item()
def parse_content(self, response):
    print('in parseMore')

    def deal_publish_time(publish_time_raw):
        if type(publish_time_raw) == type([]):
            publish_time_raw_str = publish_time_raw.pop()
        else:
            publish_time_raw_str = publish_time_raw
        time_splited = publish_time_raw_str.split(',')
        year = str(time_splited[1]).strip()
        month_day = time_splited[0].split(' ')
        day = str(month_day[1]).strip()
        month = month_day[0]
        # map Chinese month names to two-digit month numbers
        month_dict = {
            u'一月': '01', u'二月': '02', u'三月': '03', u'四月': '04',
            u'五月': '05', u'六月': '06', u'七月': '07', u'八月': '08',
            u'九月': '09', u'十月': '10', u'十一月': '11', u'十二月': '12',
        }
        month_num_str = month_dict[month]
        if len(day) < 2:
            day = '0' + day
        publish_time_dealed = year + '-' + month_num_str + '-' + day + ' 00:00:00'
        return publish_time_dealed

    def deal_publish_user(publish_user_raw):
        if type(publish_user_raw) == type([]):
            if publish_user_raw:
                publish_user_name = publish_user_raw.pop()
            else:
                publish_user_name = ''
        else:
            publish_user_name = publish_user_raw
        return publish_user_name.strip()

    def deal_read_count(read_count_raw):
        if read_count_raw:  # this is always a list object
            read_count_str = read_count_raw.pop()
            read_count_str = str(read_count_str)
            read_count = str(read_count_str).replace('阅读次数:', '').replace(',', '')
            return int(read_count)
        else:
            return 0

    loader1 = ItemLoader(response=response, item=YfspiderspeakItem())
    loader1.add_value('url', response.url)
    loader1.add_value('id', response.url.split('/')[-1])
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//div[@id="main"]//h1[@class="entry-title"]/text()',
                      lambda x: x[0].strip())
    loader1.add_xpath('content', '//div[@id="main"]//div[@class="entry-content"]//text()',
                      lambda x: ''.join([oneP.strip() for oneP in x]))
    loader1.add_xpath('publish_time', '//div[@id="main"]//span[@class="date"]/text()',
                      deal_publish_time)
    loader1.add_xpath('publish_user', '//div[@id="main"]//span[@class="author"]//text()',
                      deal_publish_user)
    loader1.add_value('read_count',
                      response.xpath("//div[@id='content']/article/div[contains(@class,'tags')]//text()").re(u'阅读次数\:(.*)'),
                      deal_read_count)
    loader1.add_xpath('video_urls', '//div[@id="main"]//div[@class="entry-content"]//iframe/@src')
    loader1.add_xpath('img_urls', '//div[@id="main"]//div[@class="entry-content"]//img/@src')
    item1 = loader1.load_item()
    return item1
def parse_content_english(self, response):
    def deal_publish_time(publish_time):
        if publish_time:
            publish_time_split = publish_time[0].strip().split('/')
            return publish_time_split[2] + '-' + publish_time_split[1] + '-' + publish_time_split[0]
        else:
            return None

    loader1 = ItemLoader(response=response, item=YfspiderspeakItem())
    loader1.add_value('url', response.url)
    loader1.add_value('id', response.url.split('/')[-1])
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//div[@id="container"]//h1[@class="entry-title"]//text()',
                      lambda x: x[0].strip())
    loader1.add_xpath('content', '//div[@id="container"]//div[@class="entry-content"]//p//text()',
                      lambda x: ' '.join([oneP.strip() for oneP in x]))
    loader1.add_xpath('publish_time',
                      '//div[@id="container"]//div[@class="entry-meta"]//span[@class="entry-date"]/text()',
                      deal_publish_time)
    loader1.add_xpath('publish_user',
                      '//div[@id="container"]//div[@class="entry-meta"]//span[@class="author vcard"]/a/text()',
                      lambda x: x[0].strip() if x else None)
    loader1.add_value('read_count',
                      response.xpath('//div[@id="content"]/text()').re('^\s*\d+\s*'),
                      lambda x: x[0].strip() if x else 0)
    item1 = loader1.load_item()
    return item1
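A subtlety with Join: calling Join([...]) on data only constructs a processor object, it does not join anything; to merge extracted strings, either join directly (as the content lambda above does) or pass the processor to add_xpath. A minimal sketch of the processor form (import path per the itemloaders package):

from itemloaders.processors import MapCompose, Join

loader1.add_xpath(
    'content',
    '//div[@id="container"]//div[@class="entry-content"]//p//text()',
    MapCompose(lambda s: s.strip()),  # strip each extracted text node
    Join(' '),                        # then join them with single spaces
)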
def parse_lot(self, response):
    l = ItemLoader(item=LarsenDelpetersonItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath('LotNum', '//h1/text()')
    l.add_xpath('LotDescription', '//h2[contains(text(), "Item Details:")]/following-sibling::p[1]/text()[1]')
    address = response.xpath('//b[contains(text(), "Item Location:")]/following-sibling::text()[1]').extract_first()
    city, region = address.split(',')
    l.add_value('City', city)
    l.add_value('State', region)
    l.add_value('ZIP', region)
    l.add_xpath('Contact', '//b[contains(text(), "Equipment Contact:")]/following-sibling::text()[1]')
    l.add_xpath('Phone', '//b[contains(text(), "Phone Number:")]/following-sibling::text()[1]')
    l.add_xpath('Category', '//strong[contains(text(), "Category:")]/following-sibling::text()[1]')
    l.add_xpath('ClosesOn', '//strong[contains(text(), "Closes On")]/following-sibling::text()[1]')
    l.add_xpath('image_urls', '//div[@id="gallery"]//a/@href')
    l.add_value('folder_name', self.auction_id)
    yield l.load_item()
def parse_article_child_page(self, response): """Extracts and yields article item & author-article relation item from article child page""" self.logger.info('Parsing article child page {}'.format(response.url)) article_loader = ItemLoader(item=ArticleItem(), response=response) article_loader.add_value('url', response.url) article_loader.add_css('title', '#woe #hero h2::text') article_loader.add_css('pub_date', '#woe #hero .authwrp .sdate::text') article_loader.add_css('text', '#woe .postbody *::text') article_loader.add_css( 'tags', "head meta[property='article:tag'] ::attr(content)") article_item = article_loader.load_item() article_author_loader = ItemLoader(item=ArticleAuthorItem(), response=response) article_author_loader.add_css('authors', '.goauthor::attr(href)') article_author_loader.add_value('article_url', article_item['url']) article_author_item = article_author_loader.load_item() yield article_item yield article_author_item
def read_news(self, response):
    print('simple_spider: read_news')
    titulo = response.xpath(self.tituloPath).get()
    cuerpo = response.xpath(self.cuerpoPath).getall()
    fecha_publicacion = response.xpath(self.fechaPath).get()
    # The date should have the format YYYY-MM-DDTHH:MM:SS
    fecha_publicacion = self.format_fecha(fecha_publicacion)
    if datetime.strptime(fecha_publicacion, '%Y-%m-%dT%H:%M:%S') < self.date_pbl_min:
        self.date_pbl_min = datetime.strptime(fecha_publicacion, '%Y-%m-%dT%H:%M:%S')
    news = ItemLoader(item=News())
    news.add_value('titulo', titulo)
    news.add_value('cuerpo', cuerpo)
    news.add_value('fecha_publicacion', fecha_publicacion)
    news.add_value('url', response.url)
    news.add_value('diario', self.name)
    news.add_value('page', self.current_page)
    return news.load_item()
def parse_definition(self, response: HtmlResponse):
    loader = ItemLoader(item=DictionaryItem(), response=response)
    loader.add_xpath('aword', "//h1/text() | //h1/span/text()")
    loader.add_xpath('definition', "//div[@id='medical-entry-1']/div[@class='vg']//span[@class='dtText']/em[@class='mw_t_it']/text() | //div[@id='medical-entry-1']/div[@class='vg']//span[@class='dtText']/text()")
    loader.add_value('link', response.url)
    yield loader.load_item()
def parse_detail(self, response):
    # handle the question page and extract the question item from it
    match_obj = re.match('(.*zhihu.com/question/(\d+))(/|$).*', response.url)
    question_id = int(match_obj.group(2))
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('content', '.QuestionHeader-detail')  # .QuestionHeader-detail span.RichText::text
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button.Button--plain::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-itemInner strong.NumberBoard-itemValue::attr("title")')
    item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_urls.format(question_id, 3, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
def parse(self, response):
    filename = response.url.split("/")[-2]
    with open(filename, 'wb') as f:
        f.write(response.body)
    print response.css('title::text').extract()
    # for quote in response.css('div.quote'):
    #     yield {
    #         'text': ''.split(quote.css('span.text::text').extract_first()),
    #         'author': quote.css('small.author::text').extract_first(),
    #         'tags': quote.css('div.tags a.tag::text').extract(),
    #     }
    l = ItemLoader(item=Product(), response=response)
    l.add_xpath('name', '//div[@class="product_name"]')
    l.add_xpath('name', '//div[@class="product_title"]')
    l.add_xpath('price', '//p[@id="price"]')
    # l.add_css('stock', 'p#stock')
    l.add_value('last_updated', 'today')  # you can also use literal values
    print l.load_item()
    return l.load_item()
def parse_post(self, response):
    date = response.xpath('//div[@class="section simple Component-StandardContent "]/p[position()<4]//text()'
                          '|//div[@class="section simple Component-StandardContent "]/span[position()<2]//text()').getall()
    date = re.findall(r'\b(?:\w+\s\d+\s)?\w+\S+(?:\s\d+(?:th)?)?\,\s\d+\S+', ' '.join(date))
    if not date:
        date = "Date is not published"
    title = response.xpath('//div[@class="section simple Component-StandardContent "]/strong/text()'
                           '|//div[@class="section simple Component-StandardContent "]/p/strong/text()').get()
    content = response.xpath('//div[@class="section simple Component-StandardContent "]//text()[not (ancestor::strong)]').getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))
    item = ItemLoader(item=WwealthonebankofcanadaItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_question(self, response):
    if 'QuestionHeader-title' in response.text:
        match_obj = re.match('(.*www.zhihu.com/question/(\d+))(/|$).*', response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
            question_item = item_loader.load_item()
    else:
        # item extraction for the legacy page layout
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
            question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 2), headers=self.header, callback=self.parse_answer)
    yield question_item
def parse(self, response):
    # change the code here
    articles = response.xpath("//div[@data-article-body]")
    for article in articles:
        info = article.css(".css-jy1umg").xpath(".//text()").getall()
        article_info = ''.join(info)
        article_topics = article.css("div.css-0")
        topics = []
        for topic in article_topics[1:]:
            # content_loader = ItemLoader(item=HealthlineContentItem(), selector=topic)
            # content_loader.add_css("topic_name", "a::attr(name)")
            topic_data = topic.xpath(".//text()").getall()
            topic_data = ' '.join(topic_data[1:])
            # content_loader.add_value("topic_data", topic_data)
            topics.append(topic_data)
        content = ' '.join(topics)
        loader = ItemLoader(item=HealthlineArticleItem(), selector=article)
        loader.add_css("title", "h1::text")
        loader.add_value("url", response.meta.get('url'))
        loader.add_value("article_info", article_info)
        loader.add_value("content", content)
        os.remove(response.meta.get('temp_file'))
        yield loader.load_item()
def parse_article(self, response):
    if 'pdf' in response.url:
        return
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h1/text()').get()
    if title:
        title = title.strip()
    date = response.xpath('//span[@class="date"]/text()').get()
    if date:
        date = date.strip()
    content = response.xpath('//div[@class="text__inner"]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content[2:]).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_article_child_page(self, response):
    selector = response.css('article #postcontent')
    article_loader = ItemLoader(item=ArticleItem(), selector=selector)
    article_loader.add_value('url', response.url)
    article_loader.add_css('title', 'h1::text')
    article_loader.add_css('pub_date', 'meta[itemprop=datePublished]::attr(content)')
    article_loader.add_css('text', '#mypost *::text')
    article_loader.add_css('tags', 'article #postcontent a.tag.secondary::text')
    article_item = article_loader.load_item()
    article_author_loader = ItemLoader(item=ArticleAuthorItem(), selector=selector)
    article_author_loader.add_css('authors', 'span[itemprop=author] a.goauthor::attr(href)')
    article_author_loader.add_value('article_url', article_item['url'])
    article_author_item = article_author_loader.load_item()
    yield article_item
    yield article_author_item
print Exception, ":", e
if Some_Info:
    for key in Some_Info.keys():
        item.fields[key] = Field()
        l.add_value(key, Some_Info[key])
    yield l.load_item()
else:
    # The ItemLoader add_* methods don't fit here: we first have to locate every
    # block on the page that contains a target item, then pull a single item out
    # of each block. add_xpath extracts everything in one pass and cannot be
    # subdivided per block, so add_value is used instead.
    my_Final_Xpath = Final_Xpath.copy()
    All_Xpath = my_Final_Xpath['All_Xpath'].copy()
    del my_Final_Xpath['All_Xpath']
    all_xpath = All_Xpath['all_xpath']
    del All_Xpath['all_xpath']
    for i in response.xpath(all_xpath[0]):
        item = NettvSpiderItem()
        l = ItemLoader(item=item, response=response)
        # extract the data described in All_Xpath
        for key in All_Xpath.keys():
            item.fields[key] = Field()
            try:
                # when an add_* method finds no value, ItemLoader silently drops the
                # field; we don't want that, so the field is set to "" instead
                if map(lambda x: 1 if x else 0, map(lambda x: response.xpath(x).extract() if x != "/" else "", Final_Xpath[key])) in [[0, 0], [0]]:
                    map(lambda x: l.add_value(key, ""), ["just_one"])
                else:
                    map(lambda x: l.add_value(key, i.xpath(x).extract()) if i.xpath(x).extract() != [] else "", Final_Xpath[key])
            except Exception, e:
                print Exception, ",", e
        # extract the data outside All_Xpath; sites like Douban especially need this
        # case, where the data below is fetched repeatedly while All_Xpath holds the
        # genuinely single-record data
        for key in my_Final_Xpath.keys():
            item.fields[key] = Field()
            try:
def product_parse(self, response: HtmlResponse):
    loader = ItemLoader(item=ShopparserItem(), response=response)
    loader.add_value('_id', response.url, re='-(\d+)\/$')
    loader.add_xpath('name', "//h1/text()")
    loader.add_value('link', response.url)
    loader.add_xpath('price', "//span[@slot='price']/text()")
    loader.add_xpath('params', "//div[@class='def-list__group']/dt/text()")
    loader.add_xpath('params', "//div[@class='def-list__group']/dd//text()")
    loader.add_xpath('photos', "//img[@alt='product image']/@src")
    yield loader.load_item()
def parse_sub_item_detail(self, response):
    l = ItemLoader(item=AmzGenericCrawlerItem(), response=response)
    try:
        supplier = response.xpath('//*[@id="bylineInfo"]/text()').extract_first()
        if supplier is None:
            supplier = response.xpath('//*[@id="brand"]/text()').extract_first().strip()
        if supplier is None:
            supplier = "no info"
    except AttributeError:
        supplier = ""
    try:
        product_name = response.xpath('//*[@id="productTitle"]/text()').extract_first().strip(' \n')
    except AttributeError:
        product_name = ""
    try:
        availability = response.xpath('//*[@id="availability"]/span/text()').extract_first().strip(' \n')
    except AttributeError:
        try:
            availability = response.xpath('//*[@id="availability"]/text()').extract_first().strip(' \n')
        except AttributeError:
            availability = "no info"
    try:
        review = response.xpath('//*[@id="acrPopover"]/span[1]/a/i[1]/span/text()').extract_first().split(" ", 1)[0]
    except (AttributeError, IndexError):
        review = "no review"
    try:
        rank = response.xpath('//th[contains(text(),"Best Sellers Rank")]/following-sibling::td/span/span[1]/text()').extract_first().split(" ", 1)[0].strip('#').replace(',', '')
    except (AttributeError, IndexError):
        try:
            rank = response.xpath('//*[@id="SalesRank"]/text()').extract()[1].strip().split(" ", 1)[0].strip('#').replace(',', '')
        except (AttributeError, IndexError):
            rank = "no rank"
    try:
        category = response.xpath('//th[contains(text(),"Best Sellers Rank")]/following-sibling::td/span/span[1]/text()').extract_first().split(" ", 1)[1].rsplit(" ", 1)[0].split(" ", 1)[1]
    except (AttributeError, IndexError):
        try:
            category = response.xpath('//*[@id="SalesRank"]/text()').extract()[1].strip().split(" ", 1)[1].rsplit(" ", 1)[0].split(" ", 1)[1]
        except (AttributeError, IndexError):
            category = "no info"
    item_url = response.request.url
    l.add_value('supplier', supplier)
    l.add_value('product_name', product_name)
    l.add_value('availability', availability)
    l.add_value('review', review)
    l.add_value('rank', rank)
    l.add_value('category', category)
    l.add_value('item_url', item_url)
    return l.load_item()
def parse_content(self, response):
    print(response.url)

    def deal_img_urls(img_url_list):
        # for one_img_url in img_url_list:
        #     print(one_img_url)
        return img_url_list

    def deal_publish_time(publish_time_raw_list):
        try:
            year = str(publish_time_raw_list[0])
            month = str(publish_time_raw_list[1]) if len(str(publish_time_raw_list[1])) == 2 else '0' + str(publish_time_raw_list[1])
            days = str(publish_time_raw_list[2]) if len(str(publish_time_raw_list[2])) == 2 else '0' + str(publish_time_raw_list[2])
            hours = str(publish_time_raw_list[3])
            minutes = str(publish_time_raw_list[4])
            publish_time = year + '-' + month + '-' + days + ' ' + hours + ':' + minutes + ':00'
            return publish_time
        except Exception as e:
            print(e)

    def deal_reply_nodes(response_url):
        # The comments need a separate request, so only the link is stored here;
        # the matching reply_nodes are generated in a later processing step. ------mark!
        reply_id = response_url.split('/')[-1].split('?')[0]
        reply_url = 'http://www.ftchinese.com/index.php/c/newcomment/' + reply_id + '?v=1'
        return reply_url

    def deal_publish_user(publisher_list):
        publish_user_list = []
        for one_user in publisher_list:
            publish_user_list.append(one_user.strip())
        return publish_user_list

    if not response.xpath('//span[@class="story-time"]/text()').re('(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})'):
        return  # is this how to check that the content is empty?
    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//h1[@class="story-headline"]/text()', TakeFirst())
    # loader1.add_xpath('abstract', '//div[@class="story-lead"]/text()')  # there is no 'abstract' field
    loader1.add_value('id', response.url.split('/')[-1].split('?')[0])
    loader1.add_value('img_urls',
                      response.xpath('//div[@class="story-container"]//img/@src|//div[@class="story-container"]//figure/@data-url').extract(),
                      deal_img_urls)
    loader1.add_xpath('content', '//div[@class="story-body"]//p//text()', Join())
    loader1.add_value('publish_time',
                      response.xpath('//span[@class="story-time"]/text()').re('(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})'),
                      deal_publish_time)
    loader1.add_xpath('publish_user', '//span[@class="story-author"]/a/text()', deal_publish_user)
    loader1.add_value('reply_count',
                      response.xpath('//div[@id="allcomments"]/div[@class="commentcontainer"]'),
                      lambda x: len(x))
    # loader1.add_value('reply_nodes', response.url, deal_reply_nodes)
    item1 = loader1.load_item()
    return item1
def parse(self, response):
    response.selector.remove_namespaces()
    document = response.xpath('//document')
    manu_products = document.xpath('.//subject/manufacturedProduct')
    spl_il = ItemLoader(item=SplItem(), selector=document)
    spl_il.add_xpath('id', './id/@root')
    spl_il.add_xpath('set_id', './setId/@root')
    spl_il.add_xpath('labeler', './/representedOrganization/name/text()')
    for product in manu_products:
        product_il = ItemLoader(item=ProductItem(), selector=product)
        product_il.add_xpath('code', './manufacturedProduct/code/@code')
        product_il.add_xpath('name', './manufacturedProduct/name/text()')
        product_il.add_xpath('schedule', './/policy[@classCode="DEADrugSchedule"]/code/@displayName')
        inactive_ingredients = product.xpath('.//ingredient[starts-with(@classCode, "IACT")]')
        for inactive_ingredient in inactive_ingredients:
            inactive_il = ItemLoader(item=InactiveIngredient(), selector=inactive_ingredient)
            inactive_il.add_xpath('name', './ingredientSubstance/name/text()')
            inactive_il.add_xpath('unii', './ingredientSubstance/code/@code')
            product_il.add_value('inactive_ingredients', inactive_il.load_item())
        for package in product.xpath('.//containerPackagedProduct'):
            package_il = ItemLoader(item=PackageItem(), selector=package)
            package_il.add_xpath('code', './code/@code')
            if not package_il.load_item():
                continue
            product_il.add_value('packages', package_il.load_item())
        spl_il.add_value('products', product_il.load_item())
    return spl_il.load_item()
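The snippet above nests loaders three levels deep: each package item is accumulated into its product's packages field and each product into the document's products field, because add_value appends an already-loaded sub-item to the field's value list. A minimal sketch of item definitions consistent with the field names used above (the real ones may declare processors):

from scrapy import Item, Field

class PackageItem(Item):
    code = Field()

class InactiveIngredient(Item):
    name = Field()
    unii = Field()

class ProductItem(Item):
    code = Field()
    name = Field()
    schedule = Field()
    inactive_ingredients = Field()
    packages = Field()

class SplItem(Item):
    id = Field()
    set_id = Field()
    labeler = Field()
    products = Field()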