def parse(self, response):
    xxs = XmlXPathSelector(response)
    for product in xxs.select('//product'):
        category = product.select('./Category/text()').extract()
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('identifier', './product-id/text()')
        loader.add_xpath('sku', './product-id/text()')
        loader.add_xpath('url', './product-url/text()')
        loader.add_xpath('name', './product-name/text()')
        loader.add_xpath('brand', './brand/text()')
        loader.add_value(
            'price',
            extract_price_eu(' '.join(
                product.select('./price/text()').extract())))
        if category:
            loader.add_value('category', category[0].split('/')[-1].strip())
        loader.add_xpath('image_url', './image-url/text()')
        loader.add_xpath('stock', './stock/text()')
        # Free shipping above 499, flat rate otherwise
        if loader.get_output_value('price') > 499:
            loader.add_value('shipping_cost', '0')
        else:
            loader.add_value('shipping_cost', '25')
        yield loader.load_item()

def parse(self, response):
    xxs = XmlXPathSelector(response)
    for sel in xxs.select('//channel/item'):
        loader = ProductLoader(item=Product(), response=response)
        # URL
        tmp = sel.select('link/text()').extract()
        if tmp:
            loader.add_value('url', tmp[0])
        # ID
        tmp = sel.select('*[name()="g:id"]/text()').extract()
        if tmp:
            loader.add_value('identifier', tmp[0])
        # SKU
        tmp = sel.select('*[name()="g:id"]/text()').extract()
        if tmp:
            loader.add_value('sku', tmp[0])
        # Name
        tmp = sel.select('title/text()').extract()
        if tmp:
            loader.add_value('name', tmp[0])
        # Price: prefer the sale price, and divide out the 20% VAT
        tmp = sel.select('*[name()="g:sale_price"]/text()').extract()
        if not tmp:
            tmp = sel.select('*[name()="g:price"]/text()').extract()
        if tmp:
            price = round(extract_price(tmp[0]) / Decimal('1.20'), 2)
            loader.add_value('price', price)
        # Image URL
        tmp = sel.select('*[name()="g:image_link"]/text()').extract()
        if tmp:
            loader.add_value('image_url', tmp[0])
        # Brand
        tmp = sel.select('*[name()="g:brand"]/text()').extract()
        if tmp and tmp[0] != 'Alliance':
            loader.add_value('brand', tmp[0])
        # Category: second level of the "A > B > C" product type,
        # falling back to the whole string
        tmp = sel.select('*[name()="g:product_type"]/text()').extract()
        if tmp:
            try:
                loader.add_value('category', tmp[0].split('>')[1].strip())
            except IndexError:
                loader.add_value('category', tmp[0].strip())
        # Shipping cost for low-value orders
        price = loader.load_item()['price']
        if price and price < 50.00:
            loader.add_value('shipping_cost', 5.90)
        # Stock
        tmp = sel.select('*[name()="g:availability"]/text()').extract()
        if tmp and tmp[0] == 'in stock':
            loader.add_value('stock', 1)
        else:
            loader.add_value('stock', 0)
        yield loader.load_item()

def xmliter_lxml(obj, nodename):
    from lxml import etree
    reader = _StreamReader(obj)
    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]

def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.register_namespace('soapenv',
                           'http://schemas.xmlsoap.org/soap/envelope/')
    xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
    xxs.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    xxs.register_namespace(
        'CurrentsAndMetadata',
        'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl')
    timelist = xxs.select(
        '//CurrentsAndMetadata:data/CurrentsAndMetadata:item'
        '/CurrentsAndMetadata:timeStamp/text()').extract()
    cspdlist = xxs.select(
        '//CurrentsAndMetadata:data/CurrentsAndMetadata:item'
        '/CurrentsAndMetadata:CS/text()').extract()
    cdirlist = xxs.select(
        '//CurrentsAndMetadata:data/CurrentsAndMetadata:item'
        '/CurrentsAndMetadata:CD/text()').extract()
    print len(timelist)
    for i in range(0, len(cdirlist)):
        sql_str = self.SQL_INSERT_STUB.format(
            self.get_current_station().lower(), str(timelist[i])[0:-2],
            str(cspdlist[i]), str(cdirlist[i]), 'datafactory_currentdata')
        # The feed timestamps are naive; interpret them as UTC.
        d_time_unware = datetime.datetime.strptime(
            str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
        d_time = pytz.utc.localize(d_time_unware)
        if self.needStore(d_time):
            self.db.query(sql_str)
            self.db.commit()
    if timelist:
        sql_str = ("INSERT INTO {0} (sid, stime, etime) "
                   "VALUES (\"{1}\", \"{2}\", \"{3}\")").format(
                       DB_SETTINGS['DATABASE_TIME_TABLE'],
                       self.get_current_station(),
                       self.startDate.astimezone(pytz.utc).strftime(
                           "%Y-%m-%d %H:%M:%S"),
                       self.endDate.astimezone(pytz.utc).strftime(
                           "%Y-%m-%d %H:%M:%S"))
        self.db.query(sql_str)
        self.db.commit()
    self.station_slot = self.station_slot + 1
    if self.station_slot < len(self.start_urls):
        yield self.start_urls[self.station_slot]

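# Hedged sketch (not from the original source): the naive-to-aware timestamp
# handling used above, shown in isolation. pytz.utc.localize() attaches the
# UTC zone to a naive datetime; a further astimezone(pytz.utc) would be a
# no-op. Values below are invented for illustration.
def demo_localize_utc():
    import datetime
    import pytz
    naive = datetime.datetime.strptime('2013-05-01 12:30:00',
                                       '%Y-%m-%d %H:%M:%S')
    aware = pytz.utc.localize(naive)
    return aware.astimezone(pytz.utc)  # same instant, tzinfo=UTC
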
def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = XmlXPathSelector(text=nodetext)
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.select(selxpath)[0]

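# Hedged usage sketch (not from the original source) for the xmliter_lxml()
# variants above, run on an invented feed body. The namespace-aware variant
# registers the caller's namespace under a fixed "x" prefix before selecting.
def demo_xmliter_lxml():
    body = ('<f:feed xmlns:f="http://example.com/ns">'
            '<f:item>a</f:item><f:item>b</f:item></f:feed>')
    nodes = xmliter_lxml(body, 'item', namespace='http://example.com/ns')
    return [node.select('text()').extract() for node in nodes]
    # -> [[u'a'], [u'b']]
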
def parse(self, response):
    xxs = XmlXPathSelector(response)
    name = xxs.select('//name').extract()
    if self.task_id is not None:
        self.log('Processing item %s' % self.task_id, log.INFO)
        self.alert_context = 'task_id=%s' % self.task_id
        for item in self.process_item(self.bot_task_params(self.task_id)):
            yield item
    else:
        for item in self.process_items():
            yield item

def parse(self, response): """ We define a custom parser here because we need to get the link from the feed item and then follow it to get the recipe data. Getting the data from <content:encoded> seems overly complex, as we would have to decode all the encoded characters and then build a DOM from that. """ xxs = XmlXPathSelector(response) links = xxs.select( "//item/*[local-name()='origLink']/text()").extract() # self.parse_item comes from OnehundredonecookbooksMixin return [Request(x, callback=self.parse_item) for x in links]
def parse(self, response): x = XmlXPathSelector(response) zp_nodes = x.xpath("//stats") source = response.meta.get("source", "") for zp_node in zp_nodes: name = zp_node.xpath("////stats/stat/name/text()").extract() xy = zp_node.xpath("//stats/stat/xy/text()").extract() for i in range(len(name)): gz_item = GJZDItem() gz_item["name"] = name[i] gz_item["source"] = source gz_item["lng"] = xy[i].split(",")[0] gz_item["lat"] = xy[i].split(",")[1] yield gz_item
def parsePart(self, response):
    item = response.meta['item']
    xxs = XmlXPathSelector(response)
    if len(xxs.select("//ERRORSEGMENT")) == 0:
        part_num = response.meta['part_num']
        end_range = response.meta['end_range']
        part_prefix = response.meta['part_prefix']
        item['parts'].append(self.part_format % (part_prefix, part_num))
        if part_num < end_range:
            yield self.makePartRequest(part_prefix, part_num + 1, item,
                                       end_range)
        else:
            yield item
    else:
        yield item

def detect_feed(self, response):
    """Just detects the feed in the links and returns an Item.

    TODO: tweak the feedparser lib to reuse the headers/body from this
    response instead of downloading the feed page a second time.
    """
    xxs = XmlXPathSelector(response)
    if any(xxs.select("/%s" % feed_type)
           for feed_type in ['rss', 'feed', 'xml', 'rdf']):
        try:
            rss_feed = feedparser.parse(response.url)
            return self.extract_feed(rss_feed)
        except Exception:
            raise Exception('Exception while parsing/extracting the feed')
    return None

def parse_rss(self, response):
    item = response.request.meta['item']
    if response.status != 500:
        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        item['date'] = xxs.select('.//channel/date/text()').extract()
        description = xxs.select('.//channel/description/text()').extract()
        if (len(item.get('description', '')) < 10) and description:
            item['description'] = ''.join(description).strip()
    del item['subpage_urls']
    return item

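# Hedged sketch (not from the original source): what the remove_namespaces()
# call above buys. With a default namespace on the document, plain XPath
# matches nothing until the namespaces are stripped. Invented body.
def demo_remove_namespaces():
    from scrapy.selector import XmlXPathSelector
    body = ('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"'
            ' xmlns="http://purl.org/rss/1.0/">'
            '<channel><description>News feed</description></channel>'
            '</rdf:RDF>')
    xxs = XmlXPathSelector(text=body)
    xxs.select('//channel/description/text()').extract()  # [] - namespaced
    xxs.remove_namespaces()
    return xxs.select('//channel/description/text()').extract()
    # -> [u'News feed']
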
def parse(self, response):
    xxs = XmlXPathSelector(response)
    hxs = HtmlXPathSelector(response)
    links = xxs.select('//link/text()').extract()
    log.msg('Link length: %s' % len(links), level=log.ERROR)
    if len(links) <= 0:
        log.msg('no links found, using regular parser', level=log.ERROR)
        links = hxs.select('//a/@href').extract()
    msg = 'Links: %s' % links
    log.msg(msg, level=log.ERROR)
    return [Request(x, callback=self.parse_item) for x in links]

def parse(self, response):
    base_url = get_base_url(response)
    xxs = XmlXPathSelector(response)
    xxs.register_namespace("g", "http://base.google.com/ns/1.0")
    products = xxs.select('//channel/item')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        loader.add_xpath('image_url', 'g:image_link/text()')
        loader.add_xpath('price', 'g:price/text()')
        loader.add_xpath('brand', 'g:brand/text()')
        # NB: category is populated from g:brand in this feed
        loader.add_xpath('category', 'g:brand/text()')
        loader.add_xpath('sku', 'g:id/text()')
        loader.add_xpath('identifier', 'g:id/text()')
        yield loader.load_item()

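# Hedged sketch (not from the original source): the register_namespace()
# pattern several of the Google Base feed parsers here rely on, shown on an
# invented feed body with the old scrapy XmlXPathSelector API.
def demo_register_namespace():
    from scrapy.selector import XmlXPathSelector
    feed = ('<rss xmlns:g="http://base.google.com/ns/1.0"><channel><item>'
            '<g:id>SKU-1</g:id><title>Widget</title></item></channel></rss>')
    xxs = XmlXPathSelector(text=feed)
    xxs.register_namespace('g', 'http://base.google.com/ns/1.0')
    return xxs.select('//channel/item/g:id/text()').extract()  # [u'SKU-1']
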
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    products = xxs.select('//item')
    for product in products:
        mpn = product.xpath('mpn/text()')
        if mpn:
            mpn = mpn[0].extract().upper().strip()
        else:
            mpn = None
        row = self.monitored_products.get(mpn) if mpn else None
        if row is None or row['Discontinued'].lower().strip() == 'yes':
            continue
        loader = ProductLoader(selector=product, item=Product())
        loader.add_xpath('identifier', 'id/text()')
        loader.add_xpath('sku', 'mpn/text()')
        loader.add_xpath('brand', 'brand/text()')
        loader.add_xpath('image_url', 'image_link/text()')
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        price = product.select('sale_price/text()').extract()
        if not price:
            price = product.select('price/text()').extract()
        loader.add_value('price', extract_price(price[0]))
        categories = product.select(
            'product_type/text()').extract()[-1].split('>')
        categories = [c.strip() for c in categories]
        loader.add_value('category', categories)
        shipping_cost = product.select('shipping/price/text()').extract()
        shipping_cost = extract_price(shipping_cost[0]) if shipping_cost else ''
        loader.add_value('shipping_cost', shipping_cost)
        in_stock = product.select(
            'availability[contains(text(), "in stock")]').extract()
        if not in_stock:
            loader.add_value('price', 0)
        item = loader.load_item()
        item['metadata'] = RHSMeta()
        item['metadata']['cost_price'] = row['Cost Price']
        yield item

def parse(self, response): x = XmlXPathSelector(response) zp_nodes = x.xpath("//lines") count = 0 for zp_node in zp_nodes: road = zp_node.xpath("//lines/line/name/text()").extract() stats = zp_node.xpath("//lines/line/stats/text()").extract() for i in range(len(road)): s = stats[i].split(";") for j in range(len(s)): count += 1 zd_item = ZDCXItem() zd_item["road"] = road[i] zd_item["station_name"] = s[j] zd_item["station_num"] = count yield zd_item count = 0
def parse(self, response):
    item = ArxivOrgItem()
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    # Serialize the selector list to a string first so the abstract URLs
    # can be pulled out with a regex.
    xml_data = str(xxs.xpath('//link'))
    url_list = re.findall(r'http://arxiv.org/abs/\d+\.\d+', xml_data)
    for url in url_list:
        logging.log(
            logging.INFO,
            f'**************** crawling link: {url} ***************** ')
        yield Request(url=url,
                      callback=self.parse_single_page,
                      meta={'item': item},
                      dont_filter=True)

def populate_vars(self, response=None, request=None, spider=None):
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['xxs'] = XmlXPathSelector(response) \
        if isinstance(response, XmlResponse) else None
    self.vars['hxs'] = HtmlXPathSelector(response) \
        if isinstance(response, HtmlResponse) else None
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.print_help()

def parse(self, response):
    xxs = XmlXPathSelector(response)
    base_url = get_base_url(response)
    xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
    products = xxs.select('//f:entry')
    for product in products:
        product.register_namespace("g", "http://base.google.com/ns/1.0")
        product.register_namespace("p", "http://www.w3.org/2005/Atom")
        product_loader = ProductLoader(item=Product(), selector=product)
        name = product.select('./p:title/text()').extract()[0]
        if 'B-STOCK' in name.upper():
            continue
        product_loader.add_value('name', name)
        url = product.select('./p:link/@href').extract()[0]
        product_loader.add_value('url', urljoin_rfc(base_url, url))
        image_url = product.select('./g:image_link/text()').extract()
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        category = product.select('./g:product_type/text()').extract()
        if category:
            product_loader.add_value('category', category[0])
        brand = product.select('./g:brand/text()').extract()
        if brand:
            product_loader.add_value('brand', brand[0])
        price = product.select('./g:sale_price/text()').extract()
        if price:
            product_loader.add_value('price', extract_price(price[0]))
        else:
            price = product.select('./g:price/text()').extract()
            product_loader.add_value('price', extract_price(price[0]))
        # sku = product.select('./g:gtin/text()').extract()
        # if sku:
        #     product_loader.add_value('sku', sku[0])
        identifier = product.select('./g:id/text()').extract()[0]
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        shipping_cost = product.select(
            './g:shipping/g:price/text()').extract()
        if shipping_cost:
            product_loader.add_value('shipping_cost',
                                     extract_price(shipping_cost[0]))
        product = product_loader.load_item()
        yield product

def get_products(self, meta, response, colors, colors_ids):
    hxs = XmlXPathSelector(response)
    names, ids = self.get_names(meta['base_name'], meta['product_id'],
                                meta['current_data'], colors, colors_ids)
    for i, name in enumerate(names):
        p = ProductLoader(item=Product(), response=response)
        p.add_value('identifier', ids[i])
        p.add_value('name', name)
        p.add_value('brand', meta['brand'])
        p.add_value('url', meta['url'])
        p.add_value('image_url', meta['image_url'])
        price = hxs.select('//cmd[@t="discounted_price"]/text()').extract()
        if price:
            price = price[0].replace('.', '').replace(',', '.')
            price = extract_price(price)
        if not price or price == Decimal(1):
            if not price:
                self.log('Price not found %s' % meta['url'])
            else:
                self.log('Price is one %s' % meta['url'])
            if not self.retries.get(meta['url']) \
                    or self.retries.get(meta['url']) < 3:
                self.log('Retrying %s' % meta['url'])
                self.retries[meta['url']] = self.retries.get(
                    meta['url'], 0) + 1
                retry_url = meta['url']
                yield Request(retry_url,
                              meta={
                                  'category': response.meta.get('category', ''),
                                  'cookiejar': retry_url + str(
                                      self.retries.get(meta['url']))
                              },
                              callback=self.parse_product,
                              dont_filter=True)
            else:
                self.log('Max retries reached %s' % meta['url'])
            return
        p.add_value('price', price)
        p.add_value('shipping_cost', '0')
        p.add_value('category', response.meta.get('category'))
        yield p.load_item()

def parse(self, response):
    if self.scraper.content_type == 'H':
        xs = HtmlXPathSelector(response)
    else:
        xs = XmlXPathSelector(response)
    base_elem = self.scraper.get_base_elem()
    url_elem = self.scraper.get_detail_page_url_elem()
    base_objects = xs.select(base_elem.x_path)
    if len(base_objects) == 0:
        self.log("No base objects found!", log.ERROR)
    if self.conf['MAX_ITEMS_READ']:
        items_left = min(
            len(base_objects),
            self.conf['MAX_ITEMS_READ'] - self.items_read_count)
        base_objects = base_objects[0:items_left]
    for obj in base_objects:
        item_num = self.items_read_count + 1
        self.log("Starting to crawl item %s." % str(item_num), log.INFO)
        item = self.parse_item(response, obj)
        url_name = url_elem.scraped_obj_attr.name
        if item and url_name in item:
            url = item[url_name]
            cnt = self.scraped_obj_class.objects.filter(
                url=item[url_name]).count()
            cnt1 = self.scraper.get_standard_update_elems_from_detail_page(
            ).count()
            cnt2 = self.scraper.get_from_detail_page_scrape_elems().count()
            # Mark item as DOUBLE (already scraped) item
            if cnt > 0:
                item[url_name] = 'DOUBLE' + item[url_name]
            # Yield directly for a DOUBLE item with no standard update
            # elements on the detail page, or when no attributes at all are
            # scraped from the detail page; otherwise follow the detail URL.
            if (cnt > 0 and cnt1 == 0) or cnt2 == 0:
                yield item
            else:
                yield Request(url, callback=self.parse_item,
                              meta={'item': item})
        else:
            self.log("Detail page url elem could not be read!", log.ERROR)

def parse(self, response):
    xxs = XmlXPathSelector(response)
    for productxs in xxs.select(
            '//product[attribute_set/text()!="spares-accessories"]'):
        loader = ProductLoader(item=Product(), selector=productxs)
        loader.add_xpath('sku', './product_id/text()')
        loader.add_xpath('identifier', './product_id/text()')
        loader.add_xpath('price', './product_price/text()')
        loader.add_xpath('name', './product_name/text()')
        loader.add_xpath('url', './product_url/text()')
        loader.add_xpath('category', './attribute_set/text()')
        loader.add_xpath('brand', './manufacturer/text()')
        brand = loader.get_output_value('brand').strip().upper()
        if brand in self.ignore_brands:
            log.msg('Ignoring product %s because of brand %s' %
                    (loader.get_output_value('identifier'), brand))
            continue
        loader.add_value('stock', '1')
        item = loader.load_item()
        item['identifier'] = item['identifier'].upper()
        cost_price = productxs.select('./cost/text()').extract()
        metadata = CSCateringMeta()
        cost_price = cost_price[0].strip() if cost_price else '0.00'
        metadata['cost_price'] = cost_price
        item['metadata'] = metadata
        category = loader.get_output_value('category').strip().lower()
        if category in ignore_categories and not self.has_sku(
                item.get('sku', '')):
            log.msg('Ignoring product %s because of category %s' %
                    (loader.get_output_value('identifier'), category))
            continue
        yield Request(item['url'], callback=self.parse_img,
                      meta={'item': item})

def parse(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured(
            'You must define parse_node method in order to scrape this '
            'XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = HtmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)

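# Hedged sketch (not from the original source): a minimal spider that drives
# the XMLFeedSpider.parse() dispatcher above. Class names, field names, and
# the feed URL are invented; assumes the old-style imports
# `from scrapy.contrib.spiders import XMLFeedSpider` and
# `from scrapy.item import Item, Field`.
class ExampleItem(Item):
    title = Field()


class ExampleFeedSpider(XMLFeedSpider):
    name = 'example-feed'
    start_urls = ['http://example.com/feed.xml']
    iterator = 'xml'  # take the XmlXPathSelector branch in parse() above
    itertag = 'item'  # parse() collects //item nodes

    def parse_node(self, response, node):
        # Called once for every node produced by parse()/parse_nodes().
        item = ExampleItem()
        item['title'] = node.select('title/text()').extract()
        return item
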
def parse(self, response): xxs = XmlXPathSelector(response) xxs.register_namespace("g", "http://base.google.com/ns/1.0") products = xxs.select('//channel/item') for product in products: loader = ProductLoader(item=Product(), selector=product) loader.add_xpath('url', 'link/text()') loader.add_xpath('name', 'title/text()') loader.add_xpath('image_url', 'g:image_link/text()') loader.add_xpath('price', 'g:price/text()') loader.add_xpath('brand', 'g:brand/text()') categories = product.select( 'g:product_type/text()').extract()[0].split(' > ') loader.add_value('category', categories) loader.add_xpath('sku', 'g:id/text()') loader.add_xpath('identifier', 'g:id/text()') stock = product.select( 'g:availability/text()').extract()[0].lower() if stock != 'in stock': loader.add_value('stock', 0) yield loader.load_item()
def scrape_rss(response):
    log.msg("inside scrape rss")
    xxs = XmlXPathSelector(response)
    items = []
    for item_tag in xxs.select('//item'):
        items.append(ArticleItem())
        if len(item_tag.select("title")) > 0:
            items[-1]["title"] = item_tag.select("title/text()")[0].extract()
        if len(item_tag.select("pubDate")) > 0:
            items[-1]["time_published"] = [
                item_tag.select("pubDate/text()")[0].extract()
            ]
        if len(item_tag.select("link")) > 0:
            items[-1]["url"] = item_tag.select("link/text()")[0].extract()
        if len(item_tag.select("description")) > 0:
            items[-1]["summary"] = item_tag.select(
                "description/text()")[0].extract()
        request = Request(items[-1]["url"], callback=extract_author_from_link)
        request.meta["item"] = items[-1]
        yield request

def xmliter(obj, nodename): """Return a iterator of XPathSelector's over all nodes of a XML document, given tha name of the node to iterate. Useful for parsing XML feeds. obj can be: - a Response object - a unicode string - a string encoded as utf-8 """ HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S) HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S) text = body_or_str(obj) header_start = re.search(HEADER_START_RE, text) header_start = header_start.group(1).strip() if header_start else '' header_end = re_rsearch(HEADER_END_RE, text) header_end = text[header_end[1]:].strip() if header_end else '' r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL) for match in r.finditer(text): nodetext = header_start + match.group() + header_end yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
def parse(self, response):
    xxs = XmlXPathSelector(response)
    shows = xxs.select('//show')
    date_from = datetime.now()
    date_to = date_from + timedelta(days=7 * 6)
    for show in shows:
        name = show.select('./name/text()').extract()[0]
        url = show.select('./@href').extract()[0]
        show_id = url.split('/')[-1]
        show_data = SHOWS_DATA % (show_id,
                                  date_from.strftime('%Y-%m-%d'),
                                  date_to.strftime('%Y-%m-%d'))
        r = Request(
            'https://api.entstix.com/api/v1/xlive/booking/book'
            '/availability/show',
            method='POST',
            body=show_data,
            callback=self.parse_products,
            meta={'name': name, 'id': show_id})
        yield r

def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    urls = xxs.select('//loc/text()').extract()
    for url in urls:
        if 'brands-sitemap.xml' in url:
            continue
        if 'productbrand' in url:
            prod_id = re.findall(r'productbrand_(\d+)\.html', url)
            prod_id = prod_id[0] if prod_id else ''
            if prod_id:
                # Skip product IDs that have already been seen
                if prod_id in self.product_ids:
                    continue
                else:
                    self.product_ids.append(prod_id)
            yield Request(url, callback=self.parse_product,
                          meta={"dont_merge_cookies": True})
        else:
            yield Request(url, meta={"dont_merge_cookies": True})

def parse(self, response):
    xxs = XmlXPathSelector(response)
    for title in xxs.select("//item/title/text()").extract():
        log.msg(title)