def parse_one_sale(self, response):
    """Parse a single sale detail page and yield one FindnsaveSaleItem.

    Expects ``response.meta['id']`` to have been set by the requesting
    callback.  Yields nothing when the page lacks the offer wrapper div
    or an ``itemprop="name"`` heading.

    NOTE(review): a dead ``data`` row (consumed only by a commented-out
    json dump) was removed; it also read ``response.meta['th_img']``,
    ``starts``/``expiration`` and ``lg_img``, none of which are stored
    on the item.  The helper calls are kept in case they matter.
    """
    sale = f_xpath(response,
                   '//div[contains(@class, "offer-description-wrapper") '
                   ' and contains(@class, "clearfix")]')
    if not sale:
        return

    # Parsed but not currently stored on the item -- presumably the item
    # schema has no fields for them; TODO confirm against FindnsaveSaleItem.
    starts, expiration = self._parse_date(response)
    pct_off = self._parse_callout(sale)
    lg_img = self._parse_large_img(sale)  # large image URL; currently unused

    sr = f_xpath(sale, './div[@class="offer-right"]')
    name = fx_extract(sr, './h1[@itemprop="name"]/text()')
    if name is None:
        logger.debug('not crawl name in : ' + response.url)
        return

    # currency, price, regular price, price-until date
    p_c, p_p, p_r, p_u = self._parse_price(sr)
    desc = self._parse_desc(sr)
    retailer, category, brand = self._parse_retailer_category_brand(sr)

    d = FindnsaveSaleItem()
    d['area'] = 'newyork'
    d['id'] = response.meta['id']
    d['name'] = escape(name)
    d['priceCurrency'] = p_c
    d['price'] = p_p
    d['priceRegular'] = p_r
    # NOTE(review): 'priceUtilDate' looks like a typo for 'priceUntilDate',
    # but it must match the item's declared field name -- left unchanged.
    d['priceUtilDate'] = p_u
    d['priceOff'] = pct_off
    d['retailer'] = escape(retailer)
    d['category'] = escape(category)
    d['brand'] = escape(brand)
    d['desc'] = escape(desc)
    yield d

    logger.info('crawl : `' + name + '` OK')
def parse(self, response):
    """Parse a brand-listing page: yield a FindnsaveBrandItem per brand
    link, then follow pagination via ``self.brand_next_page``.
    """
    logger.info('fetch : ' + response.url)

    brands = f_xpath(response,
                     '//ul[contains(@class, "brands") '
                     ' and contains(@class, "columnize")'
                     ' and contains(@class, "clearfix")]').xpath('./li')

    for li in brands:
        link = f_xpath(li, './a')
        if not link:
            continue

        href = fx_extract(link, './@href')
        name = fx_extract(link, './text()')

        # href presumably looks like '/<area>/<nameid>/<id>/' -- skip
        # entries whose href is None (AttributeError) or does not split
        # into exactly three path segments (ValueError).  Was a bare
        # except:, which also hid real bugs.
        try:
            _section, bid, brand_id = href.strip('/').split('/')
        except (AttributeError, ValueError):
            continue

        d = FindnsaveBrandItem()
        d['id'] = brand_id
        d['name'] = escape(name)
        d['nameid'] = bid
        d['uri'] = href
        yield d

    next_url = self.brand_next_page(response)
    if next_url is None:
        return
    # dont_filter: pagination URLs may repeat across runs; bypass dedup.
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    """Parse one page of the brand index.

    Yields a FindnsaveBrandItem for every well-formed brand link on the
    page, then requests the next page if ``self.brand_next_page`` finds
    one.
    """
    logger.info('fetch : ' + response.url)

    brand_list = f_xpath(response,
                         '//ul[contains(@class, "brands") '
                         ' and contains(@class, "columnize")'
                         ' and contains(@class, "clearfix")]').xpath('./li')

    for entry in brand_list:
        anchor = f_xpath(entry, './a')
        if not anchor:
            continue

        href = fx_extract(anchor, './@href')
        name = fx_extract(anchor, './text()')

        # Expect exactly three path segments; a missing href raises
        # AttributeError, a malformed one raises ValueError on unpack.
        # The original bare `except:` masked both and everything else.
        try:
            _prefix, name_id, numeric_id = href.strip('/').split('/')
        except (AttributeError, ValueError):
            continue

        item = FindnsaveBrandItem()
        item['id'] = numeric_id
        item['name'] = escape(name)
        item['nameid'] = name_id
        item['uri'] = href
        yield item

    next_url = self.brand_next_page(response)
    if next_url is None:
        return
    # Pagination requests bypass the duplicate filter deliberately.
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)