def parse_one_sale(self, response):
    sale = f_xpath(response,
                   '//div[contains(@class, "offer-description-wrapper")'
                   ' and contains(@class, "clearfix")]')
    if not sale:
        return

    starts, expiration = self._parse_date(response)
    pct_off = self._parse_callout(sale)
    lg_img = self._parse_large_img(sale)

    sr = f_xpath(sale, './div[@class="offer-right"]')
    name = fx_extract(sr, './h1[@itemprop="name"]/text()')
    if name is None:
        logger.debug('no name crawled in : ' + response.url)
        return

    p_c, p_p, p_r, p_u = self._parse_price(sr)
    desc = self._parse_desc(sr)
    retailer, category, brand = self._parse_retailer_category_brand(sr)

    # full record of everything parsed from the page, kept for the
    # (currently disabled) raw JSON dump below
    data = [response.meta['id'], name, p_c, p_p, p_r, p_u, pct_off,
            starts, expiration, retailer, category, brand,
            response.url, response.meta['th_img'], lg_img, desc]

    d = FindnsaveSaleItem()
    d['area'] = 'newyork'
    d['id'] = response.meta['id']
    d['name'] = escape(name)
    d['priceCurrency'] = p_c
    d['price'] = p_p
    d['priceRegular'] = p_r
    d['priceUtilDate'] = p_u
    d['priceOff'] = pct_off
    d['retailer'] = escape(retailer)
    d['category'] = escape(category)
    d['brand'] = escape(brand)
    d['desc'] = escape(desc)
    yield d

    # self.jsonfile.write(json.dumps(data) + '\n')
    logger.info('crawl : `' + name + '` OK')
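# The selector helpers used throughout these spiders -- xpath, f_xpath,
# xpath_extract, fx_extract -- are imported from elsewhere in the repo.
# A minimal sketch of what they are assumed to do, as thin wrappers over
# Scrapy's Selector API:

def xpath(sel, path):
    # all nodes matching path
    return sel.xpath(path)

def f_xpath(sel, path):
    # first node matching path, or None
    nodes = sel.xpath(path)
    return nodes[0] if nodes else None

def xpath_extract(sel, path):
    # text of every matching node
    return sel.xpath(path).extract()

def fx_extract(sel, path):
    # text of the first matching node, or None
    texts = sel.xpath(path).extract()
    return texts[0] if texts else None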
def parse_one_top(self, response):
    logger.info('fetch : ' + response.url)

    img = f_xpath(response, '//div[contains(@class, "inner-main-content")]')

    meta = {}
    meta['name'] = fx_extract(img, './div/h3/text()').strip().strip('#')
    meta['img'] = fx_extract(img, './/div[@class="inner-image"]/img/@src')
    meta['key'] = meta['img'][self.prefix_len:]
    meta['from'] = fx_extract(img, './/div[@class="inner-image"]/a/@href')
    meta['desc'] = fx_extract(img, './div/p/text()')

    curr_meta = response.meta
    curr_meta['top'].append(meta)

    nexturl = self.next_top_img(response)
    if nexturl:
        yield scrapy.http.Request(url=nexturl, callback=self.parse_one_top,
                                  meta=curr_meta, dont_filter=True)
    else:
        # last page of this top: upload the collected metadata, then the images
        cli = authedclient()
        cli.upload_data(curr_meta['key'], json.dumps(curr_meta),
                        headers={'Content-Type': 'text/json'})
        logger.info('upload : ' + curr_meta['key'])

        for meta in curr_meta['top']:
            put_file_from_url(cli, meta['key'], meta['img'])
            logger.info('upload : %s from %s' % (meta['key'], meta['img']))
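# `authedclient` and `put_file_from_url` come from elsewhere in the repo.
# A minimal sketch of the latter, assuming the client exposes the same
# upload_data() call used in parse_one_top above; the 'image/jpeg' content
# type is an assumption:
import urllib2

def put_file_from_url(cli, key, url):
    # download the source image and re-upload its bytes under our own key
    body = urllib2.urlopen(url).read()
    cli.upload_data(key, body, headers={'Content-Type': 'image/jpeg'})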
def next_page(self, response):
    nexturl = f_xpath(response,
                      '//table[@class="CMheadingBar"]/tr/td[1]/div/span')
    if nexturl is None:
        return None
    return fx_extract(nexturl, './a[contains(text(), "Next")]/@href')
def _parse_date(self, response):
    date = f_xpath(response,
                   '//div[contains(@class, "offer-pdp-hd")'
                   ' and contains(@class, "clearfix")]')
    starts = ''
    expiration = ''
    if not date:
        return starts, expiration

    st = f_xpath(date, './/div[@class="offer-right"]')
    if st:
        starts = fx_extract(st, './h2/text()') or ''

    exp = f_xpath(date, './/div[contains(@class, "expiration")'
                        ' and contains(@class, "with-date")]')
    if exp:
        expiration = fx_extract(exp, './div/text()') or ''

    return starts, expiration
def next_top_img(self, response):
    urls = f_xpath(response,
                   '//div[contains(@style, "margin-bottom")]').xpath('./div')[1:]
    for url in urls:
        nxt = fx_extract(url, './a[contains(text(), "Next")]/@href')
        if nxt:
            return nxt
def _parse_callout(self, p):
    callout = f_xpath(p, './div[@class="callout"]')
    pct_off = ''
    if callout:
        pct = fx_extract(callout, './span[@class="pct"]/text()') or ''
        off = fx_extract(callout, './span[@class="off"]/text()') or ''
        pct_off = (pct + ' ' + off).strip()
    return pct_off
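# Illustrative only: for a callout rendered as
#   <div class="callout"><span class="pct">20%</span><span class="off">off</span></div>
# _parse_callout() returns '20% off'; when the div is absent it returns ''.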
def store_next_page(self, response):
    nexturl = f_xpath(response,
                      '//div[@class="pagination"]/span[@class="next"]')
    if nexturl is None:
        return None
    uri = fx_extract(nexturl, './a[contains(text(), "Next")]/@href')
    if not uri:
        return None
    return self.rooturl + uri
def parse(self, response):
    logger.info('fetch : ' + response.url)

    sales = f_xpath(response,
                    '//ul[contains(@class, "listing")'
                    ' and contains(@class, "retailer-detail")'
                    ' and contains(@class, "infinite")]'
                    ).xpath('./li[starts-with(@id, "offer-")]')

    for s in sales:
        s = f_xpath(s, './div').xpath('./a')
        id = fx_extract(s, './@data-offer-id')
        href = fx_extract(s, './@href')
        th_img = fx_extract(s, './img/@src')
        if not (id and href and th_img):
            continue

        # TODO: skip offers whose id is already in the db

        if not href.startswith('http://'):
            href = self.rooturl + href

        meta = {'id': id, 'href': href, 'th_img': th_img}
        yield scrapy.http.Request(url=href, callback=self.parse_one_sale,
                                  meta=meta, dont_filter=True)

    next_url = self.store_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    logger.info('fetch : ' + response.url)

    catgos = f_xpath(response,
                     '//ul[contains(@class, "listing")'
                     ' and contains(@class, "grouping")'
                     ' and contains(@class, "infinite")]').xpath('./li')

    for ctg in catgos:
        ctg = f_xpath(ctg, './/div[@class="chiclet-actions"]/a')
        if not ctg:
            continue

        href = fx_extract(ctg, './@href')
        name = fx_extract(ctg, './@title')
        name = self.parse_categorie_name(name)

        # expect a three-segment href: /<prefix>/<nameid>/<id>
        try:
            _c, cid, id = href.strip('/').split('/')
        except (ValueError, AttributeError):
            continue

        d = FindnsaveCategoryItem()
        d['id'] = id
        d['name'] = name
        d['nameid'] = cid
        d['uri'] = href
        yield d

    next_url = self.categorie_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    logger.info('fetch : ' + response.url)

    tops = f_xpath(response, '//ul[contains(@class, "thumbnails")]').xpath('./li')

    for top in tops:
        top = f_xpath(top, './div[@class="thumbnail"]')
        if not top:
            continue

        name = fx_extract(top, './p/strong/text()')
        href = fx_extract(top, './a/@href').strip()

        curr_meta = {}
        curr_meta['name'] = name
        curr_meta['url'] = href
        curr_meta['key'] = 'meta/' + href[self.prefix_len:] + '.json'
        curr_meta['top'] = []

        yield scrapy.http.Request(url=href, callback=self.parse_one_top,
                                  meta=curr_meta, dont_filter=True)
def parse(self, response):
    logger.info('fetch : ' + response.url)

    brands = f_xpath(response,
                     '//ul[contains(@class, "brands")'
                     ' and contains(@class, "columnize")'
                     ' and contains(@class, "clearfix")]').xpath('./li')

    for br in brands:
        br = f_xpath(br, './a')
        if not br:
            continue

        href = fx_extract(br, './@href')
        name = fx_extract(br, './text()')

        # expect a three-segment href: /<prefix>/<nameid>/<id>
        try:
            _b, bid, id = href.strip('/').split('/')
        except (ValueError, AttributeError):
            continue

        d = FindnsaveBrandItem()
        d['id'] = id
        d['name'] = escape(name)
        d['nameid'] = bid
        d['uri'] = href
        yield d

    next_url = self.brand_next_page(response)
    if next_url is None:
        return
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
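# `escape` is imported from elsewhere in the repo. A minimal sketch, assuming
# it XML-escapes item text and must tolerate None fields (both assumptions):
from xml.sax.saxutils import escape as _xml_escape

def escape(s):
    return _xml_escape(s) if s else ''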
def _parse_price(self, p):
    price = f_xpath(p, './div[@class="product-price"]')

    p_c = fx_extract(price, './/span[@itemprop="priceCurrency"]/@content') or ''
    p_p = fx_extract(price, './/span[@class="price"]/@content') or '-1'
    p_r = fx_extract(price, './/span[@class="regular-price"]/text()') or ''
    p_u = fx_extract(price, './/span[@itemprop="priceValidUntil"]/@content') or ''

    # the price must be numeric; fall back to the '-1' sentinel if it is not
    try:
        float(p_p)
    except Exception:
        p_p = '-1'

    # the regular price is free text; pull out the bare number after the '$'
    if p_c == 'USD':
        if '$' in p_r:
            p_r = p_r.split('$')[1].split()[0]

    return p_c, p_p, p_r, p_u
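# Illustrative only: how the USD branch above reduces a free-text regular
# price to a bare number:
#   p_r = 'Regular price: $5.99 each'
#   p_r.split('$')[1]            -> '5.99 each'
#   p_r.split('$')[1].split()[0] -> '5.99'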
def parse(self, response):
    logger.info('fetch : ' + response.url)

    states = f_xpath(response, '//select[@id="states-dropdown"]').xpath('./option')

    # map state abbreviation -> full state name
    sts = {}
    for st in states:
        st_short = fx_extract(st, './@value')
        st_name = fx_extract(st, './text()')
        if not st_short:
            continue
        if st_short not in sts:
            sts[st_short] = st_name

    states = xpath(response, '//ul[contains(@class, "hide")'
                             ' and contains(@class, "clearfix")]')

    for st in states:
        st_short = fx_extract(st, './@id')
        locs = st.xpath('./li')
        for loc in locs:
            url = fx_extract(loc, './a/@href')
            area = fx_extract(loc, './a/text()')
            if st_short not in sts:
                continue

            d = FindnsaveAreaItem()
            d['area'] = area
            d['short'] = st_short
            d['state'] = sts[st_short]
            d['url'] = url
            yield d
def _parse_desc(self, p):
    desc = f_xpath(p, './div[@class="offer-descriptions"]')
    desc = ' '.join([x.strip() for x in
                     xpath_extract(desc, './/div[@class="offer-description"]/text()')])
    return desc
def parse(self, response):
    logger.info('fetch : ' + response.url)

    prdid = response.url.split('?')[0].split('/')[-1]

    review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
    if review is None:
        # no review table on this page: move on to the next product
        yield scrapy.http.Request(url=next_product_url(), callback=self.parse)
        return

    rids = xpath_extract(review, './a/@name')
    details = xpath(review, './div')
    length = min(len(rids), len(details))

    for i in xrange(length):
        rdetail = details[i]
        divs = xpath(rdetail, './div')
        # len(divs) is at most 7 (0-6):
        # 0 : number of helpful reviews
        # 1 : star, helpful text, date
        # 2 : reviewer, reviewer from
        # 3 : from
        # 4 : free product
        # 5 : reviewText
        # 6 : helpful?

        d = self.empty_item()
        d['prdid'] = prdid
        d['rid'] = rids[i]
        d['text'] = ' '.join(
            xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

        while len(divs) > 0:
            div = divs[0]
            divs = divs[1:]
            text = div.extract()

            if 'people found the following review helpful' in text:
                d['num_help_review'] = self.parse_num_help_review(div)
                continue

            if 'out of' in text and 'stars' in text and '</nobr>' in text:
                d['star'], d['help_text'], d['date'] = \
                    self.parse_star_help_date(div)
                continue

            if 'By' in text and 'See all my reviews' in text:
                d['reviewer'], d['reviewer_from'] = \
                    self.parse_reviewer_from(div)
                continue

            if 'This review is from' in text:
                d['rfrom'] = self.parse_from(div)
                break

        yield d

    # keep paging through this product's reviews; when exhausted, move on
    # (see http://doc.scrapy.org/en/latest/topics/request-response.html)
    next_url = self.next_page(response) or next_product_url()
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
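# `next_product_url` is imported from elsewhere in the repo. A plausible
# sketch, assuming a module-level queue of product-review URLs to walk;
# the queue name and seed URL are hypothetical:
_PRODUCT_URLS = ['http://www.amazon.com/product-reviews/B00EXAMPLE/']

def next_product_url():
    # pop the next product-review URL, or None when the queue is exhausted
    return _PRODUCT_URLS.pop(0) if _PRODUCT_URLS else None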