def parse(self, response):

    logger.info('fetch : ' + response.url)

    # Build a mapping of state abbreviation -> full state name from the
    # states dropdown.
    states = f_xpath(response,
                     '//select[@id="states-dropdown"]').xpath('./option')

    sts = {}
    for st in states:
        st_short = fx_extract(st, './@value')
        st_name = fx_extract(st, './text()')
        if not st_short:
            continue
        if st_short not in sts:
            sts[st_short] = st_name

    # Each <ul> groups the location links for one state.
    states = xpath(response, '//ul[contains(@class, "hide")'
                             ' and contains(@class, "clearfix")]')

    # state_fd = open('/tmp/state_url.csv', 'w')
    # csvw = csv.writer(state_fd)

    for st in states:
        st_short = fx_extract(st, './@id')
        locs = st.xpath('./li')

        for loc in locs:
            url = fx_extract(loc, './a/@href')
            area = fx_extract(loc, './a/text()')

            # csvw.writerow([st_short, sts.get(st_short, ''), area, url])

            if st_short not in sts:
                continue

            d = FindnsaveAreaItem()
            d['area'] = area
            d['short'] = st_short
            d['state'] = sts[st_short]
            d['url'] = url

            yield d
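
# FindnsaveAreaItem is defined elsewhere in the project. A minimal sketch
# of what it plausibly looks like, assuming it is a plain scrapy.Item
# carrying the four fields populated in parse() above:

import scrapy

class FindnsaveAreaItem(scrapy.Item):
    area = scrapy.Field()    # location name (the link text)
    short = scrapy.Field()   # state abbreviation
    state = scrapy.Field()   # full state name
    url = scrapy.Field()     # link to the area's page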
def parse(self, response):

    # with open('/tmp/amazon.html', 'w') as f:
    #     f.write(response.body)

    logger.info('fetch : ' + response.url)
    prdid = response.url.split('?')[0].split('/')[-1]

    review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
    if review is None:
        # No review table on this page: move on to the next product.
        # Without this return the generator would fall through and crash
        # on the xpath calls below.
        yield scrapy.http.Request(url=next_product_url(),
                                  callback=self.parse)
        return

    rids = xpath_extract(review, './a/@name')
    details = xpath(review, './div')
    length = min(len(rids), len(details))

    for i in xrange(length):
        rdetail = details[i]
        divs = xpath(rdetail, './div')

        # max of len(divs) is 7, (0 - 6)
        #   0 : number of helpful review
        #   1 : star, helpful text, date
        #   2 : reviewer, reviewer from
        #   3 : from
        #   4 : free product
        #   5 : reviewText
        #   6 : helpful?

        d = self.empty_item()
        d['prdid'] = prdid
        d['rid'] = rids[i]
        d['text'] = ' '.join(
            xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

        # Consume the divs one by one, dispatching on marker text to the
        # matching field parser.
        while len(divs) > 0:
            div = divs[0]
            divs = divs[1:]
            text = div.extract()

            if 'people found the following review helpful' in text:
                d['num_help_review'] = self.parse_num_help_review(div)
                continue

            if 'out of' in text and 'stars' in text and '</nobr>' in text:
                d['star'], d['help_text'], d['date'] = \
                    self.parse_star_help_date(div)
                continue

            if 'By' in text and 'See all my reviews' in text:
                d['reviewer'], d['reviewer_from'] = \
                    self.parse_reviewer_from(div)
                continue

            if 'This review is from' in text:
                d['rfrom'] = self.parse_from(div)
                break

        yield d

    next_url = self.next_page(response) or next_product_url()

    # see http://doc.scrapy.org/en/latest/topics/request-response.html
    yield scrapy.http.Request(url=next_url,
                              callback=self.parse,
                              dont_filter=True)
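
# The xpath helpers used in both spiders (xpath, f_xpath, xpath_extract,
# fx_extract) are defined elsewhere in the project. A minimal sketch of
# plausible definitions, assuming they are thin wrappers over Scrapy's
# Selector API; the "first match or None" behaviour is inferred from the
# "if review is None" check above.

def xpath(node, path):
    # All matching selectors (a SelectorList, possibly empty).
    return node.xpath(path)

def f_xpath(node, path):
    # First matching selector, or None when nothing matches.
    els = node.xpath(path)
    return els[0] if len(els) > 0 else None

def xpath_extract(node, path):
    # Extracted text/markup of all matches, as a list of strings.
    return node.xpath(path).extract()

def fx_extract(node, path):
    # Extracted text of the first match, or None.
    texts = xpath_extract(node, path)
    return texts[0] if texts else None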