import csv

from scrapy import log
from sqlalchemy import desc

# Session, Meta and Books are the project's SQLAlchemy session factory and
# models; the module name here is assumed (see the sketch below).
from models import Session, Meta, Books


class FlipkartTrendsPipeline(object):

    def __init__(self):
        # Optional CSV dump of scraped books; the writerow() call in
        # process_item is currently disabled.
        self.newcsv = csv.writer(open("books.csv", "w"))
        self.session = Session()

    def process_item(self, item, spider):
        # Only handle items produced by the Flipkart spider.
        use_for = ['FlipkartSpider']
        if spider.name in use_for:
            log.msg(item['title'], level=log.DEBUG)
            # Attach this book to the most recent scraping round.
            round_number = self.session.query(Meta).order_by(
                desc(Meta.round)).first()
            #self.newcsv.writerow([item['author'][0], item['title'][0], item['price'][0]])
            book = Books(unicode(round_number.round),
                         unicode(item['title'][0]),
                         unicode(item['author'][0]),
                         flipkart=unicode(item['price'][0].split(' ')[2]))
            self.session.add(book)
            # commit() flushes on its own, so the extra flush() was redundant.
            self.session.commit()
        return item
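Session, Meta and Books are not defined anywhere in these snippets. Below is a minimal sketch of what the pipeline and spider assume; the module layout, connection string and column types are guessed from the calls above, not taken from the original project.

# models.py -- assumed module; everything here is inferred from usage.
from sqlalchemy import create_engine, Column, Integer, Unicode
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()
engine = create_engine('sqlite:///books.db')  # placeholder connection string

class Meta(Base):
    """One row per scraping round; the spider bumps this on start-up."""
    __tablename__ = 'meta'
    id = Column(Integer, primary_key=True)
    round = Column(Integer)

    def __init__(self, round):
        self.round = round

class Books(Base):
    """One scraped book, keyed to the round it was seen in."""
    __tablename__ = 'books'
    id = Column(Integer, primary_key=True)
    round = Column(Unicode(16))
    title = Column(Unicode(256))
    author = Column(Unicode(256))
    flipkart = Column(Unicode(32))  # the Flipkart price, as scraped

    def __init__(self, round, title, author, flipkart=None):
        self.round = round
        self.title = title
        self.author = author
        self.flipkart = flipkart

Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)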
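Scrapy only calls process_item if the pipeline is enabled in settings.py. For the Scrapy versions that still shipped BaseSpider, ITEM_PIPELINES was a plain list; the package path below is assumed.

# settings.py -- the package name 'flipkarttrends' is assumed.
ITEM_PIPELINES = ['flipkarttrends.pipelines.FlipkartTrendsPipeline']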
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from sqlalchemy import desc

# Session and Meta come from the project's models module (name assumed);
# Book is the project's Item subclass (see the sketch below).
from models import Session, Meta
from items import Book


class FlipkartSpider(BaseSpider):
    name = "FlipkartSpider"
    # allowed_domains takes bare domain names, not URLs.
    allowed_domains = ["flipkart.com", "www.flipkart.com"]
    start_urls = ['http://www.flipkart.com/view-books/0/new-releases']

    def __init__(self):
        # Start a new scraping round: increment the highest round number,
        # or start at round 0 if the meta table is still empty.
        self.session = Session()
        try:
            round_info = self.session.query(Meta).order_by(
                desc(Meta.round)).first()
            print round_info
            new_round = Meta(round_info.round + 1)
        except AttributeError:
            # .first() returned None: this is the first run.
            new_round = Meta(0)
        self.session.add(new_round)
        self.session.commit()
        self.session.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Each of these divs holds one book on the new-releases page.
        sites = hxs.select('//div[@class="fk-srch-item fk-inf-scroll-item"]')
        items = []
        print len(sites)
        for site in sites:
            item = Book()
            item['title'] = site.select('div[@class="line fksd-bodytext "]/div[@class="line bmargin10"]/h2[@class="fk-srch-item-title fksd-bodytext"]/a/text()').extract()
            item['author'] = site.select('div[@class="line fksd-bodytext "]/div[@class="line bmargin10"]/span[@class="fk-item-authorinfo-text fksd-smalltext"]/a/text()').extract()
            item['price'] = site.select('div[@class="line fksd-bodytext "]/div[@class="unit fk-sitem-info-section"]/div[@class="line fk-itemdetail-info fksd-bodytext"]/div[@class="line dlvry-det"]/div[@class="line fk-srch-pricing fksd-smalltext"]/b[@class="fksd-bodytext price final-price"]/text()').extract()
            items.append(item)
        return items
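The Book item is also not shown; given the three fields the spider fills in, its definition is presumably along these lines.

# items.py -- sketch inferred from the fields the spider sets.
from scrapy.item import Item, Field

class Book(Item):
    title = Field()   # each field holds the list that .extract() returns
    author = Field()
    price = Field()

With name set to "FlipkartSpider", the crawl is started with scrapy crawl FlipkartSpider; each returned Book then flows through FlipkartTrendsPipeline.process_item.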