import logging
import os
import pprint

import scrapy
from openpyxl import Workbook, load_workbook
from scrapy import signals

logger = logging.getLogger(__name__)

# file_name, settings, and XlsExporter are assumed to be defined elsewhere
# in the project (see the sketch below).


class GoodPipeline(object):
    headers = ('name', 'jan', 'package', 'package_cnt',
               'picture', 'price', 'description')

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = None
        if not os.path.exists(file_name):
            print("create file")
            self.file = open(file_name, 'w')
            self.wb = Workbook()
            # drop the default worksheet so only per-brand sheets remain
            # (remove_sheet() is deprecated in openpyxl; remove() replaces it)
            self.wb.remove(self.wb.active)
            self.wb.save(file_name)
        else:
            self.wb = load_workbook(file_name)
        self.exporter = XlsExporter(self.file, self.wb)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # create a worksheet for the brand on first encounter
        # (replaces the deprecated get_sheet_by_name() / KeyError dance)
        if item["brand"] not in self.wb.sheetnames:
            self.wb.create_sheet(item["brand"])
        brand_worksheet = self.wb[item["brand"]]
        # (re)write the header row; this runs for every item, which is
        # harmless but could be moved to the sheet-creation branch above
        verbose_headers = settings.EXPORT_SETTINGS["headers"]
        for col, header in enumerate(verbose_headers):
            cell = brand_worksheet.cell(row=1, column=col + 1)
            cell.value = header
        self.exporter.export_item(item)
        return item

    def get_media_requests(self, item, info):
        # note: only honoured if this class inherits from a media pipeline
        # such as scrapy.pipelines.images.ImagesPipeline
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.wb.save(file_name)
        logger.debug("save complete")
        if self.file:
            self.file.close()
        # pprint.pprint() prints and returns None; pformat() returns the string
        logger.debug("---PARSE REPORT---\r\n%s\r\n----------"
                     % pprint.pformat(spider.parse_report()))
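The class above leans on three names defined elsewhere in the project: file_name (the target .xlsx path), a settings object exposing EXPORT_SETTINGS, and a custom XlsExporter. Scrapy ships no Excel exporter, so here is a minimal sketch of what those pieces might look like; the exporter body, the file path, and the module path below are assumptions for illustration, not the original code.

# Hypothetical sketch of the helpers the pipeline assumes.
from scrapy.exporters import BaseItemExporter

file_name = 'goods.xlsx'  # assumed target path


class XlsExporter(BaseItemExporter):
    """Appends one row per item to the worksheet named after item['brand']."""

    def __init__(self, file, workbook, **kwargs):
        super().__init__(**kwargs)
        self.file = file   # kept for interface parity; openpyxl saves by path
        self.wb = workbook

    def export_item(self, item):
        ws = self.wb[item["brand"]]
        # serialize fields in declaration order; append() lands the row
        # below the header row the pipeline wrote to row 1
        row = [value for _, value
               in self._get_serialized_fields(item, default_value='')]
        ws.append(row)

For Scrapy to call the pipeline at all, it also has to be registered in settings.py (module path is hypothetical):

ITEM_PIPELINES = {
    'myproject.pipelines.GoodPipeline': 300,
}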