예제 #1 (Example #1)
    def spider_opened(self, spider):
        """Prepare the xlsx workbook and exporter when the spider starts.

        If ``file_name`` (a module-level path defined elsewhere in this
        file) does not exist yet, create it with a fresh workbook whose
        default sheet is removed; otherwise load the existing workbook.
        Finally construct the exporter and begin exporting.
        """
        self.file = None
        if not os.path.exists(file_name):
            print("create file")
            # NOTE(review): this text-mode handle truncates the file and
            # stays open while wb.save() writes the same path below —
            # fragile (can block the save on Windows). Kept only because
            # XlsExporter receives it; confirm it is actually needed.
            self.file = open(file_name, 'w')
            self.wb = Workbook()
            # Drop the auto-created default worksheet so only per-brand
            # sheets remain. Workbook.remove() replaces remove_sheet(),
            # which was removed in openpyxl 3.0.
            self.wb.remove(self.wb.active)
            self.wb.save(file_name)
        else:
            # Existing file: reuse its workbook; self.file stays None here.
            self.wb = load_workbook(file_name)

        self.exporter = XlsExporter(self.file, self.wb)
        self.exporter.start_exporting()
예제 #2 (Example #2)
class GoodPipeline(object):
    """Scrapy item pipeline that exports scraped items into per-brand
    worksheets of a single xlsx workbook (path given by the module-level
    ``file_name``).
    """

    headers = ('name', 'jan', 'package', 'package_cnt', 'picture', 'price', 'description')

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory: build the pipeline and wire the
        spider open/close signals to its handlers."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Prepare the xlsx workbook and exporter when the spider starts.

        Creates ``file_name`` with a fresh workbook (default sheet
        removed) when it does not exist, otherwise loads the existing
        workbook, then starts the exporter.
        """
        self.file = None
        if not os.path.exists(file_name):
            print("create file")
            # NOTE(review): this text-mode handle truncates the file and
            # stays open while wb.save() writes the same path below —
            # fragile (can block the save on Windows). Kept only because
            # XlsExporter receives it; confirm it is actually needed.
            self.file = open(file_name, 'w')
            self.wb = Workbook()
            # Drop the auto-created default worksheet so only per-brand
            # sheets remain. Workbook.remove() replaces remove_sheet(),
            # which was removed in openpyxl 3.0.
            self.wb.remove(self.wb.active)
            self.wb.save(file_name)
        else:
            # Existing file: reuse its workbook; self.file stays None here.
            self.wb = load_workbook(file_name)

        self.exporter = XlsExporter(self.file, self.wb)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Ensure a worksheet exists for the item's brand, (re)write the
        header row, and hand the item to the exporter.

        Returns the item unchanged so later pipeline stages receive it.
        """
        brand = item["brand"]
        # Membership test on sheetnames replaces the deprecated
        # get_sheet_by_name()/KeyError dance (removed in openpyxl 3.0)
        # and avoids looking the sheet up twice.
        if brand not in self.wb.sheetnames:
            self.wb.create_sheet(brand)
        brand_worksheet = self.wb[brand]

        verbose_headers = settings.EXPORT_SETTINGS["headers"]
        # Rewritten on every item; idempotent because the headers are
        # constant for the run.
        for col, header in enumerate(verbose_headers, start=1):
            brand_worksheet.cell(row=1, column=col, value=header)

        self.exporter.export_item(item)
        return item

    def get_media_requests(self, item, info):
        """Yield a download request for every image URL on the item
        (Scrapy images-pipeline hook)."""
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def spider_closed(self, spider):
        """Flush the exporter, persist the workbook, close the file
        handle (if one was opened), and log the parse report."""
        self.exporter.finish_exporting()
        self.wb.save(file_name)
        logger.debug("save complete")
        if self.file:
            self.file.close()

        # BUG FIX: pprint.pprint() prints to stdout and returns None, so
        # the original logged the literal "None"; pformat() returns the
        # formatted string for embedding in the message.
        logger.debug("---PARSE REPORT---\r\n%s\r\n----------" % pprint.pformat(spider.parse_report()))