class JsonPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/home/gaoliang/Desktop/result.json', 'w+b')
        self.files[spider] = file
        # ensure_ascii=False keeps Chinese text readable in the JSON output
        self.exporter = JsonItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
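None of these pipelines run unless they are registered in the project settings. A minimal sketch, assuming the class above lives in a hypothetical module named myproject.pipelines:

# settings.py -- enable the exporter pipeline; the number sets its run order
ITEM_PIPELINES = {
    'myproject.pipelines.JsonPipeline': 300,  # module path is an assumption
}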
class JsonExportPipeline(object):

    def __init__(self):
        _log.info('JsonExportPipeline.init....')
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        _log.info('JsonExportPipeline.from_crawler....')
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        _log.info('JsonExportPipeline.spider_opened....')
        file = open('%s.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        _log.info('JsonExportPipeline.spider_closed....')
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        _log.info('JsonExportPipeline.process_item....')
        self.exporter.export_item(item)
        return item
class JsonPipeline(object): """Save Pipeline output to JSON.""" def __init__(self, spider_name): self.file = open("output/{}_recipes.json".format(spider_name), 'wb') self.file.write( '{"date_scraped": "%s", "recipes": ' % datetime.datetime.now() ) self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False) self.exporter.start_exporting() @classmethod def from_crawler(cls, crawler): return cls( spider_name=crawler.spider.name ) def close_spider(self): self.exporter.finish_exporting() self.file.write("}") self.file.close() def process_item(self, item): self.exporter.export_item(item) return item
class SaveItemToJson(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file=file)
        print(self.exporter)  # was a Python 2 print statement
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonPipelineExporterMixin:

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        pipeline.files = {}  # initialised here: the mixin defines no __init__
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        item = self.pre_process_item(item)
        self.exporter.export_item(item)
        return item

    def pre_process_item(self, item):
        return item
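The pre_process_item hook above is the mixin's extension point. A minimal sketch of a concrete subclass, with a hypothetical PriceJsonPipeline that normalises one field before export:

class PriceJsonPipeline(JsonPipelineExporterMixin):
    # hypothetical subclass: trim whitespace from a 'price' field
    # before the item is serialised
    def pre_process_item(self, item):
        item['price'] = item['price'].strip()
        return item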
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # JsonItemExporter writes bytes, so open the file in plain binary mode
        # and pass the encoding to the exporter instead of using codecs.open()
        file = open('%s_data.json' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object): """ app.pipelines.exporter_json.JsonExportPipeline """ def __init__(self): self.files = {} self.exporter = None @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file_json = open('%s_items.json' % spider.name, 'w+b') self.files[spider] = file_json self.exporter = JsonItemExporter(file_json) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file_json = self.files.pop(spider) file_json.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class JsonExporterPipeline(object):
    # Use Scrapy's JsonItemExporter to export items to a JSON file

    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class TopicItemPipeline(object):

    def __init__(self):
        self.file = open("data.json", 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', indent=4)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ZapposJasonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('tiger.json', 'wb')
        self.exporter = JsonItemExporter(self.file, indent=2)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class TescoPipeline:

    def open_spider(self, spider):
        self.file = open('item.json', 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class honglingjingjson(object):

    def open_spider(self, spider):
        self.file = open('honglingjing.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class TvshowPipeline(object):

    def __init__(self):
        self.fp = open("tv.json", "wb")
        self.exporter = JsonItemExporter(self.fp, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # renamed from close(): Scrapy only calls the close_spider() hook,
    # so the original method was never invoked
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
class JingdongSpiderPipeline(object):

    def __init__(self):
        self.__json_file = open("cup.json", "wb")
        self.__exporter = JsonItemExporter(self.__json_file, encoding="utf-8")
        self.__exporter.start_exporting()

    def close_spider(self, spider):
        self.__exporter.finish_exporting()
        self.__json_file.close()

    def process_item(self, item, spider):
        self.__exporter.export_item(item)
        return item
class FileExporterJson(object):

    def open_spider(self, spider):
        self.file = open("./results/.json", 'wb')  # Update name
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class WxappPipeline(object):

    def __init__(self):
        self.fp = open('wxjiaocheng.json', 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()  # was missing: the exporter must be started

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()  # was missing: writes the closing bracket
        self.fp.close()
class NaverNewsCrawlerPipeline(object):

    def __init__(self):
        self.file = open("news_Crawl_from20060101_200809011.json", 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonItemPipeline:
    # file_name was referenced but never defined anywhere; a default is
    # supplied here so the pipeline runs as written
    file_name = "items.json"

    def open_spider(self, spider):
        self.file = open(self.file_name, "wb")
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
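A subclass can then supply its own output name by overriding the class attribute; a minimal sketch with a hypothetical subclass name:

class NewsJsonPipeline(JsonItemPipeline):
    file_name = "news_items.json"  # hypothetical per-project override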
class JsonExporterPipeline(object): def __init__(self): self.file = open("article_export.json", "wb") self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) self.exporter.start_exporting() def process_item(self, item, spider): self.exporter.export_item(item) return item def spider_closed(self, spider): self.exporter.finish_exporting() self.file.close()
class CreepycrawlersPipeline(object):

    def open_spider(self, spider):
        # note: despite the .jl extension, JsonItemExporter writes a single
        # JSON array, not JSON Lines
        self.file = open('results.jl', 'wb')
        self.exp = JsonItemExporter(self.file, indent=4)
        self.exp.start_exporting()

    def close_spider(self, spider):
        self.exp.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exp.export_item(item)
        return item
class JsonExporterPipeline(object):
    # Use Scrapy's JSON exporter to export a JSON file

    def __init__(self):
        self.file = open("article_exporter.json", 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()  # was missing: pairs with finish_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonPipeline(object): def __init__(self): self.file = open("items.json", 'wb') self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False) self.exporter.start_exporting() def close_spider(self, spider): self.exporter.finish_exporting() self.file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class TreasureJsonPipeline(object):

    def open_spider(self, spider):
        self.file = open('treasure.json', 'wb')
        self.write = JsonItemExporter(self.file)
        self.write.start_exporting()

    def process_item(self, item, spider):
        self.write.export_item(item)
        return item

    def close_spider(self, spider):
        # finish_exporting() must run before the file is closed; the original
        # closed the file first and would write to a closed file
        self.write.finish_exporting()
        self.file.close()
class ArticleJsonWithExporterPipeline(object):

    def __init__(self):
        self.file = open("article_with_exporter.json", "wb")
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item  # was missing: later pipelines would otherwise receive None

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()  # was missing: flushes and releases the file handle
class DoubanvideoPipeline(object):

    def open_spider(self, spider):
        self.file = open('doubanvideo.json', 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class JsonPipeline(object):

    def __init__(self):
        self.file = open('channels.json', 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class AritcleJsonItemExporter(object):
    """Export items to a JSON file using Scrapy's JSON exporter."""

    def __init__(self):
        self.file = open('articleExport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExporterPipleline(object):
    # Export JSON using Scrapy's JsonItemExporter

    def __init__(self):
        self.file = open("articleexport.json", 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item=item)
        return item
class JsonExporterPipeline(object):
    # use the Scrapy built-in JSON writer

    def __init__(self):
        self.file = open('articleJsonExporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class JsonExporterPipeline(object):
    # Write items to a JSON file with Scrapy's JsonItemExporter

    def __init__(self):
        self.file = open('article_exporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # was self.exporter.fields_to_export(), which is an attribute,
        # not a method; finish_exporting() is the intended call
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ScrapeNyxPipeline:

    def open_spider(self, spider):
        self.file = open("data.json", 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExporterPipline(object):
    # Use the JSON exporter provided by Scrapy

    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class KatinuoPipeline(object):

    def open_spider(self, spider):
        self.file = open('novel.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class JsonExportPipeline(object):
    # Use Scrapy's JSON exporter to export a JSON file

    def __init__(self):
        self.file = open('articleExport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8")
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExporterPipeline(object):
    # Export a JSON file with the JSON exporter provided by Scrapy

    def __init__(self):
        self.file = open('articlexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)  # this step serialises the Python data to JSON
        return item
class VisionsJsonPipeline(object):

    def __init__(self):
        self.exporter = None
        self.file = None

    def open_spider(self, spider):
        # keep a reference to the file so it can be closed later
        self.file = open('%s.json' % spider.name, 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()  # was missing: the file handle was never closed
class BestsellerItemJsonPipeline(object):

    def open_spider(self, spider):
        self.file = open('amazon_bestseller.json', 'wb')  # write mode
        # ensure_ascii=False avoids escaping non-ASCII characters
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MaitianPipeline(object):

    def open_spider(self, spider):
        self.file = open('house_info.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, MaitianItem):
            self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class JsonWriterPipeline(object):

    def __init__(self):
        self.file = open('news.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        print("========== saving to JSON ==========")
        self.exporter.export_item(item)
        return item
def process_item(self, item, spider):
    designer_dir_name = skutils.escape_filename(item['name'])
    designer_dir_path = os.path.join(GlobalState.data_dir, designer_dir_name)
    file_path = os.path.join(designer_dir_path, designer_dir_name)
    # write json file
    with open('%s.json' % file_path, 'w+b') as f:
        exporter = JsonItemExporter(f)
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
    # write excel file
    excelutils.write_designer_excel(item, file_path, designer_dir_name)
    return item
class JsonWriterPipeline(BaseItemExporter):

    def __init__(self, **kwargs):
        self._configure(kwargs)
        self.files = {}
        self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # JsonItemExporter expects a binary file; pass the encoding to the
        # exporter rather than mixing codecs.open('wb', encoding=...)
        file = open('item.json', 'wb')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        if item['title']:  # and item['image_url']:
            # strip embedded newlines from all free-text fields
            for field in ('description', 'general_impression', 'subject_of_photo',
                          'composition', 'use_of_camera', 'depth_of_field',
                          'color_lighting', 'focus'):
                item[field] = re.sub(r"\r|\n", "", item[field])
            self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):
    # The original connected signals in __init__ via the deprecated
    # pydispatch dispatcher; from_crawler is the supported way.

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class WikicrawlerPipeline(object):

    def __init__(self):
        self.item_file = open('items.json', 'wb')
        self.exporter = JsonItemExporter(self.item_file)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item  # was missing: pipelines must return the item

    def spider_closed(self, spider):  # signal handlers receive the spider argument
        self.exporter.finish_exporting()
        self.item_file.close()
class SiteMapJsonExportPipeline(object):
    '''Process the SiteMap spider output Items, and write them as JSON to an
    output file. The output file is taken from the Spider's config
    (spider.config).'''

    @classmethod
    def from_crawler(cls, crawler):
        '''Boilerplate'''
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open(spider.config['map_file'], 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
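For simple single-file exports like most of the examples above, recent Scrapy versions (2.1+) can replace a hand-written exporter pipeline entirely with the built-in FEEDS setting; a minimal sketch:

# settings.py -- built-in feed export, no pipeline code needed (Scrapy 2.1+)
FEEDS = {
    'items.json': {'format': 'json', 'encoding': 'utf8'},
}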
class DuplicatesExportPipeline(object):

    def __init__(self):
        self.category_seen = set()
        self.product_seen = set()
        self.shop_seen = set()
        self.product_price_seen = set()

    def open_spider(self, spider):
        # Create four files for storing the scraped items
        self.category_file = open('spider/scraped/category.json', 'wb')
        self.category_exporter = JsonItemExporter(self.category_file, encoding="utf-8")
        self.category_exporter.start_exporting()

        self.product_file = open('spider/scraped/product.json', 'wb')
        self.product_exporter = JsonItemExporter(self.product_file, encoding="utf-8")
        self.product_exporter.start_exporting()

        self.shop_file = open('spider/scraped/shop.json', 'wb')
        self.shop_exporter = JsonItemExporter(self.shop_file, encoding="utf-8")
        self.shop_exporter.start_exporting()

        self.product_price_file = open('spider/scraped/productprice.json', 'wb')
        self.product_price_exporter = JsonItemExporter(self.product_price_file, encoding="utf-8")
        self.product_price_exporter.start_exporting()

    def close_spider(self, spider):
        # Finish the exports and close the item files
        self.category_exporter.finish_exporting()
        self.category_file.close()
        self.product_exporter.finish_exporting()
        self.product_file.close()
        self.shop_exporter.finish_exporting()
        self.shop_file.close()
        self.product_price_exporter.finish_exporting()
        self.product_price_file.close()

    def process_item(self, item, spider):
        if 'id' in item.keys() and 'name' in item.keys() and 'parent_category_id' in item.keys():
            # Drop duplicate categories
            if item['id'] in self.category_seen:
                raise DropItem("Duplicate category item found: %s" % item)
            self.category_seen.add(item['id'])
            self.category_exporter.export_item(item)
            return item
        if 'name' in item.keys() and 'category_id' in item.keys() and 'thumbnail_url' in item.keys() and 'url' in item.keys():
            # Drop duplicate products
            if item['url'] in self.product_seen:
                raise DropItem("Duplicate product item found: %s" % item)
            self.product_seen.add(item['url'])
            self.product_exporter.export_item(item)
            return item
        if 'name' in item.keys() and 'url' in item.keys() and 'thumbnail_url' in item.keys():
            # Drop duplicate shops
            if item['url'] in self.shop_seen:
                raise DropItem("Duplicate shop item found: %s" % item)
            self.shop_seen.add(item['url'])
            self.shop_exporter.export_item(item)
            return item
        if 'shop_id' in item.keys() and 'product_id' in item.keys() and 'price' in item.keys() and 'price_and_shipment' in item.keys():
            # Drop duplicate product prices
            if item['shop_id'] + '-' + item['product_id'] in self.product_price_seen:
                raise DropItem("Duplicate product price item found: %s" % item)
            self.product_price_seen.add(item['shop_id'] + '-' + item['product_id'])
            self.product_price_exporter.export_item(item)
            return item
        return item
class JsonExportPipeline(object):

    def __init__(self):
        self.files = []
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        if spider.__class__ == MayorsSpider:
            mayor_file = open("data/mayor_candidates.json", "w+b")
            council_file = open("data/city_counsils.json", "w+b")
            self.files.append(mayor_file)
            self.files.append(council_file)
            self.mayor_exporter = JsonItemExporter(mayor_file)
            self.council_exporter = JsonItemExporter(council_file)
            self.mayor_exporter.start_exporting()
            self.council_exporter.start_exporting()
        elif spider.__class__ == RegionCountiesSpider:
            counties_file = open("data/region_counties.json", "w+b")
            self.counties_exporter = JsonItemExporter(counties_file)
            self.counties_exporter.start_exporting()  # was missing
            self.files.append(counties_file)

    def create_exporter(self, filename):
        file = open(filename, "w+b")
        exporter = JsonItemExporter(file)
        exporter.start_exporting()
        self.files.append(file)
        return exporter

    def spider_closed(self, spider):
        if spider.__class__ == MayorsSpider:
            self.mayor_exporter.finish_exporting()
            self.council_exporter.finish_exporting()
        elif spider.__class__ == RegionCountiesSpider:
            self.counties_exporter.finish_exporting()  # was missing
            for exporter in self.exporters.values():  # itervalues() is Python 2 only
                exporter.finish_exporting()
        for file in self.files:
            file.close()

    def process_item(self, item, spider):
        if item.__class__ == CityCouncil:
            self.council_exporter.export_item(item)
        elif item.__class__ == MayorCandidate:
            self.mayor_exporter.export_item(item)
            # the original also exported to self.counties_exporter here, but
            # that exporter only exists for RegionCountiesSpider and the call
            # would raise AttributeError
        else:
            if item.__class__ == RegionCandidate:
                filename = "data/region_council_candidates.json"
            elif item.__class__ == RegionCounty:
                filename = "data/region_counties.json"
            elif item.__class__ == CityCouncilCandidate:
                filename = "data/city_council_candidates.json"
            elif item.__class__ == CityCounty:
                filename = "data/city_counties.json"
            exporter_name = item.__class__.__name__
            if exporter_name not in self.exporters:
                self.exporters[exporter_name] = self.create_exporter(filename)
            self.exporters[exporter_name].export_item(item)
        return item