def load_table(table, source="default/test-0.jsonlines", modifier="",
               dblogin="******", as_name=None):
    filename = source if as_name is None else as_name
    dbname, collection_name = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collection_name)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        result = e.details
        errs = set()
        # Open in binary append mode: the exporter writes bytes.
        with open("%s.%s" % (filename, "errors"), "ab") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            for err in result.get("writeErrors"):
                if err.get("op").get("_id") not in errs:
                    obj = dict(item=err.get("op"), error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
    return result
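# Hedged sketch: load_table above relies on three helpers that are not shown
# in this snippet (parse_path, set_id, odicts). These minimal stand-ins
# illustrate one plausible contract; the real project versions may differ.
import hashlib
import json
from collections import OrderedDict

def parse_path(filename, modifier=""):
    """Split 'dir/name-0.jsonlines' into a (db, collection) pair (assumed)."""
    dbname, _, rest = filename.partition("/")
    collection = rest.split(".")[0] + modifier
    return dbname, collection

def set_id(obj):
    """Give each document a deterministic _id so retries dedupe (assumed)."""
    obj.setdefault("_id", hashlib.sha1(
        json.dumps(obj, sort_keys=True, default=str).encode()).hexdigest())
    return obj

def odicts(table):
    """Yield each table row as an ordered dict (assumes rows are mappings)."""
    for row in table:
        yield OrderedDict(row)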
class KinoPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Binary mode cannot take an encoding argument; pass encoding and
        # ensure_ascii to the exporter instead. indent=4 is omitted because
        # pretty-printing would break the one-object-per-line format.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
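# Hedged usage sketch: a pipeline like KinoPipeline only runs if registered
# under Scrapy's ITEM_PIPELINES setting. The module path 'myproject.pipelines'
# is an assumption; substitute your own project package.
ITEM_PIPELINES = {
    'myproject.pipelines.KinoPipeline': 300,
}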
def export_item(self, item):
    # Open in binary mode: JsonLinesItemExporter writes bytes, so text
    # mode "w" would raise a TypeError.
    storage_file = open(self.item_storage_path(item["id"]), "wb")
    item_exporter = JsonLinesItemExporter(storage_file)
    item_exporter.start_exporting()
    item_exporter.export_item(item)
    item_exporter.finish_exporting()
    storage_file.close()
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MedPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)  # was "expoter"
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = file_json_lines
        self.exporter = JsonLinesItemExporter(file_json_lines)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class QiushiPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # A file created on Dec 20, 2015 will be named "12-20-2015.json"
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print item['author']
        # print item['title']
        # print item['content']
        # print item['href']
        return item
class EduSpider(scrapy.Spider):
    """Used to scrape .edu websites for web technology statistics."""
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = ('https://domaintyper.com/top-websites/'
                   'most-popular-websites-with-edu-domain/page/')
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'
        self.start_urls = [baseurl + str(i) for i in range(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))
        # Start exporting once, here, rather than on every parsed response.
        self.exporter.start_exporting()

    def parse(self, response):
        urls = response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()
        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0]
        # Fill CSS fields
        data['css_paths'] = response.xpath('//link[@rel="stylesheet"]/@href').extract()
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]
        # Fill JS fields
        data['js_paths'] = response.xpath('//script/@src').extract()
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]
        # Fill tag fields
        tc.feed(response.text)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest
        self.exporter.export_item(data)
        yield data

    def closed(self, reason):
        # scrapy.Spider defines no __del__ to delegate to; the closed() hook
        # is the reliable place to finish exporting.
        self.exporter.finish_exporting()
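# Hedged sketch: TagCounter is not defined in this snippet. A minimal
# stand-in based on html.parser that tracks total tags, non-void tags, and
# the deepest nesting level; the real class may count differently.
from html.parser import HTMLParser

VOID_TAGS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
             'link', 'meta', 'param', 'source', 'track', 'wbr'}

class TagCounter(HTMLParser):
    def __init__(self):
        super().__init__()
        self.tagcount = 0
        self.nonvoid_tagcount = 0
        self.topnest = 0
        self._depth = 0

    def handle_starttag(self, tag, attrs):
        self.tagcount += 1
        if tag not in VOID_TAGS:
            self.nonvoid_tagcount += 1
            self._depth += 1
            self.topnest = max(self.topnest, self._depth)

    def handle_endtag(self, tag):
        if tag not in VOID_TAGS:
            self._depth = max(0, self._depth - 1)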
class WxappPipeline(object):
    def __init__(self):
        # Open the file before the spider starts (created if missing).
        self.fp = open('wxapp_test2.json', 'wb')
        # Create the exporter for the output file.
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started...')

    def process_item(self, item, spider):
        # Export the item as one JSON line.
        self.exporter.export_item(item)
        # There may be several pipelines; if this one does not return the
        # item, the pipelines after it never receive it.
        return item

    def close_spider(self, spider):
        print('Spider finished...')
        # The spider is done; close the file.
        self.fp.close()
class HouseRedisPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('new_house.json', 'wb')
        self.erhouse_fp = open('er_house.json', 'wb')
        # Save in JSON format.
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.erhouse_exporter = JsonLinesItemExporter(self.erhouse_fp,
                                                      ensure_ascii=False)

    def process_item(self, item, spider):
        # Note: every item is written to both files as written; filter by
        # item type here if the two feeds should differ.
        self.newhouse_exporter.export_item(item)
        self.erhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.erhouse_fp.close()
class DuanziPipeline(object):
    # Open duanzi.json in the initializer.
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started')

    # Write the item into the JSON file.
    def process_item(self, item, spider):
        # No manual format conversion needed; the exporter handles it.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished')
class QsbkPipeline(object):
    def __init__(self):
        # JsonLinesItemExporter requires the file to be opened in binary mode.
        # Note: a binary file takes no encoding argument at open(); a text-mode
        # file would need one instead.
        self.fp = open('duanzi.json', 'wb')
        # Create an exporter.
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started...')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished.')
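# Hedged demo of what JsonLinesItemExporter actually writes: one JSON object
# per line, as UTF-8 bytes. Runnable standalone against an in-memory buffer;
# the sample field names are illustrative.
from io import BytesIO
from scrapy.exporters import JsonLinesItemExporter

buf = BytesIO()
exporter = JsonLinesItemExporter(buf, ensure_ascii=False, encoding='utf-8')
exporter.start_exporting()
exporter.export_item({'author': '张三', 'content': 'duanzi text'})
exporter.export_item({'author': '李四', 'content': 'another one'})
exporter.finish_exporting()
print(buf.getvalue().decode('utf-8'))
# {"author": "张三", "content": "duanzi text"}
# {"author": "李四", "content": "another one"}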
class EsfsalePipeline(object):
    def __init__(self):
        self.path = PROJECT_PATH
        self.json = open(os.path.join(
            self.path,
            'Esfsale{}.json'.format(datetime.now().strftime('%Y-%m-%d'))), 'ab')
        self.json_exporter = JsonLinesItemExporter(self.json, ensure_ascii=False,
                                                   encoding='utf-8')
        self.csv = open(os.path.join(
            self.path,
            'Esfsale{}.csv'.format(datetime.now().strftime('%Y-%m-%d'))), 'ab')
        self.csv_exporter = CsvItemExporter(self.csv, encoding='utf-8')

    def open_spider(self, spider):
        print("Spider started")

    def process_item(self, item, spider):
        self.json_exporter.export_item(item)
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.json.close()
        self.csv.close()
        print("Spider finished")
class AnzSpiderPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        return pipeline

    def process_item(self, item, spider):
        # Note: this opens, wraps, and closes the file for every item, so the
        # output is only valid JSON when the spider yields a single item.
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        file.write(b'{"anzbank":')  # binary file: write bytes, not str
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()
        self.exporter.export_item(item)
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.write(b"}")
        file.close()
        return item
class ModulePipeline(object):
    def __init__(self):
        self.fp = open("sogou.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')
        # self.exporter.start_exporting()

    def open_spider(self, spider):
        print("start!!")

    def process_item(self, item, spider):
        if item['name']:
            self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # self.exporter.finish_exporting()
        self.fp.close()
        print("end!!")
class News163JsonPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        name = '{0}_{1}'.format(spider.name, str(datetime.now()).replace(':', '-'))
        self.file = open('{0}_products.json'.format(name), 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()  # pair with finish_exporting below

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class FangtianxiaPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        # Note: every item lands in both files as written; add an isinstance
        # check (as in HousePipeline below) if the two feeds should differ.
        self.newhouse_exporter.export_item(item)
        self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
class JsonExportPipeline(object):
    def __init__(self, settings):
        self.save_file = open(
            os.path.join(settings.get("RESULT_PATH"), "result.json"), "wb")
        self.exporter = JsonLinesItemExporter(self.save_file, encoding="utf8",
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(settings)

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.save_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class HousePipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False,
                                                       encoding='utf-8')
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False,
                                                       encoding='utf-8')

    def process_item(self, item, spider):
        if isinstance(item, HouseItem):
            self.newhouse_exporter.export_item(item)
        if isinstance(item, esfHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
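# Hedged sketch: HouseItem and esfHouseItem are defined elsewhere in the
# project. The field names below are illustrative assumptions; the isinstance
# routing above only needs two distinct Item subclasses.
import scrapy

class HouseItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()

class esfHouseItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()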
class WeiboPipeline(object):
    def __init__(self):
        self.comments_fp = open("comments.json", "wb")
        self.people_fp = open('people.json', 'wb')
        self.statuses_fp = open('statuses.json', 'wb')
        self.comments_exporter = JsonLinesItemExporter(self.comments_fp,
                                                       ensure_ascii=False)
        self.people_exporter = JsonLinesItemExporter(self.people_fp,
                                                     ensure_ascii=False)
        self.statuses_exporter = JsonLinesItemExporter(self.statuses_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        if isinstance(item, CommentItem):
            self.comments_exporter.export_item(item)
        elif isinstance(item, PeopleItem):
            self.people_exporter.export_item(item)
        else:
            self.statuses_exporter.export_item(item)
        return item

    def close_spider(self, spider):  # was close_item, which Scrapy never calls
        print("Saved successfully!")
        self.comments_fp.close()
        self.people_fp.close()
        self.statuses_fp.close()
class ZhihuCrawlerPipeline(object):
    def __init__(self):
        self.fp1 = open('./data/answers.txt', 'wb')
        self.fp2 = open('./data/users_v2.txt', 'wb')
        self.fp3 = open('./data/questions.txt', 'wb')
        self.exporter1 = JsonLinesItemExporter(self.fp1, ensure_ascii=False,
                                               encoding='utf-8')
        self.exporter2 = JsonLinesItemExporter(self.fp2, ensure_ascii=False,
                                               encoding='utf-8')
        self.exporter3 = JsonLinesItemExporter(self.fp3, ensure_ascii=False,
                                               encoding='utf-8')

    def open_spider(self, spider):
        print('spider has opened.')

    def process_item(self, item, spider):
        if isinstance(item, ZhihuAnswerItem):
            self.exporter1.export_item(item)
        elif isinstance(item, ZhihuUserItem):
            self.exporter2.export_item(item)
        elif isinstance(item, ZhihuQuestionItem):
            self.exporter3.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        self.exporter3.finish_exporting()
        self.fp1.close()
        self.fp2.close()
        self.fp3.close()
        print('spider has closed.')
class NewsPipeline(object):
    def __init__(self):
        self.fp_article = open('article.json', 'wb')
        self.fp_author = open('author.json', 'wb')
        self.fp_navItem = open('navItem.json', 'wb')
        self.exporter_article = JsonLinesItemExporter(self.fp_article,
                                                      ensure_ascii=False,
                                                      encoding='utf-8')
        self.exporter_author = JsonLinesItemExporter(self.fp_author,
                                                     ensure_ascii=False,
                                                     encoding='utf-8')
        self.exporter_navItem = JsonLinesItemExporter(self.fp_navItem,
                                                      ensure_ascii=False,
                                                      encoding='utf-8')

    def process_item(self, item, spider):
        if isinstance(item, ArticleItem):
            self.exporter_article.export_item(item)
        if isinstance(item, AuthorItem):
            self.exporter_author.export_item(item)
        if isinstance(item, NavItem):
            self.exporter_navItem.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp_article.close()
        self.fp_author.close()
        self.fp_navItem.close()
class SfwPipeline(object):
    def __init__(self):
        self.fp_new_house = open('new_house.json', 'wb')
        self.fp_esf_house = open('esf_house.json', 'wb')
        self.exporter_new_house = JsonLinesItemExporter(self.fp_new_house,
                                                        ensure_ascii=False)
        self.exporter_esf_house = JsonLinesItemExporter(self.fp_esf_house,
                                                        ensure_ascii=False)

    def process_item(self, item, spider):
        if isinstance(item, NewHouseItem):
            print('Writing one new-house record')
            self.exporter_new_house.export_item(item)
        else:
            print('Writing one second-hand-house record')
            self.exporter_esf_house.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp_new_house.close()
        self.fp_esf_house.close()
def _make_fileobj(self):
    """
    Build file object from items.
    """
    bio = BytesIO()
    f = gzip.GzipFile(mode='wb', fileobj=bio) if self.use_gzip else bio

    # Build file object using ItemExporter
    exporter = JsonLinesItemExporter(f, encoding='utf-8')
    exporter.start_exporting()
    for item in self.items:
        exporter.export_item(item)
    exporter.finish_exporting()

    if f is not bio:
        f.close()  # Close the GzipFile to flush the compressed stream

    # Seek to the top of file to be read later
    bio.seek(0)
    return bio
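# Hedged usage sketch: reading back the buffer _make_fileobj returns. The
# Batch holder class is a stand-in; only the `items` and `use_gzip`
# attributes that the method reads are assumed.
import gzip
import json

class Batch:
    use_gzip = True
    items = [{'a': 1}, {'a': 2}]
    _make_fileobj = _make_fileobj  # reuse the function above as a method

bio = Batch()._make_fileobj()
with gzip.open(bio, 'rt', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line))  # {'a': 1} then {'a': 2}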
class FangPipeline(object):
    def __init__(self):
        self.xf = open('xf.json', 'wb')
        self.es = open('es.json', 'wb')
        self.xf_ex = JsonLinesItemExporter(self.xf, ensure_ascii=False,
                                           encoding='utf-8')
        self.es_ex = JsonLinesItemExporter(self.es, ensure_ascii=False,
                                           encoding='utf-8')

    def process_item(self, item, spider):
        if item['xf_or_es'] == 'xf':
            self.xf_ex.export_item(item)
        elif item['xf_or_es'] == 'es':
            self.es_ex.export_item(item)
        return item

    def close_spider(self, spider):  # was close(), which Scrapy never calls
        self.xf.close()
        self.es.close()
class JsonLinesExportPipeline(object):
    nbLines = 0
    nbFiles = 0

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def _open_file(self, spider):
        i = datetime.now()
        file = open(
            '%s_items_%s_%s.json' % (spider.name, self.nbFiles,
                                     i.strftime('%Y-%m-%dT%H-%M-%S')), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_opened(self, spider):
        self._open_file(spider)

    def process_item(self, item, spider):
        # Rotate to a new file every 10000 lines; close the previous file
        # first (the original version leaked it and skipped start_exporting).
        if self.nbLines >= 10000:
            self.exporter.finish_exporting()
            self.files.pop(spider).close()
            self.nbFiles += 1
            self.nbLines = 0
            self._open_file(spider)
        self.nbLines += 1
        self.exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
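# Hedged alternative: recent Scrapy versions (2.3+) can rotate feed files
# natively, without a custom pipeline, via batched feed exports. A settings.py
# sketch; the output path template is illustrative.
FEEDS = {
    'items_%(batch_id)d.jsonl': {'format': 'jsonlines'},
}
FEED_EXPORT_BATCH_ITEM_COUNT = 10000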
class WxappPipeline:
    def __init__(self):
        self.file = open('wxapp.json', 'wb')
        self.export = JsonLinesItemExporter(self.file, ensure_ascii=False,
                                            encoding='utf-8')

    def open_spider(self, spider):
        print("Spider started...")

    def process_item(self, item, spider):
        print("Saving...")
        self.export.export_item(item)
        return item

    def close_spider(self, spider):
        print("Spider finished...")
        self.file.close()
class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()


# Alternative using JsonItemExporter (writes one JSON array instead of one
# object per line):
#
# from scrapy.exporters import JsonItemExporter
#
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False,
#                                          encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         pass
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
class PowerMarketPipeline:
    def __init__(self):
        pass

    def open_spider(self, spiders):
        print("------ TablePipeline start ------")

    def process_item(self, item, spider):
        if isinstance(item, CurrentItem):
            self.fp = open(item['rename'], 'wb')
            self.exporters = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                                   encoding='utf-8')
            self.exporters.export_item(item)
            self.fp.close()
        else:
            Type = str(type(item))
            print('------ Warning, PowerMarketPipeline: ItemType(' + Type +
                  ') match failed, so skip it. ------')
        return item

    def close_spider(self, spider):
        print("------ TablePipeline end ------")
class FangPipeline:
    def __init__(self):
        self.fp_new = open("new.json", 'wb')
        self.exporter_new = JsonLinesItemExporter(self.fp_new, ensure_ascii=False,
                                                  encoding='utf-8')
        self.fp_old = open('old.json', 'wb')
        self.exporter_old = JsonLinesItemExporter(self.fp_old, ensure_ascii=False,
                                                  encoding='utf-8')

    def process_item(self, item, spider):
        old_item = item.get('old')
        new_item = item.get('new')
        if new_item:
            self.exporter_new.export_item(item)
        if old_item:
            self.exporter_old.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp_new.close()
        self.fp_old.close()
class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        # Items with a 'sale' field are new houses; anything else is treated
        # as a second-hand listing.
        try:
            item['sale']
            self.newhouse_exporter.export_item(item)
        except KeyError:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
class NetlendingPipeline:
    def __init__(self):
        # 'wb' opens the file in binary mode.
        self.fp = open("comments.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print("Spider started...")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def process_info(self, info):
        # Strip tabs, newlines and whitespace, then drop empty strings.
        info = [re.sub(r"\t|\n|\s", "", i) for i in info]
        info = [i for i in info if len(i) > 0]
        return info

    def close_spider(self, spider):
        self.fp.close()
        print("Spider finished...")
class QsbkPipeline(object):
    # Upside: each export_item call writes straight to disk, so nothing
    # accumulates in memory. Downside: the output is JSON lines, not a
    # single valid JSON document.
    def __init__(self):
        # 'wb' opens in binary mode, since the exporter writes bytes.
        self.fp = open("budejie.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding="utf-8")

    def open_spider(self, spider):
        """Runs as soon as the spider opens."""
        print('Spider started...')

    def process_item(self, item, spider):
        """Save the data."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Runs when the spider closes."""
        # Close the file.
        self.fp.close()
        print("Spider finished...")
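# Hedged sketch: consuming the JSON-lines file the pipeline above produces.
# Because each line is an independent JSON object, the file can be streamed
# without loading it all at once. 'budejie.json' matches the pipeline above.
import json

with open("budejie.json", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # process one record at a time
        print(record)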
class OKCupidJsonPipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class QsbkPipeline(object):
    def __init__(self):
        self.f = open("qsbk.json", "wb")
        self.exporter = JsonLinesItemExporter(self.f, ensure_ascii=False,
                                              encoding='utf-8')
        self.start_time = datetime.datetime.now()

    def open_spider(self, spider):
        print("[{}] Started scraping".format(self.start_time))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.f.close()
        end_time = datetime.datetime.now()
        print("Scraping finished, total time: {}".format(end_time - self.start_time))


# Manual alternative without an exporter: serialize each item yourself.
# class QsbkPipeline(object):
#     def __init__(self):
#         self.f = open("qsbk.json", "w", encoding='utf-8')
#
#     def open_spider(self, spider):
#         print("Started scraping")
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.f.write(item_json + "\n")
#         return item
#
#     def close_spider(self, spider):
#         self.f.close()
#         print("Scraping finished")
class SpiderPipeline(object):
    # The output file is opened when the pipeline is created; this could
    # also live in open_spider.
    def __init__(self):
        # The exporter writes bytes, so the file must be opened 'wb';
        # binary mode takes no encoding argument.
        self.fp = open("saveText.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')
        # start_exporting is not required for JsonLinesItemExporter.
        # self.exporter.start_exporting()

    # Called once the spider has opened.
    def open_spider(self, spider):
        print("Spider started......")

    # Called for each item passed through the pipeline.
    def process_item(self, item, spider):
        # The exporter replaces the manual json.dumps(dict(item),
        # ensure_ascii=False) + write() approach.
        self.exporter.export_item(item)
        return item

    # Called when the spider finishes.
    def close_spider(self, spider):
        # finish_exporting is likewise unnecessary here.
        # self.exporter.finish_exporting()
        self.fp.close()
        print("Spider finished......")
class HuabanPipeline(object):
    def __init__(self):
        """Open files to save the exported items."""
        # save info of BoardItem
        self.board_info = open(
            'D:/litreily/Pictures/python/huaban/boards.json', 'w+b')
        self.board_exporter = JsonItemExporter(self.board_info,
                                               encoding='utf-8', indent=4)
        # save info of PinItem; no indent here, since pretty-printing would
        # spread each item over several lines and break the JSON-lines format
        self.pin_info = open('D:/litreily/Pictures/python/huaban/pins.json',
                             'w+b')
        self.pin_exporter = JsonLinesItemExporter(self.pin_info,
                                                  encoding='utf-8')

    def open_spider(self, spider):
        """Start exporting BoardItem and PinItem."""
        self.board_exporter.start_exporting()
        self.pin_exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BoardItem):
            self.board_exporter.export_item(item)
        elif isinstance(item, PinItem):
            self.pin_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish exporting and close files."""
        self.board_exporter.finish_exporting()
        self.pin_exporter.finish_exporting()
        self.board_info.close()
        self.pin_info.close()
class JsonExportPipeline(object):
    def __init__(self):
        self.file_name = 'svet_androida_links_to_articles.json'
        self.file_handle = None

    def open_spider(self, spider):
        print('JsonExportPipeline Exporter opened')
        file = open(self.file_name, 'wb')
        self.file_handle = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        print('JsonExportPipeline Exporter closed')
        self.exporter.finish_exporting()
        self.file_handle.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ScrapyGraphExport(object):
    # Open the export file and wire up the spider signals.
    def __init__(self):
        dispatcher.connect(self.response_received, signal=signals.response_received)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        self.output = {}
        self.file = open("/tmp/rawData_1.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8',
                                              ensure_ascii=False)

    def spider_opened(self, spider):
        print("Spider opened")

    def response_received(self, response, request, spider):
        # HtmlXPathSelector and urljoin_rfc are legacy Scrapy APIs; modern
        # code would use response.xpath and urllib.parse.urljoin instead.
        hxs = HtmlXPathSelector(response)
        i = torItem()
        i['url'] = response.url
        i['http_status'] = response.status
        llinks = []
        for anchor in hxs.select('//a[@href]'):
            href = anchor.select('@href').extract()[0]
            if not href.lower().startswith("javascript"):
                llinks.append(urljoin_rfc(response.url, href))
        i['linkedurls'] = llinks
        if 'Referer' in request.headers:  # dict.has_key is Python 2 only
            i['referer'] = request.headers['Referer']
        self.exporter.export_item(i)
        return i

    def spider_closed(self, spider):
        self.file.close()
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""

    def __init__(self):
        self.file = None
        self.exporter = None
        # compile regular expressions:
        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")
        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish exporting, then close the export file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitation, catch the exception and
            # export the url leading to the specific resolution for later
            # (human) review
            self.exporter.export_item({'error': repr(ex),
                                       'url': item["url"]})
        else:
            self.exporter.export_item(data)
        return item

    # the following are field-specific handling functions,
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data
    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError("Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(
            item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title field is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # body is originally a list of lines and is intentionally not joined
        # or stripped: some resolutions have custom css, tables, and other
        # crap which is better processed in a later stage, unrelated to the
        # scraper
        return item["body"]
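# Hedged check of gov_pattern against the sample input quoted in the
# comments above; expected groups are gov_number='34' and the prime
# minister's name.
import re

gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')
m = gov_pattern.search('ממשלה/הממשלה ה - 34 בנימין נתניהו;')
print(m.group('gov_number'))  # 34
print(m.group('pm_name'))     # בנימין נתניהו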