class CardPipeline(object):
    """Validates weibo card items, normalizes their fields, and exports
    them as JSON lines, one output file per spider."""

    def __init__(self):
        # Maps spider -> open output file so close_spider can release it.
        self.files = {}

    def open_spider(self, spider):
        file = open('json/{}_products.json'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # Drop items missing any required field (bare `raise DropItem`
        # carried no message; a Py2 debug `print` statement was removed).
        if not item['wb_nick'] or not item['wb_location'] or not item['wb_images']:
            raise DropItem("missing wb_nick, wb_location or wb_images")
        item['wb_content'] = ''.join(item['wb_content'])
        item['wb_date'] = item['wb_date'][0]
        item['wb_location'] = item['wb_location'][0]
        # Rewrite thumbnail/square image URLs to their large variants.
        images_urls = item.pop('wb_images')
        item['wb_images'] = [
            url.replace('thumbnail', 'large').replace('square', 'large')
            for url in images_urls
        ]
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
def spider_opened(self, spider):
    # Open one JSON-lines output file per configured writer name and
    # start an exporter for each.
    for i in self.JSONWriters.values():
        file = open('%s_out.json' % i, 'w+b')
        # NOTE(review): this overwrites self.files[spider] on every
        # iteration, so only the last handle is retained for later
        # cleanup -- the earlier file handles leak. Verify intent.
        self.files[spider] = file
        exporter = JsonLinesItemExporter(file)
        self.exporters[i] = exporter
        exporter.start_exporting()
    print(self.exporters)
def spider_opened(self, spider):
    """Open today's dump file (e.g. "12-20-2015.json") and begin a
    JSON-lines export session on it."""
    out = open('scraped_data/{}.json'.format(date.today().strftime("%m-%d-%Y")), 'w+b')
    self.files[spider] = out
    self.exporter = JsonLinesItemExporter(out, ensure_ascii=False)
    self.exporter.start_exporting()
def __init__(self):
    # Spider setup: seed the ranking start URLs and open the JSON-lines log.
    scrapy.Spider.__init__(self)
    baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
    logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'
    # Pages 1..29 of the ranking (xrange => this is Python 2 code).
    self.start_urls = [baseurl + str(i) for i in xrange(1, 30)]
    self.domain = 'domaintyper.com'
    # NOTE(review): the file handle passed here is never stored or
    # explicitly closed; it lives for the exporter's lifetime.
    self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))
def load_table(table, source="default/test-0.jsonlines",
               modifier="", dblogin="******",
               as_name=None):
    """Bulk-insert `table` rows into the MongoDB collection derived from
    `source`/`as_name` via parse_path().

    Returns the pymongo InsertManyResult on success, or the
    BulkWriteError details dict on partial failure (in which case the
    failed ops are also logged to '<filename>.errors' as JSON lines).
    """
    filename = source if as_name is None else as_name
    dbname, collectioname = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collectioname)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        # BUG FIX: the error-logging pass only makes sense here, where
        # `result` is the error-details dict -- on the success path
        # `result` is an InsertManyResult and `.get(...)` would raise
        # AttributeError. Also guard writeErrors being absent, and drop
        # the redundant f.close() inside the `with` block.
        result = e.details
        errs = set()
        with open("%s.%s" % (filename, "errors"), "a") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            # Log each distinct failed op once (deduped by _id).
            for err in result.get("writeErrors") or []:
                op_id = err.get("op").get("_id")
                if op_id not in errs:
                    errs.add(op_id)
                    exporter.export_item(dict(item=err.get("op"),
                                              error=err.get("errmsg")))
            exporter.finish_exporting()
    return result
def export_item(self, item):
    """Write a single item to its own JSON-lines storage file."""
    # BUG FIX: `with` guarantees the file is closed even if the
    # exporter raises (the original leaked the handle on error).
    with open(self.item_storage_path(item["id"]), "w") as storage_file:
        item_exporter = JsonLinesItemExporter(storage_file)
        item_exporter.start_exporting()
        item_exporter.export_item(item)
        item_exporter.finish_exporting()
class JsonExport(object):
    """Pipeline exporting items as JSON lines to a configurable path.

    The output directory comes from the spider's EXPORTER_PATH setting
    when present, otherwise from the module-level EXPORTER_PATH.
    """

    def open_spider(self, spider):
        # Prefer the spider's EXPORTER_PATH setting; fall back to defaults.
        if spider and spider.settings.get('EXPORTER_PATH'):
            path = spider.settings.get('EXPORTER_PATH')
            file_name = spider.name + '.json'
        else:
            path = EXPORTER_PATH
            file_name = 'questions.json'
        self._file = open(path + file_name, 'w+b')
        self._exporter = JsonLinesItemExporter(self._file)
        # FIX: bracket the export session per the exporter contract.
        self._exporter.start_exporting()

    def process_item(self, item, spider):
        self._exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # FIX: flush the exporter before closing its file.
        self._exporter.finish_exporting()
        self._file.close()
class DuanziPipleline:
    """Pipeline saving duanzi items to duanzi.json as JSON lines."""

    def __init__(self):
        # Binary mode: JsonLinesItemExporter writes encoded bytes.
        self.fp = open("duanzi.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print("start....")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # FIX: balance start_exporting() before closing the file
        # (the original never called finish_exporting).
        self.exporter.finish_exporting()
        self.fp.close()
        print("over....")
def process_item(self, item, spider):
    """Append the item to the JSON-lines file named by item['filename'].

    The 'filename' key is routing metadata, not payload, so it is
    removed before export.
    """
    filename = item['filename']
    del item['filename']
    # BUG FIX: the original opened the file without ever closing it,
    # leaking one handle per item. Append mode preserved.
    with open(filename, "ab") as f:
        JsonLinesItemExporter(f).export_item(item)
    return item
class Scrapy01Pipeline(object):
    """Dumps scraped items into qsbk.json as JSON lines."""

    def __init__(self):
        print("__init__")
        # Must be binary mode: the exporter writes encoded bytes.
        self.ft = open("./Scrapy01/files/qsbk.json", "wb")
        self.exporter = JsonLinesItemExporter(self.ft, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("open_spider")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.ft.close()
        print("close_spider")
class WechatAppPipeline(object):
    """Exports wechat items to wechat.json as JSON lines."""

    def __init__(self):
        self.json = open('wechat.json', 'wb')
        # JSON-lines writer; keep non-ASCII characters readable.
        self.json_export = JsonLinesItemExporter(self.json, ensure_ascii=False, encoding='utf-8')

    def run(self):
        print("爬虫开始")

    def process_item(self, item, spider):
        self.json_export.export_item(item)
        return item

    def close_spider(self, spider):
        # BUG FIX: Scrapy invokes close_spider(spider); the original
        # signature `close_spider(self)` raised TypeError, so the file
        # was never closed.
        self.json.close()
        print("爬虫结束")
class NewHousePipeline(object):
    """Splits items by their 'house_style' field into two JSON files."""

    def __init__(self):
        self.newhouse_fp = open("newhouse.json", 'wb')
        self.esf_fp = open("esf.json", 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esf_exporter = JsonLinesItemExporter(self.esf_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Route on the item's declared style.
        style = item.get("house_style")
        if style == "newhouse":
            self.newhouse_exporter.export_item(item)
        elif style == "esf":
            self.esf_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esf_fp.close()
def process_item(self, item, spider):
    '''Save item info to local file'''
    if isinstance(item, VmgirlsItem):
        # NOTE(review): the file is re-opened with 'w+b' (truncating)
        # for every VmgirlsItem, so only the last item's themes survive
        # if more than one arrives -- confirm a single item is expected.
        self.girls_info = open(
            os.path.join(self.user_data_dir, 'vmgirls.json'), 'w+b')
        # NOTE(review): indent=4 makes each exported object span several
        # lines, which defeats the one-object-per-line JSON-lines
        # format -- verify this is intentional.
        self.girls_exporter = JsonLinesItemExporter(self.girls_info,
                                                    encoding='utf-8', indent=4)
        self.girls_exporter.start_exporting()
        # Pair each theme URL with its title and export one record each.
        for url, title in zip(item['theme_urls'], item['theme_titles']):
            single_item = {'theme_url': url, 'title': title}
            self.girls_exporter.export_item(single_item)
        self.girls_exporter.finish_exporting()
        self.girls_info.close()
    return item
class BaiducrawlerPipeline:
    """Streams crawled records into data.json (JSON lines)."""

    def __init__(self):
        self.fp = open("data.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("爬虫开始了")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        print("存了")
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了")
class HexunPipeline(object):
    """Writes article-URL items to a timestamped JSON-lines file."""

    def __init__(self):
        # File name carries the start timestamp, e.g. "文章网址2020-01-01 12-00-00.json".
        stamp = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
        self.fp = open("文章网址" + stamp + ".json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)

    def open_spider(self, spider):
        print("=====爬虫开始力=====")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
        print("=====爬虫结束力=====")
class WxappPipeline:
    """Pipeline writing articles to article.json as JSON lines."""

    # Called when the spider is opened.
    def open_spider(self, spider):
        print("爬虫开始执行。。。")
        fileName = "article.json"
        # Binary mode is required by JsonLinesItemExporter.
        self.fp = open(fileName, "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding="utf-8")

    # Called for every item yielded by the spider.
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # Called when the spider is closed.
    def close_spider(self, spider):
        # BUG FIX: the original never closed the file, risking
        # unflushed buffered output on shutdown.
        self.fp.close()
        print("爬虫执行结束")
def __init__(self, user_data_dir):
    '''Open files to save the exported Items.'''
    self.user_data_dir = user_data_dir
    if not os.path.isdir(self.user_data_dir):
        os.makedirs(self.user_data_dir)
    # save info of BoardItem
    # BUG FIX: os.path.join instead of string concatenation, which
    # produced e.g. 'fooboards.json' when the dir had no trailing slash.
    self.board_info = open(os.path.join(self.user_data_dir, 'boards.json'), 'w+b')
    self.board_exporter = JsonItemExporter(self.board_info, encoding='utf-8', indent=4)
    # save info of PinItem
    self.pin_info = open(os.path.join(self.user_data_dir, 'pins.json'), 'w+b')
    self.pin_exporter = JsonLinesItemExporter(self.pin_info, encoding='utf-8', indent=4)
class JrsjPipeline:
    """Writes news items to xinwenli.json as JSON lines."""

    def __init__(self):
        # Binary mode: the exporter emits encoded bytes.
        self.fp = open("xinwenli.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('爬虫开始了...')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('爬虫结束了')
def spider_opened(self, spider):
    # Ensure the output directory exists, then start a fresh
    # JSON-lines export file for this spider.
    if not os.path.isdir(self.outputs):
        os.mkdir(self.outputs)
    path = os.path.join(self.outputs, f'{spider.name}.json')
    # Delete any previous run's file so the 'a+b' open below starts empty.
    if os.path.isfile(path):
        os.unlink(path)
    # NOTE(review): the handle is never stored, so nothing can close it
    # when the spider finishes -- it leaks until GC. Verify cleanup.
    file = open(path, 'a+b')
    self.exporters[spider.name] = JsonLinesItemExporter(file)
    self.exporters[spider.name].start_exporting()
class SohousePipeline(object):
    """Routes new-house and second-hand (esf) items to separate JSON files."""

    def __init__(self):
        # BUG FIX: the new-house filename was misspelled 'newhouse.jsno'.
        self.file1 = open('D:/newhouse.json', 'wb')
        self.file2 = open("D:/esfhouse.json", 'wb')
        self.expoter1 = JsonLinesItemExporter(self.file1, ensure_ascii=False)
        self.expoter2 = JsonLinesItemExporter(self.file2, ensure_ascii=False)

    def process_item(self, item, spider):
        # 区分item -- route by the item's concrete class.
        if isinstance(item, ESFitem):
            self.expoter2.export_item(item)
        if isinstance(item, NewHouseItem):
            self.expoter1.export_item(item)
        return item

    def close_spider(self, spider):
        self.file1.close()
        self.file2.close()
class AmazoncleaningPipeline(object):
    """Exports cleaned Amazon records to results.jl as JSON lines."""

    def __init__(self):
        self.fp = open("results.jl", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # FIX: removed a no-op `self.fp.write(b'')` that wrote zero bytes.
        return item

    def close_spider(self, spider):
        self.fp.close()
def spider_opened(self, spider):
    """Open pretty-print and JSON-lines output files for this spider
    and start both exporters."""
    pprint_file = open('%s_pprint-items0' % spider.name, 'w+b')
    jsonl_file = open('%s_json-items0' % spider.name, 'w+b')
    self.jsl_exporter = JsonLinesItemExporter(jsonl_file)
    self.pprnt_exporter = PprintItemExporter(pprint_file)
    # Track both handles for cleanup at spider close.
    self.files[spider] = [pprint_file, jsonl_file]
    self.pprnt_exporter.indent = 2
    self.pprnt_exporter.start_exporting()
    self.jsl_exporter.start_exporting()
class LianjiaPipeline(object):
    """Splits Lianjia items into esfhouse.json and zfhouse.json."""

    def __init__(self):
        self.esfhouse_fp = open("esfhouse.json", "wb")
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)
        self.zfhouse_fp = open("zfhouse.json", "wb")
        self.zfhouse_exporter = JsonLinesItemExporter(self.zfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Second-hand (esf) items go to one file, everything else to zf.
        target = self.esfhouse_exporter if isinstance(item, EsfItem) else self.zfhouse_exporter
        target.export_item(item)
        return item

    def close_spider(self, spider):
        self.esfhouse_fp.close()
        self.zfhouse_fp.close()
class CapterraPipeline(object):
    """Exports CapterraItem objects to capterra.json as JSON lines."""

    def __init__(self):
        self.fp = open("capterra.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("CapterraItem爬虫开始了!")

    def process_item(self, item, spider):
        if isinstance(item, CapterraItem):
            self.exporter.export_item(item)
        # BUG FIX: always return the item; the original returned None
        # for non-CapterraItem items, silently dropping them for any
        # later pipeline stage. Dead commented-out code removed.
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("CapterraItem爬虫结束了!")
class QsbkPipeline(object):
    """Persists duanzi items to duanzi.json in JSON-lines form."""

    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('爬虫开始')

    def close_spider(self, spider):
        self.fp.close()
        print('爬虫结束')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def process_item(self, item, spider):
    """Export the item to '<listing id>.jl' and move the file into
    self.feed_path."""
    filename = '{}.jl'.format(item['listing'][0]['id'])
    # Write the single item with only the whitelisted fields.
    with open(filename, 'wb') as out:
        exporter = JsonLinesItemExporter(
            out, fields_to_export=['listing', 'trovokasa'])
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
    # The file lands next to the project root; relocate it to the feed dir.
    src = pathlib.Path(__file__).parents[1].joinpath(filename)
    src.rename(self.feed_path.joinpath(filename))
    return item
class FangTianXiaScrapyPipeline(object):
    """Appends new-house and second-hand items to their own JSON files."""

    def __init__(self):
        # Append mode: keep data from previous runs.
        self.newhouse_fp = open('newhouse.json', 'ab')
        self.esfhouse_fp = open('esfhouse.json', 'ab')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, EsfHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        for fp in (self.newhouse_fp, self.esfhouse_fp):
            fp.close()
def _exporter_for_item(self, item):
    """Return (creating lazily if needed) the JSON-lines exporter for
    this item's topic/location unit."""
    search_topic = item["search_topic"]
    search_location = item["search_location"]
    # NOTE(review): read but never used -- possibly a deliberate
    # KeyError presence check; kept for behavior compatibility.
    teacher = item["teacher"]
    # FIX: removed a dead `unit = ""` that was immediately overwritten.
    unit = "-".join([search_topic, search_location])
    full_search_date = time.strftime("%Y-%m-%d")
    export_name = f'{search_topic}_{full_search_date}_{search_location.lower()}.json'
    if unit not in self.unit_to_exporter:
        # One exporter per unit, reused for all subsequent items.
        f = open(f"./data/scraper/{full_search_date}/{export_name}", 'wb')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        self.unit_to_exporter[unit] = exporter
    return self.unit_to_exporter[unit]
class FangtianxiaPipeline(object):
    """Routes first/second item types to two separate JSON-lines files."""

    def __init__(self):
        # BUG FIX: keep the file handles on self so they can be closed
        # when the spider finishes (the original bound them to locals
        # and leaked them -- there was no close_spider at all).
        self.fp1 = open('fang1.json', 'wb')
        self.fp2 = open('fang2.json', 'wb')
        self.exporter1 = JsonLinesItemExporter(self.fp1, ensure_ascii=False, encoding='utf-8')
        self.exporter2 = JsonLinesItemExporter(self.fp2, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        # Route by the item's concrete class.
        if isinstance(item, FangtianxiaFirstItem):
            self.exporter1.export_item(item)
        if isinstance(item, FangtianxiaSecondItem):
            self.exporter2.export_item(item)
        return item

    def close_spider(self, spider):
        """Release both output files."""
        self.fp1.close()
        self.fp2.close()
class FangPipeline:
    """Routes NewhouseItem and ErshouItem to separate JSON files."""

    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.ershouhouse_fp = open('ershouhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.ershouhouse_exporter = JsonLinesItemExporter(self.ershouhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Dispatch on the item's concrete class.
        if isinstance(item, NewhouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, ErshouItem):
            self.ershouhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        for fp in (self.newhouse_fp, self.ershouhouse_fp):
            fp.close()
class QsbkPipeline(object):
    """Serializes items to duanzi.json, keeping non-ASCII readable."""

    def __init__(self):
        # Open the output once for the pipeline's whole lifetime.
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding="utf-8")

    def open_spider(self, spider):
        print("爬虫开始了....")

    def process_item(self, item, spider):
        # Each yielded item becomes one JSON line.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了.....")
class QsbkScrapyPipeline(object):
    """Saves yielded items into duanzi_2.json as JSON lines."""

    def __init__(self):
        self.fp = open('duanzi_2.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("爬虫开始了。。。")

    def process_item(self, item, spider):
        """Export one item (received via the spider's yield) to JSON."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了。。。")
class QsbkPipeline:
    """JSON-lines sink for duanzi items (binary file, UTF-8 payload)."""

    def __init__(self):
        # 'wb' because JsonLinesItemExporter writes bytes.
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding="utf-8")

    def open_spider(self, spider):
        print("这是爬虫开始了")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了")
class BsbdjPipeline(object):
    """Saves budejie items to budejie.json as JSON lines."""

    def __init__(self):
        self.fp = open('budejie.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('爬虫开始了...')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('爬虫结束了...')
class BjdyPipeline(object):
    """Writes items to bjdy.json as a single JSON array."""

    def __init__(self):
        self.fp = open("bjdy.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # Tracks whether a comma separator is needed before the next item.
        self._first = True
        self.fp.write(b"[")

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        # BUG FIX: emit the comma *before* every item except the first.
        # The original wrote one after each item, producing invalid
        # JSON like [{...},{...},].
        if not self._first:
            self.fp.write(b',')
        self._first = False
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.write(b"]")
        # BUG FIX: close the file (the original leaked the handle).
        self.fp.close()
class SoufangPipeline(object):
    """Dispatches items to newhouse.json or esfhouse.json by genre."""

    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # '新房' (new house) goes to one file, everything else to the other.
        target = self.newhouse_exporter if item['genre'] == '新房' else self.esfhouse_exporter
        target.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
class QiushiPipeline(object):
    """Exports scraped items to a date-named JSON-lines file."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # A spider opened on Dec 20 2015 writes to "12-20-2015.json".
        out = open('scraped_data/%s.json' % date.today().strftime("%m-%d-%Y"), 'w+b')
        self.files[spider] = out
        self.exporter = JsonLinesItemExporter(out, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class KinoPipeline(object):
    """Per-spider JSON-lines export pipeline wired via crawler signals."""

    def __init__(self):
        # Maps spider -> open output file for cleanup at close.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # BUG FIX: open() rejects an encoding argument in binary mode
        # ('w+b') with "ValueError: binary mode doesn't take an encoding
        # argument"; JsonLinesItemExporter handles encoding itself.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):
    """Signal-driven pipeline exporting items to '<spider>_all.json'."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = JsonLinesItemExporter(out, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One '<name>_item_lines.json' file per spider.
        stream = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = stream
        self.exporter = JsonLinesItemExporter(stream)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MedPipeline(object):
    """Filters out reply-less and duplicate posts, exporting the rest."""

    def __init__(self):
        # post_ids already exported, for duplicate detection.
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # NOTE: attribute name 'expoter' kept as-is (external interface).
        self.file = open('medData.json', 'wb')
        self.expoter = JsonLinesItemExporter(self.file)
        self.expoter.start_exporting()

    def spider_closed(self, spider):
        self.expoter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Guard clauses: drop posts without replies, then duplicates.
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        if item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['post_id'])
        self.expoter.export_item(item)
        return item
class EduSpider(scrapy.Spider):
    """
    Used to scrape .edu websites for web technology statistics
    """
    name = 'edu'

    def __init__(self):
        # Python 2 code (xrange here, str.encode usage below).
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'
        # Ranking pages 1..29.
        self.start_urls = [baseurl + str(i) for i in xrange(1, 30)]
        self.domain = 'domaintyper.com'
        # NOTE(review): this file handle is never stored or closed explicitly.
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        # NOTE(review): start_exporting() runs once per listing page
        # rather than once per crawl -- harmless only if the exporter's
        # start is a no-op; confirm.
        self.exporter.start_exporting()
        # Second column of the ranking table holds the domain names.
        urls = [url.encode('utf-8') for url in response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()]
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()
        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0].encode('utf-8')
        # Fill CSS fields
        data['css_paths'] = [stylesheet.encode('utf-8') for stylesheet in response.xpath('//link[@rel="stylesheet"]/@href').extract()]
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]
        # Fill JS fields
        data['js_paths'] = [script.encode('utf-8') for script in response.xpath('//script/@src').extract()]
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]
        # Fill tag fields
        tc.feed(response.body)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest
        self.exporter.export_item(data)
        yield data

    def __del__(self):
        # NOTE(review): scrapy.Spider defines no __del__, so this call
        # raises AttributeError (silenced at interpreter teardown);
        # relying on __del__ for cleanup is fragile -- verify.
        scrapy.Spider.__del__(self)
        self.exporter.finish_exporting()
class OKCupidJsonPipeline(object):
    """JSON-lines export pipeline wired up via dispatcher signals."""

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        out = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = out
        # ensure_ascii=True keeps the output pure ASCII.
        self.exporter = JsonLinesItemExporter(out, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def spider_opened(self, spider):
    """Open '<spider>_all.json' and start the UTF-8 JSON-lines exporter."""
    out = open('%s_all.json' % spider.name, 'w+b')
    self.files[spider] = out
    self.exporter = JsonLinesItemExporter(out, encoding='utf-8', ensure_ascii=False)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open the per-spider .jsonl output and start exporting."""
    # BUG FIX: the original passed encoding='utf-8' to a binary-mode
    # ('w+b') open(), which raises "ValueError: binary mode doesn't
    # take an encoding argument". The exporter encodes by itself.
    file = open('output/' + spider.name + '.jsonl', 'w+b')
    self.files[spider] = file
    self.exporter = JsonLinesItemExporter(file)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open the per-spider items file with ASCII-escaped JSON output."""
    out = open('%s_items.json' % spider.name, 'w+b')
    self.files[spider] = out
    self.exporter = JsonLinesItemExporter(out, ensure_ascii=True)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Open gov.json and begin streaming JSON-lines output."""
    out = open("gov.json", "wb")
    self.file = out
    self.exporter = JsonLinesItemExporter(out, ensure_ascii=False)
    self.exporter.start_exporting()
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""

    def __init__(self):
        self.file = None
        self.exporter = None
        # compile regular expressions:
        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")
        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Close export file."""
        # BUG FIX: finish the exporter *before* closing its underlying
        # file (the original closed the file first).
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                # NOTE(review): arrow's .timestamp is a property in old
                # releases but a method in arrow>=1.0 -- confirm version.
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitation, export the url
            # leading to the specific resolution for later (human) review
            self.exporter.export_item({'error': repr(ex),
                                       'url': item["url"],
                                       })
        else:
            self.exporter.export_item(data)
        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
# these should be called before dumping the data
def get_date(self, item):
    """Parse the single 'date' value (YYYYMMDD) into an arrow object."""
    if len(item["date"]) != 1:
        # BUG FIX (all helpers below): the messages were passed as
        # printf-style pairs -- ResolutionError("... %s", item) -- so
        # the %s was never interpolated; format with % before raising.
        raise ResolutionError("Date field length is not 1 for item %s" % item)
    return arrow.get(item["date"][0], "YYYYMMDD")

def get_resolution_number(self, item):
    """Extract the resolution number (e.g. '14R' from 'dec14R.aspx')."""
    if len(item["resolution_number"]) != 1:
        raise ResolutionError("Resolution number field length is not 1 for item %s" % item)
    # NOTE(review): .search() may return None on unexpected input,
    # which would raise AttributeError rather than ResolutionError.
    return self.resolution_number_pattern.search(item["resolution_number"][0]).group('number')

def get_gov_number(self, item):
    """Extract the government number from the single 'gov' value."""
    if len(item["gov"]) != 1:
        raise ResolutionError("Government field length is not 1 for item %s" % item)
    gov_match = self.gov_pattern.search(item["gov"][0])
    return gov_match.group("gov_number")

def get_pm_name(self, item):
    """Extract the prime-minister name from the single 'gov' value."""
    if len(item["gov"]) != 1:
        raise ResolutionError("Government field length is not 1 for item %s" % item)
    gov_match = self.gov_pattern.search(item["gov"][0])
    return gov_match.group("pm_name")

def get_title(self, item):
    """Join the title lines and strip surrounding whitespace."""
    if len(item["title"]) == 0:
        raise ResolutionError("Title fields is empty for item %s" % item)
    return '\n'.join(item["title"]).strip()

def get_subject(self, item):
    """Join the subject lines and strip surrounding whitespace."""
    if len(item["subject"]) == 0:
        raise ResolutionError("Subject field is empty for item %s" % item)
    return '\n'.join(item["subject"]).strip()

def get_body(self, item):
    """Return the body as the original list of lines."""
    if len(item["body"]) == 0:
        raise ResolutionError("Body field is empty for item %s" % item)
    # return '\n'.join(item["body"]).strip()
    # body is originally a list of lines
    # it is intentionally not stripped
    # some resolutions have custom css, tables,
    # and other crap which i'd rather not process here,
    # but in a later stage, unrelated to the scraper
    return item["body"]
def spider_opened(self, spider):
    """Open '<spider>_item_lines.json' and start the JSON-lines exporter."""
    out = open('%s_item_lines.json' % spider.name, 'w+b')
    self.files[spider] = out
    self.exporter = JsonLinesItemExporter(out)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Create json/<spider>_products.json and begin exporting."""
    out = open('json/{}_products.json'.format(spider.name), 'w+b')
    self.files[spider] = out
    self.exporter = JsonLinesItemExporter(out)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open medData.json and start the exporter (attr name kept as-is)."""
    self.file = open('medData.json', 'wb')
    exporter = JsonLinesItemExporter(self.file)
    self.expoter = exporter
    exporter.start_exporting()
def spider_opened(self, spider):
    """Start exporting homepage stories to '<spider>_hp_stories.jl'."""
    out = open('%s_hp_stories.jl' % spider.name, 'w+b')
    self.files[spider] = out
    self.exporter = JsonLinesItemExporter(out)
    self.exporter.start_exporting()