from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = file_json_lines
        self.exporter = JsonLinesItemExporter(file_json_lines)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
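A pipeline only runs once it is registered in the project's settings. A minimal sketch of that wiring, assuming the dotted path given in the docstring above (the priority value 300 is an arbitrary example):

# settings.py -- minimal sketch; the dotted path comes from the docstring
# above and the priority 300 is an arbitrary example value.
ITEM_PIPELINES = {
    'app.pipelines.exporter_json_lines.JsonLinesExportPipeline': 300,
}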
def export_item(self, item):
    # JsonLinesItemExporter writes bytes, so the file must be opened in
    # binary mode ("wb"), not text mode ("w")
    storage_file = open(self.item_storage_path(item["id"]), "wb")
    item_exporter = JsonLinesItemExporter(storage_file)
    item_exporter.start_exporting()
    item_exporter.export_item(item)
    item_exporter.finish_exporting()
    storage_file.close()
def load_table(table, source="default/test-0.jsonlines",
               modifier="", dblogin="******", as_name=None):
    filename = source if as_name is None else as_name
    dbname, collectioname = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collectioname)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        result = e.details
        errs = set()
        # append in binary mode (the exporter writes bytes);
        # the with-block closes the file, so no explicit close() is needed
        with open("%s.errors" % filename, "ab") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            for err in result.get("writeErrors"):
                if err.get("op").get("_id") not in errs:
                    obj = dict(item=err.get("op"), error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
    return result
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
from scrapy.exceptions import DropItem
from scrapy.exporters import JsonLinesItemExporter


class CardPipeline(object):

    def __init__(self):
        self.files = {}

    def process_item(self, item, spider):
        if not item['wb_nick'] \
                or not item['wb_location'] \
                or not item['wb_images']:
            raise DropItem("missing required field in %s" % item)
        print(item['wb_nick'][0])
        item['wb_content'] = ''.join(item['wb_content'])
        item['wb_date'] = item['wb_date'][0]
        item['wb_location'] = item['wb_location'][0]
        images_urls = item.pop('wb_images')
        item['wb_images'] = []
        for image_url in images_urls:
            # rewrite thumbnail/square URLs to the full-size variant
            image_url = image_url.replace('thumbnail', 'large')
            image_url = image_url.replace('square', 'large')
            item['wb_images'].append(image_url)
        self.exporter.export_item(item)
        return item

    def open_spider(self, spider):
        file = open('json/{}_products.json'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
from datetime import date

from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class QiushiPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # A file created on Dec 20 2015 will be named "12-20-2015.json"
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print(item['author'])
        # print(item['title'])
        # print(item['content'])
        # print(item['href'])
        return item
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class KinoPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # open() rejects encoding= in binary mode, and the exporter encodes
        # its own output, so no encoding argument is passed here.
        # indent=4 and ensure_ascii=False could instead be passed to
        # JsonLinesItemExporter below.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import JsonLinesItemExporter


class MedPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
def spider_opened(self, spider):
    for i in self.JSONWriters.values():
        file = open('%s_out.json' % i, 'w+b')
        # key the file handle by writer name; keying by spider would
        # overwrite the entry on every iteration and leak open handles
        self.files[i] = file
        exporter = JsonLinesItemExporter(file)
        self.exporters[i] = exporter
        exporter.start_exporting()
    print(self.exporters)
def _exporter_for_item(self, item):
    season = item["season"]
    if season not in self.season_to_exporter:
        f = open(f'./outputs/champions_league_{season}.json', 'wb')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        self.season_to_exporter[season] = exporter
    return self.season_to_exporter[season]
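A helper like _exporter_for_item only creates exporters on demand; the enclosing pipeline still has to route each item through it and shut every exporter down when the spider closes. A minimal sketch of such a wrapper, reusing the season-keyed helper above (the class name and the open_spider/close_spider wiring are illustrative assumptions, modeled on the per-key exporter pattern from the Scrapy docs):

import os

from scrapy.exporters import JsonLinesItemExporter


class PerSeasonExportPipeline:
    """Illustrative wrapper around the _exporter_for_item helper above."""

    def open_spider(self, spider):
        os.makedirs('./outputs', exist_ok=True)
        self.season_to_exporter = {}

    def _exporter_for_item(self, item):
        season = item["season"]
        if season not in self.season_to_exporter:
            f = open(f'./outputs/champions_league_{season}.json', 'wb')
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            self.season_to_exporter[season] = exporter
        return self.season_to_exporter[season]

    def process_item(self, item, spider):
        self._exporter_for_item(item).export_item(item)
        return item

    def close_spider(self, spider):
        # finish and close every per-season exporter on shutdown
        for exporter in self.season_to_exporter.values():
            exporter.finish_exporting()
            exporter.file.close()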
def process_item(self, item, spider):
    if not isinstance(item, Publisher):
        return item
    ts = int(time.time())
    try:
        is_updated = self._is_updated(item)
        ret = self.coll.update(
            {"_id": item["_id"]},
            {
                "$setOnInsert": {
                    "_id": item["_id"],
                    "username": item["username"],
                    "first_scraped_ts": ts,
                },
                "$set": {
                    "full_name": item["full_name"],
                    "profile_pic_url": item["profile_pic_url"],
                    "profile_pic_url_hd": item["profile_pic_url_hd"],
                    "followed_by": item["followed_by"],
                    "biography": item["biography"],
                    "external_url": item["external_url"],
                    "published_count": item["published_count"],
                    "downloaded_avatar_info": item.get("downloaded_avatar_info"),
                    "update_ts": ts,
                    "begin_ts": ts,
                    "status": -1,
                },
            },
            upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated publisher: %s', item["username"])
        else:
            logger.info('Inserted publisher: %s', item["username"])
        if is_updated:
            logger.info('Publisher %s is updated.', item["username"])
            filename = '{}.jl'.format(item["username"])
            filename = os.path.join(self.export_filepath, filename)
            export_file = open(filename, 'wb')
            exporter = JsonLinesItemExporter(export_file)
            exporter.start_exporting()
            exporter.export_item(item)
            exporter.finish_exporting()
            export_file.close()
            logger.info('dumped item to file: %s', item["username"])
            self.task.send_task('sync_publisher', (item["username"], ))
            logger.info('Send task sync_publisher: %s', item["username"])
        else:
            logger.info(
                'Publisher %s is not updated. No dumping data or sending task',
                item["username"])
    except RedisError:
        logger.error('Send task Failed. Network unreachable')
        raise DropItem('Send sync_publisher task FAILED. DROP ITEM %s'
                       % item["username"])
    except Exception:
        logger.error('DB FAILED: %s', traceback.format_exc())
        raise DropItem('Save publisher to db FAILED. DROP ITEM %s'
                       % item["_id"])
    else:
        return item
from scrapy.exporters import JsonLinesItemExporter


class ArticleJsonSavePipeline(object):

    def __init__(self):
        self.file = open('data.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the export and close the file when the spider shuts down
        self.exporter.finish_exporting()
        self.file.close()
class DictionaryPipeline(FileExporter):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        pass

    def spider_closed(self, spider):
        pass

    def process_item(self, item, spider):
        DATA_STORE = spider.settings.get('DATA_STORE')
        if item and \
                'letter' in item.keys() and \
                'strongs_number' in item.keys() and \
                'word_original' in item.keys() and \
                'word_translated' in item.keys():
            found_in_words_file = False
            language = 'all'
            if item['strongs_number'][0] == 'H':
                language = 'hebrew'
            elif item['strongs_number'][0] == 'G':
                language = 'greek'
            WORDS_FILE = os.path.join(
                DATA_STORE,
                spider.settings.get('DICTIONARY_FILE') % (language, item['letter']))
            if os.path.exists(WORDS_FILE):
                with open(WORDS_FILE, 'r') as words:
                    for word in words:
                        data = json.loads(word)
                        if item['word_translated'] == data['word_translated'] and \
                                item['strongs_number'] == data['strongs_number']:
                            found_in_words_file = True
                            break
            else:
                ensure_dir('%s' % os.path.dirname(WORDS_FILE))
            if not found_in_words_file:
                # append in binary mode: the exporter writes bytes
                words_file = open(WORDS_FILE, 'a+b')
                self.files[spider] = words_file
                self.exporter = JsonLinesItemExporter(words_file)
                self.exporter.start_exporting()
                self.exporter.export_item(item)
                self.exporter.finish_exporting()
                word_file = self.files.pop(spider)
                word_file.close()
        return item
def _exporter_for_item(self, item):
    doc_id = item['url_id']
    dpath = Path('gradsop/data')
    if doc_id not in self.url_id_to_exporter:
        f = open(dpath / f'{doc_id}.json', 'wb')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        self.url_id_to_exporter[doc_id] = exporter
    return self.url_id_to_exporter[doc_id]
def open_spider(self, spider):
    destination_file = spider.settings.get("DESTINATION_FILE")
    destination_dir = os.path.dirname(destination_file)
    os.makedirs(destination_dir, exist_ok=True)
    if os.path.exists(destination_file):
        os.replace(destination_file,
                   '{}_backup_{}'.format(destination_file,
                                         datetime.utcnow().isoformat()))
    file_ = open(destination_file, 'ab+')
    company_exporter = JsonLinesItemExporter(file_)
    company_exporter.start_exporting()
    self.items_exporter = company_exporter
def _get_exporter(self, item):
    year = item['year']
    if year not in self.year_exporters:
        f = open(os.path.join(self.database_dir, '{}.json'.format(year)), 'ab')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        self.year_exporters[year] = exporter
    return self.year_exporters[year]
def get_exporter_for_item(self, item, hasDate=False):
    file = self.get_exporter_key_for_item(item)
    if file not in self.all_exporters:
        f = open(file, 'wb')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        self.all_exporters[file] = exporter
        if hasDate:
            print(f'----- Starting crawling process for {item["race_id"]} '
                  f'({item["race_date"]})')
    return self.all_exporters[file]
def _exporter_for_item(self, item, spider):
    url = item['url']
    ticker = re.search(r'/[A-Z]+/', url).group()[1:-1]
    metric = re.search(r'[A-Z]/.+p', url).group()[2:-2]
    f_name = f'{ticker}-{metric}'
    if f_name not in self.f_lst:
        f = open(f_name + '.jl', 'wb')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        self.f_lst[f_name] = exporter
    return self.f_lst[f_name]
def _exporter_for_item(self, item):
    # Get the subject of the current item
    subject = item['subject']
    # If this is a new subject, create a new file named subject.jl to
    # store all courses of the same subject
    if subject not in self.subject_exporters:
        f = open(Path(OUTPUT_PATH).joinpath(f'{subject}.jl'), mode='wb')
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        # add a new entry in the exporter dictionary
        self.subject_exporters[subject] = exporter
    return self.subject_exporters[subject]
def process_item(self, item, spider):
    filename = str(item['listing'][0]['id']) + '.jl'
    with open(filename, 'wb') as file:
        exporter = JsonLinesItemExporter(
            file, fields_to_export=['listing', 'trovokasa'])
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
    pathlib.Path(__file__).parents[1].joinpath(filename).rename(
        self.feed_path.joinpath(filename))
    return item
from scrapy.exporters import JsonLinesItemExporter


class TutorialPipeline(object):

    def open_spider(self, spider):
        # binary mode: JsonLinesItemExporter writes bytes
        self.file = open('result.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline:

    def __init__(self):
        self.fp = open("wxapp.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
from scrapy.exporters import JsonLinesItemExporter


class JsonLPipeline:

    def open_spider(self, spider):
        self.file = open('static/maquinas.jsonl', 'ab')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def _exporter_for_item(self, item):
    if 'category' in item.keys():
        category = item['category']
    else:
        category = 'reviews_of_' + item['review_category']
    if category not in self.category_to_exporter.keys():
        f = open(f'{category}.json', 'wb')
        # caveat: depending on the Scrapy version, indent=4 is either
        # ignored by JsonLinesItemExporter or forwarded to the JSON
        # encoder, spreading each record over several lines and breaking
        # strict JSON Lines output
        exporter = JsonLinesItemExporter(f, indent=4)
        exporter.start_exporting()
        self.category_to_exporter[category] = exporter
    return self.category_to_exporter[category]
from scrapy.exporters import JsonLinesItemExporter


class JsonPickerPipe(object):

    def __init__(self):
        self.jsonfile = open("arts.json", "wb")
        self.exporter = JsonLinesItemExporter(self.jsonfile, encoding="utf-8")
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish exporting before closing the file, not after, or the
        # exporter would write to an already-closed file
        self.exporter.finish_exporting()
        self.jsonfile.close()
import datetime

from scrapy.exporters import JsonLinesItemExporter


class JsonExportPipeline(object):

    def open_spider(self, spider):
        now = datetime.datetime.now()
        file = open(now.strftime('%Y%m%d%H%M%S%f.json'), 'wb')
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.exporter.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
from scrapy.exporters import JsonLinesItemExporter


class Artnet_Headline_Pipeline:

    def __init__(self):
        self.file = open("metadata.jsonl", 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print("Pipeline test: " + item['blurb'][0])
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
from scrapy.exporters import JsonLinesItemExporter


class TrackerPipeline(object):

    def __init__(self):
        self.file = open('items.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the export before closing the underlying file
        self.exporter.finish_exporting()
        self.file.close()
import scrapy
from scrapy.exporters import JsonLinesItemExporter


class EduSpider(scrapy.Spider):
    """ Used to scrape .edu websites for web technology statistics """
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'
        self.start_urls = [baseurl + str(i) for i in range(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))
        # start once here rather than on every parsed listing page
        self.exporter.start_exporting()

    def parse(self, response):
        urls = response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()

        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0]

        # Fill CSS fields
        data['css_paths'] = response.xpath('//link[@rel="stylesheet"]/@href').extract()
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]

        # Fill JS fields
        data['js_paths'] = response.xpath('//script/@src').extract()
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]

        # Fill tag fields (feed the decoded text, not the raw body bytes)
        tc.feed(response.text)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest

        self.exporter.export_item(data)
        yield data

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; scrapy.Spider has
        # no __del__ to chain to, so the original __del__ override is
        # replaced with this hook
        self.exporter.finish_exporting()
def process_item(self, item, spider):
    directory = get_data_dir(item['key'])
    if not os.path.exists(directory):
        os.makedirs(directory)
    write_file = directory + "/" + item['key'] + ".jsonlines"
    # binary mode: the exporter writes bytes
    f = open(write_file, "wb")
    exporter = JsonLinesItemExporter(f)
    exporter.start_exporting()
    exporter.export_item(item)
    exporter.finish_exporting()
    f.close()
    return item
class CrawlerPipeline(object):

    def __init__(self):
        self.file = None
        self.exporter = None
        self.dirname = None
        self.articles_seen = set()

    def open_spider(self, spider):
        self.file = open('articles.json', 'a+b')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()
        # re-read the existing file so articles exported in previous runs
        # are not exported twice
        self.file.seek(0)
        articles_seen = [
            json.loads(line)['url']
            for line in self.file.read().splitlines()
        ]
        self.articles_seen = set(articles_seen)
        self.dirname = os.path.join("articles", spider.allowed_domains[0]) + "/html"
        if not os.path.exists(self.dirname):
            os.makedirs(self.dirname)

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, article, spider):
        if article['url'] in self.articles_seen:
            raise DropItem("Duplicate article found: %s" % article)
        # sha1 expects bytes, so encode the URL first
        filename = sha1(article['url'].encode('utf-8')).hexdigest() + '.html'
        path = os.path.join(self.dirname, filename)
        item = {
            "domain": spider.allowed_domains[0],
            "url": article['url'],
            "title": article['title'],
            "path": path,
        }
        with open(path, 'wb+') as f:
            f.write(article['html'])
        self.exporter.export_item(item)
        self.articles_seen.add(article['url'])
        return article
from scrapy.exporters import JsonLinesItemExporter


class TutorialPipeline(object):

    def __init__(self):
        self.file = open("headlines_2021.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def _exporter_for_item(self, item, spider_name):
    create_time = int(float(item['create_time']) / 86400) * 86400
    dt = datetime.fromtimestamp(create_time)
    exporter_dir = os.path.join(self.export_root_dir, spider_name,
                                str(dt.year), str(dt.month))
    pathlib.Path(exporter_dir).mkdir(parents=True, exist_ok=True)
    exporter_file_name = os.path.join(exporter_dir, str(dt.day))
    if create_time not in self.exporters_dict[spider_name]:
        self._close_exporter(spider_name)
        logging.info("start new exporter, saved to: %s", exporter_file_name)
        f = open(exporter_file_name, 'ab')
        exporter = JsonLinesItemExporter(f, ensure_ascii=False)
        exporter.start_exporting()
        self.exporters_dict[spider_name][create_time] = exporter
    logging.info("export to: %s", exporter_file_name)
    return self.exporters_dict[spider_name][create_time]
from scrapy.exporters import JsonLinesItemExporter


class JsonWriterPipeline(object):

    def __init__(self):
        # note: despite the .csv extension, the exporter writes JSON lines
        self.file = open('top_posts.csv', 'w+b')

    def spider_opened(self, spider):
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # reuse the exporter created in spider_opened instead of building
        # a new one for every item
        self.exporter.export_item(item)
        return item
from scrapy.exporters import JsonLinesItemExporter


class JsonPipelines:

    def __init__(self):
        self.file = open('data.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False,
                                              encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # JsonLinesItemExporter already emits one JSON object per line;
        # hand-written '[' and ',' separators would corrupt the output
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
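If a single JSON array is actually wanted here, Scrapy ships JsonItemExporter, which writes the enclosing brackets and the commas itself; that is what the hand-written '[' and ',' in the original snippet were attempting. A minimal sketch of that variant (the class name is illustrative; the file name is kept from the snippet above):

from scrapy.exporters import JsonItemExporter


class JsonArrayPipeline:
    """Sketch: exports all items as one JSON array instead of JSON lines."""

    def __init__(self):
        self.file = open('data.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False,
                                         encoding='utf-8')
        self.exporter.start_exporting()   # writes the opening '['

    def process_item(self, item, spider):
        self.exporter.export_item(item)   # inserts commas between items
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()  # writes the closing ']'
        self.file.close()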
from scrapy.exporters import JsonLinesItemExporter


class WriteImdbPipeline(object):

    def __init__(self):
        self.filename = 'imdb.json'

    def open_spider(self, spider):
        self.jsonfile = open(self.filename, 'wb')
        self.exporter = JsonLinesItemExporter(self.jsonfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.jsonfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MovieJsonPipeline(object):

    def open_spider(self, spider):
        if isinstance(spider, doubanSpider.DoubanSpider):
            file = open("../movies.json", 'wb')
            self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
            self.exporter.start_exporting()

    def close_spider(self, spider):
        # the exporter only exists for DoubanSpider, so guard the shutdown
        if isinstance(spider, doubanSpider.DoubanSpider):
            self.exporter.finish_exporting()
            self.exporter.file.close()

    def process_item(self, item, spider):
        if isinstance(item, MovieItem) and isinstance(
                spider, doubanSpider.DoubanSpider):
            self.exporter.export_item(item)
        return item
class TopicPipeline(FileExporter):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        pass

    def spider_closed(self, spider):
        pass

    def process_item(self, item, spider):
        DATA_STORE = spider.settings.get('DATA_STORE')
        if item and \
                'name' in item.keys() and \
                'url' in item.keys() and \
                'letter' in item.keys():
            found_in_topic_file = False
            TOPICS_FILE = os.path.join(
                DATA_STORE,
                spider.settings.get('TOPICS_FILE') % item['letter'])
            if os.path.exists(TOPICS_FILE):
                with open(TOPICS_FILE, 'r') as topics:
                    for topic in topics:
                        if item['name'] in topic:
                            found_in_topic_file = True
                            break
            else:
                ensure_dir('%s' % os.path.dirname(TOPICS_FILE))
            if not found_in_topic_file:
                # append in binary mode: the exporter writes bytes
                topics_file = open(TOPICS_FILE, 'a+b')
                self.files[spider] = topics_file
                self.exporter = JsonLinesItemExporter(topics_file)
                self.exporter.start_exporting()
                self.exporter.export_item(item)
                self.exporter.finish_exporting()
                chapter_file = self.files.pop(spider)
                chapter_file.close()
        return item
from scrapy.exporters import JsonLinesItemExporter


class DemoPipeline(object):

    def __init__(self):
        self.fp = open('resp.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print('start')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the export before closing the file
        self.exporter.finish_exporting()
        self.fp.close()
from scrapy.exporters import JsonLinesItemExporter


class QiushiSpiderPipeline(object):

    def open_spider(self, spider):
        print('spider started')
        # wb: open the file in binary mode
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')
        self.exporter.start_exporting()

    # called whenever an item is passed through the pipeline
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
        print('spider finished')
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter
from scrapy.xlib.pydispatch import dispatcher  # legacy signal API


class OKCupidJsonPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # print(item)
        # uItem = urllib.urlencode(item)
        # jItem = dumps(uItem, cls=PythonObjectEncoder)
        self.exporter.export_item(item)
        return item
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""

    def __init__(self):
        self.file = None
        self.exporter = None
        # compile regular expressions:
        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")
        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(
            r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export, then close the file."""
        # finish_exporting() must run before close(), otherwise it would
        # write to an already-closed file
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitation,
            # raise an exception
            # and export the url leading to the specific resolution
            # for later (human) review
            self.exporter.export_item({
                'error': repr(ex),
                'url': item["url"],
            })
        else:
            self.exporter.export_item(data)
        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError(
                "Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(
            item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title field is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # body is intentionally returned as the original list of lines,
        # not joined or stripped: some resolutions carry custom css,
        # tables, and other markup better processed in a later stage,
        # unrelated to the scraper
        return item["body"]