class CardPipeline(object):
    def __init__(self):
        self.files = {}

    def process_item(self, item, spider):
        # Drop items that are missing any of the required fields.
        if (not item['wb_nick'] or not item['wb_location']
                or not item['wb_images']):
            raise DropItem('missing wb_nick, wb_location or wb_images')
        print(item['wb_nick'][0])
        item['wb_content'] = ''.join(item['wb_content'])
        item['wb_date'] = item['wb_date'][0]
        item['wb_location'] = item['wb_location'][0]
        # Rewrite image URLs so the large versions are stored.
        images_urls = item.pop('wb_images')
        item['wb_images'] = []
        for image_url in images_urls:
            image_url = image_url.replace('thumbnail', 'large')
            image_url = image_url.replace('square', 'large')
            item['wb_images'].append(image_url)
        self.exporter.export_item(item)
        return item

    def open_spider(self, spider):
        file = open('json/{}_products.json'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class KinoPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Options such as indent=4 and ensure_ascii=False could be passed to
        # JsonLinesItemExporter below (see the sketch after this class).
        # encoding= is not valid together with binary mode, so the file is
        # opened as plain 'w+b' and the text encoding is left to the exporter.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
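# Hedged sketch (not part of KinoPipeline above) of the options its comment
# asks about: JsonLinesItemExporter forwards extra keyword arguments to its
# JSON encoder, so ensure_ascii=False and indent=4 are both accepted, though
# indent spreads each item over several lines and defeats the usual
# one-object-per-line .jsonl layout. The file path and item are illustrative
# only, and a Scrapy version that accepts plain dicts as items is assumed.
from scrapy.exporters import JsonLinesItemExporter

with open('output/example.jsonl', 'w+b') as f:
    exporter = JsonLinesItemExporter(f, encoding='utf-8', ensure_ascii=False)
    exporter.start_exporting()
    exporter.export_item({'title': 'пример', 'year': 2015})
    exporter.finish_exporting()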
class MedPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Drop posts with no replies and duplicate posts; export the rest.
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
class QiushiPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # The file created on Dec 20, 2015 will be named "12-20-2015.json"
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print item['author']
        # print item['title']
        # print item['content']
        # print item['href']
        return item
class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = file_json_lines
        self.exporter = JsonLinesItemExporter(file_json_lines)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def load_table(table, source="default/test-0.jsonlines", modifier="",
               dblogin="******", as_name=None):
    filename = source if as_name is None else as_name
    dbname, collectioname = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collectioname)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        # On partial failure, append each offending document and its error
        # message (once per _id) to a JSON-lines ".errors" file for review.
        result = e.details
        errs = set()
        with open("%s.%s" % (filename, "errors"), "a") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            for err in result.get("writeErrors"):
                if not err.get("op").get("_id") in errs:
                    obj = dict(item=err.get("op"), error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
    return result
def export_item(self, item):
    # JsonLinesItemExporter writes bytes, so the per-item file is opened in
    # binary mode; one file is created per item id.
    storage_file = open(self.item_storage_path(item["id"]), "wb")
    item_exporter = JsonLinesItemExporter(storage_file)
    item_exporter.start_exporting()
    item_exporter.export_item(item)
    item_exporter.finish_exporting()
    storage_file.close()
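# Hedged sketch of surrounding context for the export_item method above:
# item_storage_path() is not shown in the original snippet, so the helper
# below is a hypothetical stand-in that maps an item id to one .jl file
# under a local directory; the directory name and naming scheme are assumptions.
import os


class PerItemStorageMixin(object):
    def __init__(self, storage_dir='items'):
        self.storage_dir = storage_dir

    def item_storage_path(self, item_id):
        # hypothetical helper: one JSON-lines file per item id
        return os.path.join(self.storage_dir, '{}.jl'.format(item_id))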
class EduSpider(scrapy.Spider):
    """
    Used to scrape .edu websites for web technology statistics
    """
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'
        self.start_urls = [baseurl + str(i) for i in xrange(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        self.exporter.start_exporting()
        urls = [url.encode('utf-8') for url in
                response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()]
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()
        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0].encode('utf-8')
        # Fill CSS fields
        data['css_paths'] = [stylesheet.encode('utf-8') for stylesheet in
                             response.xpath('//link[@rel="stylesheet"]/@href').extract()]
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]
        # Fill JS fields
        data['js_paths'] = [script.encode('utf-8') for script in
                            response.xpath('//script/@src').extract()]
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]
        # Fill tag fields
        tc.feed(response.body)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest
        self.exporter.export_item(data)
        yield data

    def __del__(self):
        # scrapy.Spider defines no __del__, so only the exporter is finalized here.
        self.exporter.finish_exporting()
class OKCupidJsonPipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # print item
        # uItem = urllib.urlencode(item)
        # jItem = dumps(uItem, cls=PythonObjectEncoder)
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):
    def __init__(self):
        self.file_name = 'svet_androida_links_to_articles.json'
        self.file_handle = None

    def open_spider(self, spider):
        print('JsonExportPipeline Exporter opened')
        file = open(self.file_name, 'wb')
        self.file_handle = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        print('JsonExportPipeline Exporter closed')
        self.exporter.finish_exporting()
        self.file_handle.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class HuabanPipeline(object):
    def __init__(self):
        '''Open files to save the exported items'''
        # save info of BoardItem
        self.board_info = open('D:/litreily/Pictures/python/huaban/boards.json', 'w+b')
        self.board_exporter = JsonItemExporter(self.board_info, encoding='utf-8', indent=4)
        # save info of PinItem
        self.pin_info = open('D:/litreily/Pictures/python/huaban/pins.json', 'w+b')
        self.pin_exporter = JsonLinesItemExporter(self.pin_info, encoding='utf-8', indent=4)

    def open_spider(self, spider):
        '''Start exporting BoardItem and PinItem'''
        self.board_exporter.start_exporting()
        self.pin_exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BoardItem):
            self.board_exporter.export_item(item)
        elif isinstance(item, PinItem):
            self.pin_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        '''finish exporting and close files'''
        self.board_exporter.finish_exporting()
        self.pin_exporter.finish_exporting()
        self.board_info.close()
        self.pin_info.close()
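# Minimal sketch (not part of HuabanPipeline above) of why it mixes the two
# exporter classes: JsonItemExporter writes a single JSON array and only
# becomes valid JSON once finish_exporting() appends the closing bracket,
# while JsonLinesItemExporter writes one standalone JSON object per item.
# File names are illustrative, and a Scrapy version that accepts plain dicts
# as items is assumed.
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

with open('boards_demo.json', 'wb') as f:
    board_exporter = JsonItemExporter(f, encoding='utf-8', indent=4)
    board_exporter.start_exporting()
    board_exporter.export_item({'board': 'example'})
    board_exporter.finish_exporting()  # writes the closing ']'

with open('pins_demo.jsonl', 'wb') as f:
    pin_exporter = JsonLinesItemExporter(f, encoding='utf-8')
    pin_exporter.start_exporting()     # no-op for the line-based exporter
    pin_exporter.export_item({'pin': 'example'})
    pin_exporter.finish_exporting()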
class EquationscraperPipeline(object):
    def __init__(self):
        self.jsl_exporter = None
        self.pprnt_exporter = None
        self.files = {}
        authenticate('localhost:7474', 'neo4j', 'big-theta-team')
        self.graph = Graph('localhost:7474/db/data')

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_pprnt = open('%s_pprint-items0' % spider.name, 'w+b')
        file_jsl = open('%s_json-items0' % spider.name, 'w+b')
        self.jsl_exporter = JsonLinesItemExporter(file_jsl)
        self.pprnt_exporter = PprintItemExporter(file_pprnt)
        self.files[spider] = [file_pprnt, file_jsl]
        self.pprnt_exporter.indent = 2
        self.pprnt_exporter.start_exporting()
        self.jsl_exporter.start_exporting()

    def spider_closed(self, spider):
        self.pprnt_exporter.finish_exporting()
        self.jsl_exporter.finish_exporting()
        for f in self.files[spider]:
            f.close()

    def process_item(self, item, spider):
        if spider.settings.getbool("EXPORT_JSON"):
            self.pprnt_exporter.export_item(item)
            self.jsl_exporter.export_item(item)

        node_equation_label = 'EQUATION'
        node_subject_label = 'SUBJECT'
        link_relation = 'LINKS_TO'
        page_relation = 'SAME_PAGE_AS'

        item_array = [item['last_item'].copy(), item.copy()]
        subject_nodes_array = []
        for idx, elem in enumerate(item_array):
            subject_nodes_array.append(
                Node(node_subject_label,
                     title=item_array[idx]['title'],
                     url=item_array[idx]['url'],
                     categories=item_array[idx]['categories']))
            for expression in elem['maths']:
                expression = latexutils.strip_styles(expression)
                if latexutils.contains_equality_command(expression):
                    latex_equation_node = Node(
                        node_equation_label,
                        name='Equation<' + item_array[idx]['title'] + '>',
                        equation=expression)
                    self.graph.merge(
                        Relationship(subject_nodes_array[idx], page_relation,
                                     latex_equation_node, distance=0))

        self.graph.merge(
            Relationship(subject_nodes_array[0], link_relation,
                         subject_nodes_array[1],
                         distance=item_array[1]['link_dist']))
        del item
        raise exceptions.DropItem
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""

    def __init__(self):
        self.file = None
        self.exporter = None
        # compile regular expressions:
        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")
        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish exporting, then close the export file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitation, export the url leading to
            # the specific resolution for later (human) review
            self.exporter.export_item({'error': repr(ex), 'url': item["url"]})
        else:
            self.exporter.export_item(data)
        return item

    # the following are field-specific handling functions,
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError("Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title field is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # body is originally a list of lines and is intentionally not stripped
        # or joined here: some resolutions have custom css, tables and other
        # markup that is better processed in a later stage, unrelated to the scraper
        return item["body"]
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}
        self.room_count = 0
        self.user_count = 0
        self.room_file_count = 0
        self.user_file_count = 0
        self.exporter_room = None
        self.exporter_user = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider, mode=0):
        # mode 0 opens both feeds, mode 1 only the room feed, mode 2 only the user feed
        if mode == 1 or mode == 0:
            room_file = open('Airbnb_room_{}.json'.format(self.room_file_count), 'w+b')
            self.files['room'] = room_file
            self.exporter_room = JsonLinesItemExporter(room_file)
            self.exporter_room.start_exporting()
        if mode == 2 or mode == 0:
            user_file = open('Airbnb_user_{}.json'.format(self.user_file_count), 'w+b')
            self.files['user'] = user_file
            self.exporter_user = JsonLinesItemExporter(user_file)
            self.exporter_user.start_exporting()

    def spider_closed(self, spider, mode=0):
        if mode == 1 or mode == 0:
            self.exporter_room.finish_exporting()
            file = self.files['room']
            file.close()
        if mode == 2 or mode == 0:
            self.exporter_user.finish_exporting()
            file = self.files['user']
            file.close()

    def process_item(self, item, spider):
        # Roll over to a new output file every 100,000 items of each type.
        if isinstance(item, AirbnbItem):
            self.room_count += 1
            if self.room_count > 100000:
                self.room_count = 0
                self.room_file_count += 1
                self.spider_closed(spider, mode=1)
                self.spider_opened(spider, mode=1)
            self.exporter_room.export_item(item)
        elif isinstance(item, UserItem):
            self.user_count += 1
            if self.user_count > 100000:
                self.user_count = 0
                self.user_file_count += 1
                self.spider_closed(spider, mode=2)
                self.spider_opened(spider, mode=2)
            self.exporter_user.export_item(item)
        else:
            logger.info('Some error happened!')
        return item
class KjvPipeline(FileExporter):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        pass

    def spider_closed(self, spider):
        pass

    def process_item(self, item, spider):
        DATA_STORE = spider.settings.get('DATA_STORE')
        if item and \
                'book' in item.keys() and \
                'text' in item.keys() and \
                'mp3' in item.keys() and \
                'url' in item.keys():
            book_name = get_book_name(item)
            ensure_dir(os.path.join(DATA_STORE, book_name))
            filename = os.path.join(DATA_STORE, book_name, get_filename(item, 'txt'))
            if not os.path.exists(filename):
                chapter_file = open(filename, 'w')
                self.files[spider] = chapter_file
                self.exporter = FileExporter(chapter_file)
                self.exporter.start_exporting()
                self.exporter.export_item("\n".join(item['text']))
                self.exporter.finish_exporting()
                chapter_file = self.files.pop(spider)
                chapter_file.close()
        if item and \
                'name' in item.keys() and \
                'urls' in item.keys():
            found_in_bible_file = False
            CONTENT_FILE = os.path.join(DATA_STORE, spider.settings.get('CONTENT_FILE'))
            if os.path.exists(CONTENT_FILE):
                with open(CONTENT_FILE, 'r') as bible:
                    for books in bible:
                        if item['name'] in books:
                            found_in_bible_file = True
                            break
            else:
                ensure_dir('%s' % DATA_STORE)
            if not found_in_bible_file:
                bible_file = open(CONTENT_FILE, 'a+')
                self.files[spider] = bible_file
                self.exporter = JsonLinesItemExporter(bible_file)
                self.exporter.start_exporting()
                self.exporter.export_item(item)
                self.exporter.finish_exporting()
                chapter_file = self.files.pop(spider)
                chapter_file.close()
        return item
def process_item(self, item, spider):
    if not isinstance(item, GraphImage):
        return item
    try:
        ret = self.coll.update(
            {"_id": item["_id"]},
            {
                "$setOnInsert": {
                    "_id": item["_id"],
                    "instagram_id": item["instagram_id"],
                    "owner_id": item["owner_id"],
                    "thumbnail_src": item["thumbnail_src"],
                    "thumbnail_resources": item["thumbnail_resources"],
                    "typename": item.get("typename"),
                    "is_video": item["is_video"],
                    "date": item["date"],
                    "display_src": item["display_src"],
                    "caption": item["caption"],
                    "download_urls": item["download_urls"],
                    "downloaded_img_info": item.get("downloaded_img_info"),
                    "status": 1,
                    "scraped_ts": int(time.time()),
                },
                "$set": {"update_ts": int(time.time())},
                "$addToSet": {"hashtags": {"$each": item.get('hashtags', [])}},
            },
            upsert=True)

        if item["date"] > self.latest_downloaded_ts:
            self.latest_downloaded_ts = item["date"]
        if item["date"] < self.earliest_downloaded_ts:
            self.earliest_downloaded_ts = item["date"]

        if ret['updatedExisting']:
            logger.info('Updated graph images: %s', item["_id"])
            self.existed += 1
        else:
            filename = '{}.jl'.format(item["_id"])
            filename = os.path.join(self.export_filepath, filename)
            export_file = open(filename, 'wb')
            exportor = JsonLinesItemExporter(export_file)
            exportor.start_exporting()
            exportor.export_item(item)
            exportor.finish_exporting()
            logger.info('dumped item to file: %s', ret['upserted'])
            logger.info('Inserted graph images: %s', ret['upserted'])
            self.task.send_task('fetch_image', (item['_id'],))
            logger.info('Send task fetch_image: %s', item['_id'])
            self.inserted += 1
    except RedisError:
        logger.error('Send task Failed. Network unreachable')
        raise DropItem('Send fetch_image task FAILED. DROP ITEM %s' % item["_id"])
    except:
        logger.error('DB FAILED: %s', traceback.format_exc())
        raise DropItem('Save graph image to db FAILED. DROP ITEM %s' % item["_id"])
    else:
        return item
class S3RawStorage(object):
    """Store items as JSON lines on S3.

    Currently almost equivalent to the default Scrapy built-in S3 exporter.
    It just brings full control over the process and lets us add custom
    Scrapy metrics, which helps distinguish items scraped from data actually
    stored.
    """

    STATS_TPL = 'pipeline/storage/{metric}'

    def __init__(self, stats):
        self.stats = stats

    @staticmethod
    def _validate_settings(settings):
        """Disable pipeline if criteria are not met."""
        feed_uri_tpl = settings.get('KP_RAW_FEED_URI')
        if not feed_uri_tpl:
            raise NotConfigured('no feed uri defined')
        builtin_feed_uri_tpl = settings.get('FEED_URI')
        if builtin_feed_uri_tpl:
            raise NotConfigured('conflict: built-in Scrapy feed exporter is already configured')

    @classmethod
    def _namespace(cls, metric):
        """Namespace metrics to distinguish them in Scrapy stats.

        Examples:
            >>> S3RawStorage._namespace('foo')
            'pipeline/storage/foo'
        """
        return cls.STATS_TPL.format(metric=metric)

    @classmethod
    def from_crawler(cls, crawler):
        cls._validate_settings(crawler.settings)
        pipeline = cls(crawler.stats)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    @staticmethod
    def feed_uri(spider):
        """Generate item storage URI.

        Args:
            spider (scrapy.Spider):

        Returns:
            str: S3 object key to which data should be uploaded
        """
        # use spider finish_time as default, else UTC time
        _spider_finish = spider.crawler.stats._stats.get('finish_time')
        if _spider_finish:
            _time = _spider_finish - dt.timedelta(hours=system_tz_offset())
        else:
            _time = dt.datetime.utcnow()
        uri_opts = {
            'name': spider.name,
            'time': _time.isoformat(),
            'job_id': spider.job_id,
            'bucket': ITEMS_BUCKET,
            # Use the same semantics as on the ETL: the default env is a safe
            # playground where we can dump whatever items we want without
            # risking polluting production environments.
            # con: actual production env MUST specify this setting
            # pro: a new environment has somewhere to upload items by default,
            # without failing because an env-dependent bucket was not created.
            # We allow falling back on the env variable since alternative
            # runtimes like EC2 don't benefit from the Scrapinghub settings
            # interface.
            'env': spider.settings.get('KP_ENV', os.getenv('KP_ENV', 'pre-production')),
        }
        return spider.settings.get('KP_RAW_FEED_URI') % uri_opts

    def spider_opened(self, spider):
        self.stats.set_value(self._namespace('backend'), 'rawS3')
        # spider finish time is only available at `spider_closed`;
        # the uri here is only a filler to fulfil the feed storage contract
        self.storage = S3FeedStorage(
            uri=f's3://{ITEMS_BUCKET}',
            access_key=spider.settings['AWS_ACCESS_KEY_ID'],
            secret_key=spider.settings['AWS_SECRET_ACCESS_KEY'],
        )
        self.raw_content = self.storage.open(spider)
        self.exporter = JsonLinesItemExporter(self.raw_content)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # push items to json lines feed
        self.exporter.finish_exporting()
        # update object key to use job finish time
        uri = urlparse(self.feed_uri(spider))
        self.storage.keyname = uri.path[1:]  # remove leading "/"
        logger.debug(f"Data will be uploaded to `{self.storage.keyname}`")
        # push items to S3
        self.raw_content.file.seek(0)
        if len(self.raw_content.file.read()) != 0:
            self.storage.store(self.raw_content)
        else:
            logger.info("No items were scraped, not pushing to S3")

    def process_item(self, item, spider):
        self.stats.inc_value(self._namespace('items_stored'))
        self.exporter.export_item(item)
        # Jobs run on Scrapinghub will still store items in its database.
        # The point of this pipeline is to stop relying on that, but it
        # remains a cheap fallback/backup.
        return item