Example #1
class CardPipeline(object):

    def __init__(self):
        self.files = {}

    def process_item(self, item, spider):
        if not item['wb_nick']\
                or not item['wb_location']\
                or not item['wb_images']:
            raise DropItem
        print(item['wb_nick'][0])
        item['wb_content'] = ''.join(item['wb_content'])
        item['wb_date'] = item['wb_date'][0]
        item['wb_location'] = item['wb_location'][0]
        images_urls = item.pop('wb_images')
        item['wb_images'] = []
        for image_url in images_urls:
            image_url = image_url.replace('thumbnail', 'large')
            image_url = image_url.replace('square', 'large')
            item['wb_images'].append(image_url)
        self.exporter.export_item(item)
        return item

    def open_spider(self, spider):
        file = open('json/{}_products.json'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Example #2
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
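A pipeline like the one above only runs if it is enabled in the project settings. A minimal sketch, assuming the class lives in a hypothetical myproject.pipelines module:

# settings.py (module path and priority value are placeholders)
ITEM_PIPELINES = {
    'myproject.pipelines.JsonExportPipeline': 300,
}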
Example #3
class KinoPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # A file opened in binary mode cannot take an encoding argument; pass
        # encoding (and, if desired, ensure_ascii=False) to the exporter instead.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #4
class MedPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
Example #5
class QiushiPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # The file created on Dec20 2015 will be named as "12-20-2015.json"
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print item['author']
        # print item['title']
        # print item['content']
        # print item['href']

        return item
Example #6
class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = file_json_lines
        self.exporter = JsonLinesItemExporter(file_json_lines)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #7
def load_table(table, source="default/test-0.jsonlines",
               modifier="", dblogin="******", as_name=None):
    filename = source if as_name is None else as_name
    dbname, collectioname = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collectioname)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        result = e.details
        errs = set()
        # JsonLinesItemExporter writes bytes, so append to the error log in binary mode.
        with open("%s.%s" % (filename, "errors"), "ab") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            for err in result.get("writeErrors"):
                if err.get("op").get("_id") not in errs:
                    obj = dict(item=err.get("op"), error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
    return result
Example #8
 def export_item(self, item):
     # open in binary mode: JsonLinesItemExporter writes bytes
     storage_file = open(self.item_storage_path(item["id"]), "wb")
     item_exporter = JsonLinesItemExporter(storage_file)
     item_exporter.start_exporting()
     item_exporter.export_item(item)
     item_exporter.finish_exporting()
     storage_file.close()
Example #9
class EduSpider(scrapy.Spider):
    """ Used to scrape .edu websites for web technology statistics """
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'

        self.start_urls = [baseurl + str(i) for i in xrange(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        self.exporter.start_exporting()
        urls = [url.encode('utf-8') for url in response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()]
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()

        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0].encode('utf-8')

        # Fill CSS fields
        data['css_paths'] = [stylesheet.encode('utf-8') for stylesheet in response.xpath('//link[@rel="stylesheet"]/@href').extract()]
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]

        # Fill JS fields
        data['js_paths'] = [script.encode('utf-8') for script in response.xpath('//script/@src').extract()]
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]

        # Fill tag fields
        tc.feed(response.body)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest

        self.exporter.export_item(data)
        yield data

    def __del__(self):
        # scrapy.Spider defines no __del__ to delegate to; just close out the exporter.
        self.exporter.finish_exporting()
Example #10
class OKCupidJsonPipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        #print item
        #uItem = urllib.urlencode(item)
        #jItem = dumps(uItem, cls=PythonObjectEncoder)
        self.exporter.export_item(item)
        return item
Example #11
class JsonExportPipeline(object):
    def __init__(self):
        self.file_name = 'svet_androida_links_to_articles.json'
        self.file_handle = None

    def open_spider(self, spider):
        print('JsonExportPipeline Exporter opened')

        file = open(self.file_name, 'wb')
        self.file_handle = file

        self.exporter = JsonLinesItemExporter(file,
                                              encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        print('JsonExportPipeline Exporter closed')
        self.exporter.finish_exporting()
        self.file_handle.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
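The encoding='utf-8', ensure_ascii=False pair used above controls whether non-ASCII text is written verbatim or as \uXXXX escapes. A minimal sketch of the difference, using an in-memory buffer and a made-up item:

from io import BytesIO
from scrapy.exporters import JsonLinesItemExporter

item = {'title': 'Svět Androida'}

buf = BytesIO()
JsonLinesItemExporter(buf).export_item(item)
# default: ASCII-escaped output, roughly {"title": "Sv\u011bt Androida"}

buf = BytesIO()
JsonLinesItemExporter(buf, encoding='utf-8', ensure_ascii=False).export_item(item)
# UTF-8 bytes written as-is, roughly {"title": "Svět Androida"}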
Example #12
class HuabanPipeline(object):
    def __init__(self):
        '''Open file to save the exported Items'''
        # save info of BoardItem
        self.board_info = open(
            'D:/litreily/Pictures/python/huaban/boards.json', 'w+b')
        self.board_exporter = JsonItemExporter(self.board_info,
                                               encoding='utf-8',
                                               indent=4)

        # save info of PinItem
        self.pin_info = open('D:/litreily/Pictures/python/huaban/pins.json',
                             'w+b')
        self.pin_exporter = JsonLinesItemExporter(self.pin_info,
                                                  encoding='utf-8',
                                                  indent=4)

    def open_spider(self, spider):
        '''Start exporting BoardItem'''
        self.board_exporter.start_exporting()
        self.pin_exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BoardItem):
            self.board_exporter.export_item(item)
        elif isinstance(item, PinItem):
            self.pin_exporter.export_item(item)

        return item

    def close_spider(self, spider):
        '''finish exporting and close files'''
        self.board_exporter.finish_exporting()
        self.pin_exporter.finish_exporting()
        self.board_info.close()
        self.pin_info.close()
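The pipeline above pairs the two JSON exporters: JsonItemExporter buffers items into a single JSON array, while JsonLinesItemExporter writes one standalone JSON object per line. A rough sketch of the output difference, using in-memory buffers and a made-up item:

from io import BytesIO
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

item = {'pin_id': 1, 'board': 'wallpapers'}

array_buf = BytesIO()
exporter = JsonItemExporter(array_buf)
exporter.start_exporting()
exporter.export_item(item)
exporter.finish_exporting()
# array_buf now holds a single JSON document, roughly [{"pin_id": 1, "board": "wallpapers"}]

lines_buf = BytesIO()
exporter = JsonLinesItemExporter(lines_buf)
exporter.start_exporting()
exporter.export_item(item)
exporter.finish_exporting()
# lines_buf holds one object per line, roughly {"pin_id": 1, "board": "wallpapers"}\n

The line-per-item form is what makes patterns like per-spider files and file rotation in the other examples cheap: every item is written as a complete line, so a file can be closed or split at any point.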
Example #13
class EquationscraperPipeline(object):
    def __init__(self):
        self.jsl_exporter = None
        self.pprnt_exporter = None
        self.files = {}

        authenticate('localhost:7474', 'neo4j', 'big-theta-team')
        self.graph = Graph('localhost:7474/db/data')

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        file_pprnt = open(
            '%s_pprint-items0' % spider.name,
            'w+b',
        )
        file_jsl = open(
            '%s_json-items0' % spider.name,
            'w+b',
        )

        self.jsl_exporter = JsonLinesItemExporter(file_jsl)
        self.pprnt_exporter = PprintItemExporter(file_pprnt)

        self.files[spider] = [file_pprnt, file_jsl]
        self.pprnt_exporter.indent = 2
        self.pprnt_exporter.start_exporting()
        self.jsl_exporter.start_exporting()

    def spider_closed(self, spider):
        self.pprnt_exporter.finish_exporting()
        self.jsl_exporter.finish_exporting()

        for f in self.files[spider]:
            f.close()

    def process_item(self, item, spider):

        if spider.settings.getbool("EXPORT_JSON"):
            self.pprnt_exporter.export_item(item)
            self.jsl_exporter.export_item(item)

        node_equation_label = 'EQUATION'
        node_subject_label = 'SUBJECT'

        link_relation = 'LINKS_TO'
        page_relation = 'SAME_PAGE_AS'

        item_array = [item['last_item'].copy(), item.copy()]
        subject_nodes_array = []

        for idx, elem in enumerate(item_array):

            subject_nodes_array.append(
                Node(node_subject_label,
                     title=item_array[idx]['title'],
                     url=item_array[idx]['url'],
                     categories=item_array[idx]['categories']))

            for expression in elem['maths']:

                expression = latexutils.strip_styles(expression)

                if latexutils.contains_equality_command(expression):
                    latex_equation_node = Node(node_equation_label,
                                               name='Equation<' +
                                               item_array[idx]['title'] + '>',
                                               equation=expression)

                    self.graph.merge(
                        Relationship(subject_nodes_array[idx],
                                     page_relation,
                                     latex_equation_node,
                                     distance=0))

        self.graph.merge(
            Relationship(subject_nodes_array[0],
                         link_relation,
                         subject_nodes_array[1],
                         distance=item_array[1]['link_dist']))

        del item
        raise exceptions.DropItem
Example #14
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""
    def __init__(self):
        self.file = None
        self.exporter = None

        # compile regular expressions:

        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")

        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Close export file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitization,
            # export the error and the url leading to the specific resolution
            # for later (human) review
            self.exporter.export_item({'error': repr(ex),
                                       'url': item["url"],
                                      })
        else:
            self.exporter.export_item(data)

        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError("Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title fields is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # return '\n'.join(item["body"]).strip()

        # body is originally a list of lines
        # it is intentionally not stripped
        # some resolutions have custom css, tables,
        # and other crap which i'd rather not process here,
        # but in a later stage, unrelated to the scraper
        return item["body"]
Example #15
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}
        self.room_count = 0
        self.user_count = 0
        self.room_file_count = 0
        self.user_file_count = 0
        self.exporter_room = None
        self.exporter_user = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider, mode=0):
        if mode == 1 or mode == 0:
            room_file = open(
                'Airbnb_room_{}.json'.format(self.room_file_count), 'w+b')
            self.files['room'] = room_file
            self.exporter_room = JsonLinesItemExporter(room_file)
            self.exporter_room.start_exporting()

        if mode == 2 or mode == 0:
            user_file = open(
                'Airbnb_user_{}.json'.format(self.user_file_count), 'w+b')
            self.files['user'] = user_file
            self.exporter_user = JsonLinesItemExporter(user_file)
            self.exporter_user.start_exporting()

    def spider_closed(self, spider, mode=0):
        if mode == 1 or mode == 0:
            self.exporter_room.finish_exporting()
            file = self.files['room']
            file.close()

        if mode == 2 or mode == 0:
            self.exporter_user.finish_exporting()
            file = self.files['user']
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, AirbnbItem):
            self.room_count += 1
            if self.room_count > 100000:
                self.room_count = 0
                self.room_file_count += 1
                self.spider_closed(spider, mode=1)
                self.spider_opened(spider, mode=1)
            self.exporter_room.export_item(item)
        elif isinstance(item, UserItem):
            self.user_count += 1
            if self.user_count > 100000:
                self.user_count = 0
                self.user_file_count += 1
                self.spider_closed(spider, mode=2)
                self.spider_opened(spider, mode=2)
            self.exporter_user.export_item(item)
        else:
            logger.info('Unexpected item type: %s', type(item))
        return item
Example #16
class KjvPipeline(FileExporter):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        pass

    def spider_closed(self, spider):
        pass

    def process_item(self, item, spider):
        DATA_STORE = spider.settings.get('DATA_STORE')
        if item and \
                'book' in item.keys() and \
                'text' in item.keys() and \
                'mp3' in item.keys() and \
                'url' in item.keys():
            book_name = get_book_name(item)

            ensure_dir(os.path.join(DATA_STORE, book_name))

            filename = os.path.join(DATA_STORE, book_name,
                                    get_filename(item, 'txt'))
            if not os.path.exists(filename):
                chapter_file = open(filename, 'w')
                self.files[spider] = chapter_file
                self.exporter = FileExporter(chapter_file)
                self.exporter.start_exporting()
                self.exporter.export_item("\n".join(item['text']))
                self.exporter.finish_exporting()
                chapter_file = self.files.pop(spider)
                chapter_file.close()
        if item and \
                'name' in item.keys() and \
                'urls' in item.keys():
            found_in_bible_file = False
            CONTENT_FILE = os.path.join(DATA_STORE,
                                        spider.settings.get('CONTENT_FILE'))
            if os.path.exists(CONTENT_FILE):
                with open(CONTENT_FILE, 'r') as bible:
                    for books in bible:
                        if item['name'] in books:
                            found_in_bible_file = True

                            break
            else:
                ensure_dir('%s' % DATA_STORE)

            if not found_in_bible_file:
                bible_file = open(CONTENT_FILE, 'a+')
                self.files[spider] = bible_file
                self.exporter = JsonLinesItemExporter(bible_file)
                self.exporter.start_exporting()
                self.exporter.export_item(item)
                self.exporter.finish_exporting()
                bible_file = self.files.pop(spider)
                bible_file.close()
        return item
Example #17
class EduSpider(scrapy.Spider):
    """ Used to scrape .edu websites for web technology statistics """
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'

        self.start_urls = [baseurl + str(i) for i in xrange(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        self.exporter.start_exporting()
        urls = [
            url.encode('utf-8') for url in response.css(
                '.wsTR > td:nth-child(2)').xpath('text()').extract()
        ]
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()

        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0].encode(
            'utf-8')

        # Fill CSS fields
        data['css_paths'] = [
            stylesheet.encode('utf-8') for stylesheet in response.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
        ]
        data['css_files'] = [
            stylesheet.split('/')[-1] for stylesheet in data['css_paths']
        ]

        # Fill JS fields
        data['js_paths'] = [
            script.encode('utf-8')
            for script in response.xpath('//script/@src').extract()
        ]
        data['js_files'] = [
            script.split('/')[-1] for script in data['js_paths']
        ]

        # Fill tag fields
        tc.feed(response.body)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest

        self.exporter.export_item(data)
        yield data

    def __del__(self):
        # scrapy.Spider defines no __del__ to delegate to; just close out the exporter.
        self.exporter.finish_exporting()
Example #18
 def process_item(self, item, spider):
     if not isinstance(item, GraphImage):
         return item
     try:
         ret = self.coll.update({"_id": item["_id"]}, {
             "$setOnInsert": {
                 "_id": item["_id"],
                 "instagram_id": item["instagram_id"],
                 "owner_id": item["owner_id"],
                 "thumbnail_src": item["thumbnail_src"],
                 "thumbnail_resources": item["thumbnail_resources"],
                 "typename": item.get("typename"),
                 "is_video": item["is_video"],
                 "date": item["date"],
                 "display_src": item["display_src"],
                 "caption": item["caption"],
                 "download_urls": item["download_urls"],
                 "downloaded_img_info": item.get("downloaded_img_info"),
                 "status": 1,
                 "scraped_ts": int(time.time()),
             },
             "$set": {
                 "update_ts": int(time.time())
             },
             "$addToSet": {
                 "hashtags": {
                     "$each": item.get('hashtags', [])
                 }
             }
         }, upsert=True)
         if item["date"] > self.latest_downloaded_ts:
             self.latest_downloaded_ts = item["date"]
         if item["date"] < self.earliest_downloaded_ts:
             self.earliest_downloaded_ts = item["date"]
         if ret['updatedExisting']:
             logger.info('Updated graph images: %s', item["_id"])
             self.existed += 1
         else:
             filename = '{}.jl'.format(item["_id"])
             filename = os.path.join(self.export_filepath, filename)
             export_file = open(filename, 'wb')
             exporter = JsonLinesItemExporter(export_file)
             exporter.start_exporting()
             exporter.export_item(item)
             exporter.finish_exporting()
             export_file.close()
             logger.info('dumped item to file: %s', ret['upserted'])
             logger.info('Inserted graph images: %s', ret['upserted'])
             self.task.send_task('fetch_image', (item['_id'], ))
             logger.info('Send task fetch_image: %s', item['_id'])
             self.inserted += 1
     except RedisError:
         logger.error('Send task Failed. Network unreachable')
         raise DropItem('Send fetch_image task FAILED. DROP ITEM %s' %
                        item["_id"])
     except Exception:
         logger.error('DB FAILED: %s', traceback.format_exc())
         raise DropItem('Save graph image to db FAILED. DROP ITEM %s' %
                        item["_id"])
     else:
         return item
Example #19
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""
    def __init__(self):
        self.file = None
        self.exporter = None

        # compile regular expressions:

        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(
            r"^\D+(?P<number>.+?)\..*$")

        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(
            r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Close export file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitization,
            # export the error and the url leading to the specific resolution
            # for later (human) review
            self.exporter.export_item({
                'error': repr(ex),
                'url': item["url"],
            })
        else:
            self.exporter.export_item(data)

        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s",
                                  item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError(
                "Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(
            item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError(
                "Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError(
                "Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title fields is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # return '\n'.join(item["body"]).strip()

        # body is originally a list of lines
        # it is intentionally not stripped
        # some resolutions have custom css, tables,
        # and other crap which i'd rather not process here,
        # but in a later stage, unrelated to the scraper
        return item["body"]
Example #20
class S3RawStorage(object):
    """Store items as JSON lines on S3.

    Currently almost equivalent to Scrapy's built-in S3 feed exporter.
    It just brings full control over the process and lets us start leveraging
    it with custom Scrapy metrics, which helps us distinguish between items
    scraped and data actually stored.

    """

    STATS_TPL = 'pipeline/storage/{metric}'

    def __init__(self, stats):
        self.stats = stats

    @staticmethod
    def _validate_settings(settings):
        """Disable pipeline if criteria are not met."""
        feed_uri_tpl = settings.get('KP_RAW_FEED_URI')

        if not feed_uri_tpl:
            raise NotConfigured('no feed uri defined')

        builtin_feed_uri_tpl = settings.get('FEED_URI')
        if builtin_feed_uri_tpl:
            raise NotConfigured('conflict: built-in Scrapy feed exporter is already configured')

    @classmethod
    def _namespace(cls, metric):
        """Namespace metrics to distinguish them in Scrapy stats.

        Examples:
            >>> S3RawStorage._namespace('foo')
            'pipeline/storage/foo'

        """
        return cls.STATS_TPL.format(metric=metric)

    @classmethod
    def from_crawler(cls, crawler):
        cls._validate_settings(crawler.settings)
        pipeline = cls(crawler.stats)

        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    @staticmethod
    def feed_uri(spider):
        """Generate item storage URI.

        Args:
            spider (scrapy.Spider):

        Returns:
            str: S3 object key to which data should be uploaded to

        """
        # use spider finish_time as default, else UTC time
        _spider_finish = spider.crawler.stats._stats.get('finish_time')
        if _spider_finish:
            _time = _spider_finish - dt.timedelta(hours=system_tz_offset())
        else:
            _time = dt.datetime.utcnow()

        uri_opts = {
            'name': spider.name,
            'time': _time.isoformat(),
            'job_id': spider.job_id,
            'bucket': ITEMS_BUCKET,
            # use the same semantics as on the ETL
            # the default env is a safe playground where we can dump whatever
            # items we want without the risk of polluting production
            # environments
            # con: an actual production env MUST specify this setting
            # pro: a new environment will by default have somewhere to upload
            # items, without displaying an error because an env-dependent
            # bucket was not created.
            #
            # we allow falling back on the env since alternative runtimes like
            # EC2 don't benefit from the Scrapinghub settings interface.
            'env': spider.settings.get('KP_ENV', os.getenv('KP_ENV', 'pre-production')),
        }

        return spider.settings.get('KP_RAW_FEED_URI') % uri_opts

    def spider_opened(self, spider):
        self.stats.set_value(self._namespace('backend'), 'rawS3')
        # spider finish time only available when `spider_closed`
        # uri used here only as a filler to fulfil feed storage contract
        self.storage = S3FeedStorage(
            uri=f's3://{ITEMS_BUCKET}',
            access_key=spider.settings['AWS_ACCESS_KEY_ID'],
            secret_key=spider.settings['AWS_SECRET_ACCESS_KEY'],
        )

        self.raw_content = self.storage.open(spider)
        self.exporter = JsonLinesItemExporter(self.raw_content)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # push items to json lines feed
        self.exporter.finish_exporting()

        # update object key to use job finish time
        uri = urlparse(self.feed_uri(spider))
        self.storage.keyname = uri.path[1:]  # remove first "/"
        logger.debug(f"Data will be uploaded to `{self.storage.keyname}`")

        # push items to S3
        self.raw_content.file.seek(0)
        if len(self.raw_content.file.read()) != 0:
            self.storage.store(self.raw_content)
        else:
            logger.info("No items are scrapped, not pushing to s3")

    def process_item(self, item, spider):
        self.stats.inc_value(self._namespace('items_stored'))
        self.exporter.export_item(item)

        # running jobs on scrapinghub will still store them
        # in their database. The point of this pipeline is
        # obviously to stop relying on it but that way it
        # remains a cheap fallback/backup
        return item
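For comparison, the built-in S3 feed export that the docstring and the FEED_URI conflict check refer to is configured entirely from settings. A minimal sketch using the legacy FEED_URI / FEED_FORMAT settings that this pipeline checks against (bucket name and credentials are placeholders):

# settings.py (placeholders; %(name)s and %(time)s are expanded by Scrapy)
FEED_URI = 's3://my-items-bucket/%(name)s/%(time)s.jl'
FEED_FORMAT = 'jsonlines'
AWS_ACCESS_KEY_ID = '...'
AWS_SECRET_ACCESS_KEY = '...'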