class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = file_json_lines
        self.exporter = JsonLinesItemExporter(file_json_lines)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #2
 def export_item(self, item):
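     # write each item to its own JSON-lines file, with the file name derived from the item id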
     storage_file = open(self.item_storage_path(item["id"]), "wb")
     item_exporter = JsonLinesItemExporter(storage_file)
     item_exporter.start_exporting()
     item_exporter.export_item(item)
     item_exporter.finish_exporting()
     storage_file.close()
Example #3
def load_table(table, source="default/test-0.jsonlines",
               modifier="", dblogin="******", as_name=None):
    filename = source if as_name is None else as_name
    dbname, collectionname = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collectionname)
    try:
        result = collection.insert_many((set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        result = e.details
        errs = set()
        with open("%s.%s" % (filename, "errors"), "ab") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            for err in result.get("writeErrors"):
                if err.get("op").get("_id") not in errs:
                    obj = dict(item=err.get("op"), error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
    return result
Example #4
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #5
class CardPipeline(object):

    def __init__(self):
        self.files = {}

    def process_item(self, item, spider):
        if not item['wb_nick']\
                or not item['wb_location']\
                or not item['wb_images']:
            raise DropItem
        print(item['wb_nick'][0])
        item['wb_content'] = ''.join(item['wb_content'])
        item['wb_date'] = item['wb_date'][0]
        item['wb_location'] = item['wb_location'][0]
        images_urls = item.pop('wb_images')
        item['wb_images'] = []
        for image_url in images_urls:
            image_url = image_url.replace('thumbnail', 'large')
            image_url = image_url.replace('square', 'large')
            item['wb_images'].append(image_url)
        self.exporter.export_item(item)
        return item

    def open_spider(self, spider):
        file = open('json/{}_products.json'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Example #6
class QiushiPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # The file created on Dec20 2015 will be named as "12-20-2015.json"
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print item['author']
        # print item['title']
        # print item['content']
        # print item['href']

        return item
Example #7
class KinoPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # indent=4 and ensure_ascii=False could also be passed to the exporter here.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #8
class MedPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
Example #9
 def spider_opened(self, spider):
     for i in self.JSONWriters.values():
         file = open('%s_out.json' % i, 'w+b')
         self.files[spider] = file
         exporter = JsonLinesItemExporter(file)
         self.exporters[i] = exporter
         exporter.start_exporting()
     print(self.exporters)
Example #10
 def _exporter_for_item(self, item):
     season = item["season"]
     if season not in self.season_to_exporter:
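         # lazily open one output file and JSON-lines exporter per season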
         f = open(f'./outputs/champions_league_{season}.json', 'wb')
         exporter = JsonLinesItemExporter(f)
         exporter.start_exporting()
         self.season_to_exporter[season] = exporter
     return self.season_to_exporter[season]
Example #11
 def process_item(self, item, spider):
     if not isinstance(item, Publisher):
         return item
     ts = int(time.time())
     try:
         is_updated = self._is_updated(item)
         ret = self.coll.update({"_id": item["_id"]}, {
             "$setOnInsert": {
                 "_id": item["_id"],
                 "username": item["username"],
                 "first_scraped_ts": ts,
             },
             "$set": {
                 "full_name": item["full_name"],
                 "profile_pic_url": item["profile_pic_url"],
                 "profile_pic_url_hd": item["profile_pic_url_hd"],
                 "followed_by": item["followed_by"],
                 "biography": item["biography"],
                 "external_url": item["external_url"],
                 "published_count": item["published_count"],
                 "downloaded_avatar_info":
                 item.get("downloaded_avatar_info"),
                 "update_ts": ts,
                 "begin_ts": ts,
                 "status": -1
             }
         },
                                upsert=True)
         if ret['updatedExisting']:
             logger.info('Updated publisher: %s', item["username"])
         else:
             logger.info('Inserted publisher: %s', item["username"])
         if is_updated:
             logger.info('Publisher %s is updated.', item["username"])
             filename = '{}.jl'.format(item["username"])
             filename = os.path.join(self.export_filepath, filename)
             export_file = open(filename, 'wb')
             exportor = JsonLinesItemExporter(export_file)
             exportor.start_exporting()
             exportor.export_item(item)
             exportor.finish_exporting()
             logger.info('dumped item to file: %s', item["username"])
             self.task.send_task('sync_publisher', (item["username"], ))
             logger.info('Send task sync_publisher: %s', item["username"])
         else:
             logger.info(
                 'Publisher %s is not updated. No dumping data or sending task',
                 item["username"])
     except RedisError:
         logger.error('Send task Failed. Network unreachable')
         raise DropItem('Send sync_publisher task FAILED. DROP ITEM %s' %
                        item["username"])
     except Exception:
         logger.error('DB FAILED: %s', traceback.format_exc())
         raise DropItem('Save publisher to db FAILED. DROP ITEM %s' %
                        item["_id"])
     else:
         return item
Example #12
class ArticleJsonSavePipeline(object):
    def __init__(self):
        self.file = open('data.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #13
class DictionaryPipeline(FileExporter):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        pass

    def spider_closed(self, spider):
        pass

    def process_item(self, item, spider):
        DATA_STORE = spider.settings.get('DATA_STORE')
        if item and \
                'letter' in item.keys() and \
                'strongs_number' in item.keys() and \
                'word_original' in item.keys() and \
                'word_translated' in item.keys():
            found_in_words_file = False

            language = 'all'
            if item['strongs_number'][0] == 'H':
                language = 'hebrew'
            elif item['strongs_number'][0] == 'G':
                language = 'greek'

            WORDS_FILE = os.path.join(
                DATA_STORE,
                spider.settings.get('DICTIONARY_FILE') %
                (language, item['letter']))
            if os.path.exists(WORDS_FILE):
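                # scan the existing file line by line so already-exported words are not duplicated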
                with open(WORDS_FILE, 'r') as words:
                    for word in words:
                        data = json.loads(word)
                        if item['word_translated'] == data['word_translated'] and \
                                item['strongs_number'] == data['strongs_number']:
                            found_in_words_file = True
                            break
            else:
                ensure_dir('%s' % os.path.dirname(WORDS_FILE))

            if not found_in_words_file:
                words_file = open(WORDS_FILE, 'ab')
                self.files[spider] = words_file
                self.exporter = JsonLinesItemExporter(words_file)
                self.exporter.start_exporting()
                self.exporter.export_item(item)
                self.exporter.finish_exporting()
                word_file = self.files.pop(spider)
                word_file.close()
        return item
Example #14
 def _exporter_for_item(self, item):
     doc_id = item['url_id']
     dpath = Path('gradsop/data')
     if doc_id not in self.url_id_to_exporter:
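         # lazily create one exporter (and output file) per url_id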
         f = open(dpath/f'{doc_id}.json', 'wb')
         exporter = JsonLinesItemExporter(f)
         exporter.start_exporting()
         self.url_id_to_exporter[doc_id] = exporter
     return self.url_id_to_exporter[doc_id]
Example #15
 def open_spider(self, spider):
     destination_file = spider.settings.get("DESTINATION_FILE")
     destination_dir = os.path.dirname(destination_file)
     os.makedirs(destination_dir, exist_ok=True)
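     # keep a timestamped backup of any previous output instead of overwriting it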
     if os.path.exists(destination_file):
         os.replace(destination_file, '{}_backup_{}'.format(destination_file, datetime.utcnow().isoformat()))
     file_ = open(destination_file, 'ab+')
     company_exporter = JsonLinesItemExporter(file_)
     company_exporter.start_exporting()
     self.items_exporter = company_exporter
Example #16
    def _get_exporter(self, item):
        year = item['year']
        if year not in self.year_exporters:
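            # lazily open one append-mode file and exporter per year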
            f = open(os.path.join(self.database_dir, '{}.json'.format(year)),
                     'ab')
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            self.year_exporters[year] = exporter

        return self.year_exporters[year]
Example #17
 def get_exporter_for_item(self, item, hasDate=False):
     file = self.get_exporter_key_for_item(item)
     if file not in self.all_exporters:
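         # lazily create one exporter per output file key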
         f = open(file, 'wb')
         exporter = JsonLinesItemExporter(f)
         exporter.start_exporting()
         self.all_exporters[file] = exporter
         if hasDate:
             print(f'----- Starting crawling process for {item["race_id"]} ({item["race_date"]})')
     return self.all_exporters[file]
Example #18
 def _exporter_for_item(self, item, spider):
     url = item['url']
     ticker = re.search(r'/[A-Z]+/', url).group()[1:-1]
     metric = re.search(r'[A-Z]/.+p', url).group()[2:-2]
     f_name = f'{ticker}-{metric}'
     if f_name not in self.f_lst:
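         # lazily create one .jl exporter per ticker/metric pair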
         f = open(f_name + '.jl', 'wb')
         exporter = JsonLinesItemExporter(f)
         exporter.start_exporting()
         self.f_lst[f_name] = exporter
     return self.f_lst[f_name]
Example #19
 def _exporter_for_item(self, item):
     # Get the subject of the current item
     subject = item['subject']
     # If this is a new subject, create a new file named {subject}.jl to store all courses of the same subject
     if subject not in self.subject_exporters:
         f = open(Path(OUTPUT_PATH).joinpath(f'{subject}.jl'), mode='wb')
         exporter = JsonLinesItemExporter(f)
         exporter.start_exporting()
         self.subject_exporters[
             subject] = exporter  # add a new entry in the exporter dictionary
     return self.subject_exporters[subject]
Example #20
 def process_item(self, item, spider):
     filename = str(item['listing'][0]['id']) + '.jl'
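     # export this listing to its own JSON-lines file, then move it into the feed directory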
     with open(filename, 'wb') as file:
         exporter = JsonLinesItemExporter(
             file, fields_to_export=['listing', 'trovokasa'])
         exporter.start_exporting()
         exporter.export_item(item)
         exporter.finish_exporting()
         pathlib.Path(__file__).parents[1].joinpath(filename).rename(
             self.feed_path.joinpath(filename))
     return item
Example #21
class TutorialPipeline(object):
    def open_spider(self, spider):
        self.file = open('result.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #22
class WxappPipeline:
    def __init__(self):
        self.fp = open("wxapp.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
Example #23
class JsonLPipeline:
    def open_spider(self, spider):
        self.file = open('static/maquinas.jsonl', 'ab')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #24
	def _exporter_for_item(self, item):
		if 'category' in item.keys():
			category = item['category']
		else:
			category = 'reviews_of_'+item['review_category']
		
		if category not in self.category_to_exporter.keys():
			f = open(f'{category}.json', 'wb')
			exporter = JsonLinesItemExporter(f,indent=4)
			exporter.start_exporting()
			self.category_to_exporter[category] = exporter
		
		return self.category_to_exporter[category]
Example #25
class JsonPickerPipe(object):
    def __init__(self):
        self.jsonfile = open("arts.json","wb")
        self.exporter = JsonLinesItemExporter(self.jsonfile,encoding="utf-8")
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self,spider):
        self.exporter.finish_exporting()
        self.jsonfile.close()
Example #26
class JsonExportPipeline(object):
    def open_spider(self, spider):
        now = datetime.datetime.now()
        file = open(now.strftime('%Y%m%d%H%M%S%f.json'), 'wb')
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.exporter.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #27
class Artnet_Headline_Pipeline:
    def __init__(self):
        self.file = open("metadata.jsonl", 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        #print("Pipeline test:  " + item['blurb'][0])
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #28
class TrackerPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file,
                                              encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.file.close()
Example #29
class EduSpider(scrapy.Spider):
    """ Used to scrape .edu websites for web technology statistics """
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'

        self.start_urls = [baseurl + str(i) for i in range(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        self.exporter.start_exporting()
        urls = [url.encode('utf-8') for url in response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()]
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()

        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0].encode('utf-8')

        # Fill CSS fields
        data['css_paths'] = [stylesheet.encode('utf-8') for stylesheet in response.xpath('//link[@rel="stylesheet"]/@href').extract()]
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]

        # Fill JS fields
        data['js_paths'] = [script.encode('utf-8') for script in response.xpath('//script/@src').extract()]
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]

        # Fill tag fields
        tc.feed(response.body)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest

        self.exporter.export_item(data)
        yield data

    def __del__(self):
        self.exporter.finish_exporting()
Example #30
    def process_item(self, item, spider):
        directory = get_data_dir(item['key'])
        if not os.path.exists(directory):
            os.makedirs(directory)

        write_file = directory + "/" + item['key'] + ".jsonlines"
        f = open(write_file, "w")
        exporter = JsonLinesItemExporter(f)
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
        f.close()

        return item
Example #31
class CrawlerPipeline(object):
    def __init__(self):
        self.file = None
        self.exporter = None
        self.dirname = None
        self.articles_seen = set()

    def open_spider(self, spider):
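        # open in append+read mode so URLs exported by earlier runs can be re-read for de-duplication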
        self.file = open('articles.json', 'a+b')

        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

        self.file.seek(0)
        articles_seen = [
            json.loads(line)['url'] for line in self.file.read().splitlines()
        ]
        self.articles_seen = set(articles_seen)

        self.dirname = os.path.join("articles",
                                    spider.allowed_domains[0]) + "/html"
        if not os.path.exists(self.dirname):
            os.makedirs(self.dirname)

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, article, spider):
        if article['url'] in self.articles_seen:
            raise DropItem("Duplicate article found: %s" % article)

        filename = sha1(article['url'].encode('utf-8')).hexdigest() + '.html'
        path = os.path.join(self.dirname, filename)

        item = {
            "domain": spider.allowed_domains[0],
            "url": article['url'],
            "title": article['title'],
            "path": path
        }

        with open(path, 'wb+') as f:
            f.write(article['html'])

        self.exporter.export_item(item)
        self.articles_seen.add(article['url'])

        return article
Example #32
class TutorialPipeline(object):
    def __init__(self):
        self.file = open("headlines_2021.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.file,
                                              encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #33
 def _exporter_for_item(self, item, spider_name):
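     # truncate the creation timestamp to the start of its day (86400-second buckets) so items are grouped into daily files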
     create_time = int(float(item['create_time'])/86400)*86400
     dt = datetime.fromtimestamp(create_time)
     exporter_dir = os.path.join(self.export_root_dir, spider_name, str(dt.year), str(dt.month))
     pathlib.Path(exporter_dir).mkdir(parents=True, exist_ok=True)
     exporter_file_name = os.path.join(exporter_dir, str(dt.day))
     if create_time not in self.exporters_dict[spider_name]:
         self._close_exporter(spider_name)
         logging.info("start new exporter, saved to: %s", exporter_file_name)
         f = open(exporter_file_name, 'ab')
         exporter = JsonLinesItemExporter(f, ensure_ascii=False)
         exporter.start_exporting()
         self.exporters_dict[spider_name][create_time] = exporter
     logging.info("export to: %s", exporter_file_name)
     return self.exporters_dict[spider_name][create_time]
Example #34
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('top_posts.csv', 'w+b')

    def open_spider(self, spider):
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #35
class JsonPipelines:
    def __init__(self):
        self.file = open('data.json', 'wb')
        self.exporters = JsonLinesItemExporter(self.file,
                                               ensure_ascii=False,
                                               encoding='utf-8')
        self.file.write(b'[')
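        # note: wrapping JSON-lines output in brackets this way leaves a trailing comma before ']', so the file is not strictly valid JSON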
        self.exporters.start_exporting()

    def process_item(self, item, spider):
        self.exporters.export_item(item)
        self.file.write(b',')
        return item

    def close_spider(self, spider):
        self.file.write(b']')
        self.file.close()
Example #36
class WriteImdbPipeline(object):
    def __init__(self):
        self.filename = 'imdb.json'

    def open_spider(self, spider):
        self.jsonfile = open(self.filename, 'wb')
        self.exporter = JsonLinesItemExporter(self.jsonfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.jsonfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #37
class MovieJsonPipeline(object):
    def open_spider(self, spider):
        if isinstance(spider, doubanSpider.DoubanSpider):
            file = open("../movies.json", 'wb')
            self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
            self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.exporter.file.close()

    def process_item(self, item, spider):
        if isinstance(item, MovieItem) and isinstance(
                spider, doubanSpider.DoubanSpider):
            self.exporter.export_item(item)
        return item
Example #38
class TopicPipeline(FileExporter):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        pass

    def spider_closed(self, spider):
        pass

    def process_item(self, item, spider):
        DATA_STORE = spider.settings.get('DATA_STORE')
        if item and \
                'name' in item.keys() and \
                'url' in item.keys() and \
                'letter' in item.keys():
            found_in_topic_file = False
            TOPICS_FILE = os.path.join(
                DATA_STORE,
                spider.settings.get('TOPICS_FILE') % item['letter'])
            if os.path.exists(TOPICS_FILE):
                with open(TOPICS_FILE, 'r') as topics:
                    for topic in topics:
                        if item['name'] in topic:
                            found_in_topic_file = True

                            break
            else:
                ensure_dir('%s' % os.path.dirname(TOPICS_FILE))

            if not found_in_topic_file:
                topics_file = open(TOPICS_FILE, 'ab')
                self.files[spider] = topics_file
                self.exporter = JsonLinesItemExporter(topics_file)
                self.exporter.start_exporting()
                self.exporter.export_item(item)
                self.exporter.finish_exporting()
                chapter_file = self.files.pop(spider)
                chapter_file.close()
        return item
Example #39
class DemoPipeline(object):
    def __init__(self):
        self.fp = open('resp.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print('start')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
Example #40
class QiushiSpiderPipeline(object):
    def open_spider(self, spider):
        print('the spider has started')
        # 'wb' opens the file in binary mode
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    # called whenever an item is passed through the pipeline
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
        print('the spider has finished')
Example #41
class OKCupidJsonPipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        #print item
        #uItem = urllib.urlencode(item)
        #jItem = dumps(uItem, cls=PythonObjectEncoder)
        self.exporter.export_item(item)
        return item
Example #42
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""
    def __init__(self):
        self.file = None
        self.exporter = None

        # compile regular expressions:

        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")

        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Close export file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitization,
            # an exception is raised;
            # export the url leading to the specific resolution
            # for later (human) review
            self.exporter.export_item({'error': repr(ex),
                                       'url': item["url"],
                                      })
        else:
            self.exporter.export_item(data)

        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError("Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title fields is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # return '\n'.join(item["body"]).strip()

        # body is originally a list of lines
        # it is intentionally not stripped
        # some resolutions have custom css, tables,
        # and other crap which i'd rather not process here,
        # but in a later stage, unrelated to the scraper
        return item["body"]