Example #1
def load_table(table, source="default/test-0.jsonlines",
               modifier="", dblogin="******",
               as_name=None):
    filename = source if as_name is None else as_name
    dbname, collection_name = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collection_name)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        result = e.details
        errs = set()
        # Scrapy exporters write bytes, so the error log must be opened
        # in binary append mode
        with open("%s.errors" % filename, "ab") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            for err in result.get("writeErrors"):
                if err.get("op").get("_id") not in errs:
                    obj = dict(item=err.get("op"),
                               error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
    return result
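parse_path, odicts, and set_id above are project-specific helpers that are not shown. A hypothetical call, assuming table is an iterable of row dicts in which duplicate _id values may occur:

# Hypothetical usage; the row data is illustrative only.
rows = [{"_id": 1, "name": "alpha"}, {"_id": 1, "name": "duplicate"}]
result = load_table(rows, source="default/test-0.jsonlines")
# Rows rejected by insert_many are appended, one JSON object per line,
# to "default/test-0.jsonlines.errors" for later inspection.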
Example #2
class KinoPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Binary mode ('w+b') takes no encoding argument; the encoding
        # (and ensure_ascii) belongs on the exporter instead.
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
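None of these pipeline classes run until they are registered with the crawler. A minimal sketch of that step, assuming the class lives at myproject.pipelines.KinoPipeline (the module path is hypothetical):

# settings.py -- the integer is the pipeline's order; lower runs first
ITEM_PIPELINES = {
    'myproject.pipelines.KinoPipeline': 300,
}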
Example #3
    def export_item(self, item):
        # Exporters write bytes, so the storage file must be binary
        storage_file = open(self.item_storage_path(item["id"]), "wb")
        item_exporter = JsonLinesItemExporter(storage_file)
        item_exporter.start_exporting()
        item_exporter.export_item(item)
        item_exporter.finish_exporting()
        storage_file.close()
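item_storage_path is a method of the surrounding class and is not shown; the pattern writes a separate one-line file per item. A hypothetical sketch of such a helper, assuming the class keeps a storage_dir attribute:

    def item_storage_path(self, item_id):
        # Hypothetical helper: one .jsonl file per item id
        return os.path.join(self.storage_dir, "%s.jsonl" % item_id)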
Example #4
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #5
class MedPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
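DropItem tells Scrapy to discard the item and skip any later pipelines. A sketch of the imports this example presumably relies on (all part of Scrapy itself):

from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import JsonLinesItemExporter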
Example #6
class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = file_json_lines
        self.exporter = JsonLinesItemExporter(file_json_lines)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #7
class QiushiPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # A file created on Dec 20, 2015 will be named "12-20-2015.json"
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # print item['author']
        # print item['title']
        # print item['content']
        # print item['href']

        return item
Example #8
class EduSpider(scrapy.Spider):
    """ Used to scrape .edu websites for web technology statistics """
    name = 'edu'

    def __init__(self):
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'

        self.start_urls = [baseurl + str(i) for i in range(1, 30)]
        self.domain = 'domaintyper.com'
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        self.exporter.start_exporting()
        urls = response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        data = SiteData()
        tc = TagCounter()

        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0]

        # Fill CSS fields
        data['css_paths'] = response.xpath('//link[@rel="stylesheet"]/@href').extract()
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]

        # Fill JS fields
        data['js_paths'] = response.xpath('//script/@src').extract()
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]

        # Fill tag fields
        tc.feed(response.text)  # feed() expects str, so use the decoded body
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest

        self.exporter.export_item(data)
        yield data

    def __del__(self):
        # __del__ is not a reliable place to flush files; the closed()
        # hook sketched below is guaranteed to run
        self.exporter.finish_exporting()
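Since __del__ may never be invoked, a safer variant uses the closed(reason) method that Scrapy calls on every spider when the crawl ends. A minimal sketch:

    def closed(self, reason):
        # Scrapy invokes this when the spider finishes, whatever the reason
        self.exporter.finish_exporting()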
Example #9
class WxappPipeline(object):

    def __init__(self):
        # Open the file before the crawl starts (it is created if missing)
        self.fp = open('wxapp_test2.json', 'wb')

        # Create the exporter that writes to it
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started...')

    def process_item(self, item, spider):
        # Export the item as one JSON line
        self.exporter.export_item(item)
        # Return the item: there may be several pipelines, and if we don't
        # return it the later ones never receive it
        return item

    def close_spider(self, spider):
        print('Spider finished...')
        # The crawl is over; close the file
        self.fp.close()
Example #10
class HouseRedisPipeline(object):
    def __init__(self):

        self.newhouse_fp = open('new_house.json', 'wb')
        self.erhouse_fp = open('er_house.json', 'wb')

        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.erhouse_exporter = JsonLinesItemExporter(self.erhouse_fp,
                                                      ensure_ascii=False)

    # Save in JSON format. Note that every item is written to both
    # files; an isinstance() check would be needed to separate the two
    # item types.
    def process_item(self, item, spider):
        self.newhouse_exporter.export_item(item)
        self.erhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.erhouse_fp.close()
Example #11
class DuanziPipeline(object):

    # Open duanzi.json in the initializer
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started')

    # Write the data into the JSON file
    def process_item(self, item, spider):
        # Simple here: no format conversion needed
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished')
Example #12
class QsbkPipeline(object):
    def __init__(self):
        # JsonLinesItemExporter requires the file to be opened in binary mode
        # Note: with a binary file no encoding is passed to open(); writing
        # in text mode instead would require specifying the encoding there
        self.fp = open('duanzi.json', 'wb')

        # Define the exporter
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started...')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished.')
Example #13
class EsfsalePipeline(object):
    def __init__(self):
        self.path = PROJECT_PATH
        self.json = open(os.path.join(self.path, 'Esfsale{}.json'.format(datetime.now().strftime('%Y-%m-%d'))), 'ab')
        self.json_exporter = JsonLinesItemExporter(self.json, ensure_ascii=False, encoding='utf-8')
        self.csv = open(os.path.join(self.path, 'Esfsale{}.csv'.format(datetime.now().strftime('%Y-%m-%d'))), 'ab')
        self.csv_exporter = CsvItemExporter(self.csv, encoding='utf-8')

    def open_spider(self, spider):
        print("爬虫开始了")

    def process_item(self, item, spider):
        self.json_exporter.export_item(item)
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.json.close()
        self.csv.close()

        print("爬虫结束了")
Example #14
class AnzSpiderPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        return pipeline

    def process_item(self, item, spider):
        # Note: 'w+b' truncates on every item, so the file only ever
        # holds the most recent item
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        file.write(b'{"anzbank":')  # binary file, so write bytes
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()
        self.exporter.export_item(item)
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.write(b"}")
        file.close()
        return item
Example #15
class ModulePipeline(object):
    def __init__(self):
        self.fp = open("sogou.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')
        # self.exporter.start_exporting()

    def open_spider(self, spider):
        print("start!!")

    def process_item(self, item, spider):
        # Export only items that have a name
        if item['name']:
            self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # self.exporter.finish_exporting()
        self.fp.close()
        print("end!!")
Example #16
class News163JsonPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        name = '{0}_{1}'.format(spider.name, str(datetime.now()).replace(':', '-'))
        self.file = open('{0}_products.json'.format(name), 'wb')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #17
class FangtianxiaPipeline(object):
    def __init__(self):

        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')

        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):

        self.newhouse_exporter.export_item(item)
        self.esfhouse_exporter.export_item(item)

        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
Example #18
class JsonExportPipeline(object):
    def __init__(self, settings):
        self.save_file = open(
            os.path.join(settings.get("RESULT_PATH"), "result.json"), "wb")
        self.exporter = JsonLinesItemExporter(self.save_file,
                                              encoding="utf8",
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(settings)

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.save_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
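RESULT_PATH is a custom setting this pipeline reads through crawler.settings, not a built-in Scrapy option. A sketch of supplying it, with a hypothetical value:

# settings.py
RESULT_PATH = '/tmp/scrape_results'  # the directory must exist before the crawl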
Example #19
class HousePipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False,
                                                       encoding='utf-8')
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False,
                                                       encoding="utf-8")

    def process_item(self, item, spider):
        if isinstance(item, HouseItem):
            self.newhouse_exporter.export_item(item)
        if isinstance(item, esfHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
Example #20
class WeiboPipeline(object):
    def __init__(self):
        self.comments_fp = open("comments.json", "wb")
        self.people_fp = open('people.json', 'wb')
        self.statuses_fp = open('statuses.json', 'wb')
        self.comments_exporter = JsonLinesItemExporter(self.comments_fp,
                                                       ensure_ascii=False)
        self.people_exporter = JsonLinesItemExporter(self.people_fp,
                                                     ensure_ascii=False)
        self.statuses_exporter = JsonLinesItemExporter(self.statuses_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        if isinstance(item, CommentItem):
            self.comments_exporter.export_item(item)
        elif isinstance(item, PeopleItem):
            self.people_exporter.export_item(item)
        else:
            self.statuses_exporter.export_item(item)

        return item

    def close_spider(self, spider):
        # close_spider is the hook Scrapy invokes when the spider
        # finishes, so the files are closed here
        print("Saved successfully!")
        self.comments_fp.close()
        self.people_fp.close()
        self.statuses_fp.close()
Example #21
class ZhihuCrawlerPipeline(object):
    def __init__(self):
        self.fp1 = open('./data/answers.txt', 'wb')
        self.fp2 = open('./data/users_v2.txt', 'wb')
        self.fp3 = open('./data/questions.txt', 'wb')
        self.exporter1 = JsonLinesItemExporter(self.fp1,
                                               ensure_ascii=False,
                                               encoding='utf-8')
        self.exporter2 = JsonLinesItemExporter(self.fp2,
                                               ensure_ascii=False,
                                               encoding='utf-8')
        self.exporter3 = JsonLinesItemExporter(self.fp3,
                                               ensure_ascii=False,
                                               encoding='utf-8')

    def open_spider(self, spider):
        print('spider has opened.')

    def process_item(self, item, spider):
        if isinstance(item, ZhihuAnswerItem):
            self.exporter1.export_item(item)
        elif isinstance(item, ZhihuUserItem):
            self.exporter2.export_item(item)
        elif isinstance(item, ZhihuQuestionItem):
            self.exporter3.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        self.exporter3.finish_exporting()
        self.fp1.close()
        self.fp2.close()
        self.fp3.close()
        print('spider has closed.')
Example #22
class NewsPipeline(object):
    def __init__(self):
        self.fp_article = open('article.json', 'wb')
        self.fp_author = open('author.json', 'wb')
        self.fp_navItem = open('navItem.json', 'wb')
        self.exporter_article = JsonLinesItemExporter(self.fp_article,
                                                      ensure_ascii=False,
                                                      encoding='utf-8')
        self.exporter_author = JsonLinesItemExporter(self.fp_author,
                                                     ensure_ascii=False,
                                                     encoding='utf-8')
        self.exporter_navItem = JsonLinesItemExporter(self.fp_navItem,
                                                      ensure_ascii=False,
                                                      encoding='utf-8')

    def process_item(self, item, spider):
        if isinstance(item, ArticleItem):
            self.exporter_article.export_item(item)
        if isinstance(item, AuthorItem):
            self.exporter_author.export_item(item)
        if isinstance(item, NavItem):
            self.exporter_navItem.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp_article.close()
        self.fp_author.close()
        self.fp_navItem.close()
Example #23
class SfwPipeline(object):
    def __init__(self):
        self.fp_new_house = open('new_house.json', 'wb')
        self.fp_esf_house = open('esf_house.json', 'wb')

        self.exporter_new_house = JsonLinesItemExporter(self.fp_new_house,
                                                        ensure_ascii=False)
        self.exporter_esf_house = JsonLinesItemExporter(self.fp_esf_house,
                                                        ensure_ascii=False)

    def process_item(self, item, spider):
        if isinstance(item, NewHouseItem):
            print('Writing one new-house record')
            self.exporter_new_house.export_item(item)
        else:
            print('Writing one second-hand-house record')
            self.exporter_esf_house.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp_new_house.close()
        self.fp_esf_house.close()
Example #24
    def _make_fileobj(self):
        """
        Build file object from items.
        """

        bio = BytesIO()
        f = gzip.GzipFile(mode='wb', fileobj=bio) if self.use_gzip else bio

        # Build file object using ItemExporter
        exporter = JsonLinesItemExporter(f, encoding='utf-8')
        exporter.start_exporting()
        for item in self.items:
            exporter.export_item(item)
        exporter.finish_exporting()

        if f is not bio:
            f.close()  # Close the file if GzipFile

        # Seek to the top of file to be read later
        bio.seek(0)

        return bio
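A short sketch of consuming the file object this method returns, assuming use_gzip was enabled; gzip and json are standard library:

import gzip
import json

bio = pipeline._make_fileobj()  # hypothetical call site
lines = gzip.decompress(bio.read()).splitlines()
items = [json.loads(line) for line in lines]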
Example #25
class FangPipeline(object):
    def __init__(self):
        self.xf = open('xf.json', 'wb')
        self.es = open('es.json', 'wb')
        self.xf_ex = JsonLinesItemExporter(self.xf,
                                           ensure_ascii=False,
                                           encoding='utf-8')
        self.es_ex = JsonLinesItemExporter(self.es,
                                           ensure_ascii=False,
                                           encoding='utf-8')

    def process_item(self, item, spider):

        if item['xf_or_es'] == 'xf':
            self.xf_ex.export_item(item)
        elif item['xf_or_es'] == 'es':
            self.es_ex.export_item(item)
        return item

    def close_spider(self, spider):
        # close_spider(self, spider) is the hook Scrapy actually invokes
        # on shutdown, so the files are closed here
        self.xf.close()
        self.es.close()
Example #26
class JsonLinesExportPipeline(object):
    nbLines = 0
    nbFiles = 0

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        i = datetime.now()
        file = codecs.open(
            '%s_items_%s_%s.json' %
            (spider.name, self.nbFiles, i.strftime('%Y-%m-%dT%H-%M-%S')),
            'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        if self.nbLines >= 10000:
            # Roll over to a new file; close the previous one first so
            # its buffers are flushed
            self.exporter.finish_exporting()
            self.files[spider].close()
            self.nbFiles = self.nbFiles + 1
            self.nbLines = 0
            i = datetime.now()
            file = codecs.open(
                '%s_items_%s_%s.json' %
                (spider.name, self.nbFiles, i.strftime('%Y-%m-%dT%H-%M-%S')),
                'w+b')
            self.files[spider] = file
            self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
            self.exporter.start_exporting()
        else:
            self.nbLines = self.nbLines + 1
        self.exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Example #27
class WxappPipeline:
    def __init__(self):
        self.file = open('wxapp.json', 'wb')
        self.export = JsonLinesItemExporter(self.file,
                                            ensure_ascii=False,
                                            encoding='utf-8')

    def open_spider(self, spider):
        print("Spider starting...")

    def process_item(self, item, spider):
        print("Saving...")
        self.export.export_item(item)
        return item

    def close_spider(self, spider):
        print("Spider finished...")
        self.file.close()
Example #28
class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        self.exporter.export_item(item)

        return item

    def close_spider(self, spider):
        self.fp.close()


# from scrapy.exporters import JsonItemExporter
#
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False,
#                                          encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         pass
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
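For reference, the two exporters produce differently shaped files for the same items, which is why the line-based variant can get away without start/finish_exporting. Roughly (field names illustrative):

# JsonLinesItemExporter: one standalone JSON object per line
{"author": "a", "content": "..."}
{"author": "b", "content": "..."}

# JsonItemExporter: a single JSON array, opened by start_exporting()
# and closed by finish_exporting()
[{"author": "a", "content": "..."},
{"author": "b", "content": "..."}]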
Example #29
class PowerMarketPipeline:
    def __init__(self):
        pass

    def open_spider(self, spiders):
        print("------ TablePipeline start ------")

    def process_item(self, item, spider):
        if isinstance(item, CurrentItem):
            self.fp = open(item['rename'], 'wb')
            self.exporters = JsonLinesItemExporter(self.fp,
                                                   ensure_ascii=False,
                                                   encoding='utf-8')
            self.exporters.export_item(item)
            self.fp.close()
        else:
            item_type = str(type(item))
            print('------ Warning, PowerMarketPipeline: ItemType(' +
                  item_type + ') match failed, so skipping it. ------')
        return item

    def close_spider(self, spider):
        print("------ TablePipeline end ------")
Example #30
class FangPipeline:
    def __init__(self):
        self.fp_new = open("new.json", 'bw')
        self.exporter_new = JsonLinesItemExporter(self.fp_new,
                                                  ensure_ascii=False,
                                                  encoding='utf-8')
        self.fp_old = open('old.json', 'wb')
        self.exporter_old = JsonLinesItemExporter(self.fp_old,
                                                  ensure_ascii=False,
                                                  encoding='utf-8')

    def process_item(self, item, spider):
        old_item = item.get('old')
        new_item = item.get('new')
        if new_item:
            self.exporter_new.export_item(item)
        if old_item:
            self.exporter_old.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp_new.close()
        self.fp_old.close()
Example #31
class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # if item['sale'] is not None:
        #     self.newhouse_exporter.export_item(item)
        # else:
        #     self.esfhouse_exporter.export_item(item)
        try:
            item['sale']  # present only on new-house items
            self.newhouse_exporter.export_item(item)
        except KeyError:
            self.esfhouse_exporter.export_item(item)

        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
Example #32
class NetlendingPipeline:
    def __init__(self):
        # 'wb' opens the file in binary mode
        self.fp = open("comments.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print("爬虫开始了...")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def process_info(self, info):
        info = [re.sub(r"\t|\n|\s", "", i) for i in info]
        info = [i for i in info if len(i) > 0]
        return info

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了...")
Example #33
class QsbkPipeline(object):
    # Advantage: every export_item call writes straight to disk, so data
    # does not pile up in memory. Drawback: the file is JSON Lines rather
    # than a single valid JSON document.
    def __init__(self):
        # 'wb' opens the file in binary mode, which the exporter requires
        self.fp = open("budejie.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding="utf-8")

    def open_spider(self, spider):
        '''Runs as soon as the spider opens'''
        print('Spider starting.....')

    def process_item(self, item, spider):
        '''Save the data'''
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        '''Runs when the spider closes'''
        # Close the file
        self.fp.close()
        print("Spider finished....")
Example #34
class OKCupidJsonPipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        #print item
        #uItem = urllib.urlencode(item)
        #jItem = dumps(uItem, cls=PythonObjectEncoder)
        self.exporter.export_item(item)
        return item
Example #35
class QsbkPipeline(object):
    def __init__(self):
        self.f = open("qsbk.json", "wb")
        self.exporter = JsonLinesItemExporter(self.f,
                                              ensure_ascii=False,
                                              encoding='utf-8')
        self.start_time = datetime.datetime.now()

    def open_spider(self, spider):
        print("[{}]开始抓取数据".format(self.start_time))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.f.close()
        end_time = datetime.datetime.now()
        print("数据抓取完毕,总计用时:{}".format(end_time - self.start_time))


# class QsbkPipeline(object):
#     def __init__(self):
#         self.f = open("qsbk.json", "w", encoding='utf-8')
#
#     def open_spider(self, spider):
#         print("开始抓取数据")
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         # print(item_json)
#         self.f.write(item_json + "\n")
#         return item
#
#     def close_spider(self, spider):
#         self.f.close()
#         print("数据抓取完毕")
Example #36
class SpiderPipeline(object):

    # The file is opened when the spider starts; this could equally go in
    # the open_spider hook instead of __init__.
    def __init__(self):
        # File the scraped data is written to ('wb' = binary write, which
        # the exporter requires). With binary writes, no encoding is
        # passed to open() itself.
        self.fp = open("saveText.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')
        # start_exporting is not required for the line-based exporter
        # self.exporter.start_exporting()

    # Called as soon as the spider opens
    def open_spider(self, spider):
        print("Spider started......")

    # Called for each item passed along while the spider runs
    def process_item(self, item, spider):
        # Manual alternative without an exporter: convert the item model
        # to a dict, dump it to a JSON string (keeping non-ASCII text),
        # and write it out one line at a time:
        # item_json = json.dumps(dict(item), ensure_ascii=False)
        # self.fp.write(item_json + "\n")
        # return item

        self.exporter.export_item(item)
        return item

    # Called once the spider has finished
    def close_spider(self, spider):
        # finish_exporting is not required for the line-based exporter
        # self.exporter.finish_exporting()
        self.fp.close()
        print("Spider finished......")
Example #37
class HuabanPipeline(object):
    def __init__(self):
        '''Open file to save the exported Items'''
        # save info of BoardItem
        self.board_info = open(
            'D:/litreily/Pictures/python/huaban/boards.json', 'w+b')
        self.board_exporter = JsonItemExporter(self.board_info,
                                               encoding='utf-8',
                                               indent=4)

        # save info of PinItem; unlike the JsonItemExporter above, the
        # line-based exporter must not be given indent, or each record
        # would span several lines and break the JSON Lines format
        self.pin_info = open('D:/litreily/Pictures/python/huaban/pins.json',
                             'w+b')
        self.pin_exporter = JsonLinesItemExporter(self.pin_info,
                                                  encoding='utf-8')

    def open_spider(self, spider):
        '''Start exporting BoardItem'''
        self.board_exporter.start_exporting()
        self.pin_exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BoardItem):
            self.board_exporter.export_item(item)
        elif isinstance(item, PinItem):
            self.pin_exporter.export_item(item)

        return item

    def close_spider(self, spider):
        '''finish exporting and close files'''
        self.board_exporter.finish_exporting()
        self.pin_exporter.finish_exporting()
        self.board_info.close()
        self.pin_info.close()
Example #38
class JsonExportPipeline(object):
    def __init__(self):
        self.file_name = 'svet_androida_links_to_articles.json'
        self.file_handle = None

    def open_spider(self, spider):
        print('JsonExportPipeline Exporter opened')

        file = open(self.file_name, 'wb')
        self.file_handle = file

        self.exporter = JsonLinesItemExporter(file,
                                              encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        print('JsonExportPipeline Exporter closed')
        self.exporter.finish_exporting()
        self.file_handle.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #39
class ScrapyGraphExport(object):

    #Open the exporting file and init the spider
    def __init__(self):
        dispatcher.connect(self.response_received,
                           signal=signals.response_received)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        self.output = {}
        self.file = open("/tmp/rawData_1.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.file,
                                              encoding='utf-8',
                                              ensure_ascii=False)

    def spider_opened(self, spider):
        print("Spider opened")

    def response_received(self, response, request, spider):
        # response.xpath replaces the deprecated HtmlXPathSelector API
        i = torItem()
        i['url'] = response.url
        i['http_status'] = response.status
        llinks = []
        # Collect absolute URLs for every non-javascript link; urljoin
        # (from urllib.parse) stands in for the long-deprecated urljoin_rfc
        for href in response.xpath('//a/@href').extract():
            if not href.lower().startswith("javascript"):
                llinks.append(urljoin(response.url, href))
        i['linkedurls'] = llinks
        if 'Referer' in request.headers:
            i['referer'] = request.headers['Referer']
            self.exporter.export_item(i)
            return i

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #40
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""
    def __init__(self):
        self.file = None
        self.exporter = None

        # compile regular expressions:

        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")

        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish exporting, then close the export file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                # .timestamp is a property in arrow<1.0; on arrow>=1.0 call .timestamp()
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitization, export the error
            # together with the url leading to the specific resolution
            # for later (human) review
            self.exporter.export_item({'error': repr(ex),
                                       'url': item["url"],
                                       })
        else:
            self.exporter.export_item(data)

        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError("Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title fields is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # return '\n'.join(item["body"]).strip()

        # body is originally a list of lines
        # it is intentionally not stripped
        # some resolutions have custom css, tables,
        # and other markup which I'd rather not process here,
        # but in a later stage, unrelated to the scraper
        return item["body"]