Example #1
class JsonPipelineExporterMixin:
    def __init__(self):
        self.files = {}  # spider_opened relies on this; it was never initialized

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        item = self.pre_process_item(item)
        self.exporter.export_item(item)
        return item

    def pre_process_item(self, item):
        return item
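
The mixin above leaves pre_process_item as the customization hook. A minimal sketch of a concrete subclass, assuming a dict-like item with a hypothetical 'title' field:

class CleanedJsonPipeline(JsonPipelineExporterMixin):
    # Hypothetical subclass: normalizes one field before export.
    def pre_process_item(self, item):
        if item.get('title'):
            item['title'] = item['title'].strip()
        return item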
Example #2
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_data.json' % spider.name, 'wb')  # plain binary open; JsonItemExporter handles encoding itself
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #3
class JsonPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/home/gaoliang/Desktop/result.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, ensure_ascii=False)  # ensure_ascii=False keeps Chinese text readable in the JSON output
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #4
class BaseFilePipeline(object):
    def __init__(self, saved_path):
        self.files = {}
        self.saved_path = saved_path
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls(crawler.settings.get('SAVED_PATH'))
        return pipeline


    def open_spider(self, spider):
        tp = self.gettype()['name']
        filename = '%s_%s.json' % (spider.name, tp)
        filename = os.path.join(self.saved_path, filename)

        file_ = open(filename, 'w+b')
        self.files[spider] = file_
        self.exporter = JsonItemExporter(file_, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def gettype(self):
        # Meant to be overridden by subclasses to supply the export type name.
        raise NotImplementedError

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file_ = self.files.pop(spider)
        file_.close()
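
Since BaseFilePipeline.gettype is meant to be overridden, a concrete subclass supplies the type descriptor and the usual process_item. An illustrative sketch (the subclass name and type are hypothetical):

class ArticleFilePipeline(BaseFilePipeline):
    def gettype(self):
        return {'name': 'article'}  # yields <spider>_article.json under SAVED_PATH

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item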
Example #5
class JsonExportPipeline(object):
    """
    app.pipelines.exporter_json.JsonExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file_json
        self.exporter = JsonItemExporter(file_json)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json = self.files.pop(spider)
        file_json.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #6
class JsonExportPipeline(object):
    def __init__(self):
        _log.info('JsonExportPipeline.init....')
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        _log.info('JsonExportPipeline.from_crawler....')
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        _log.info('JsonExportPipeline.spider_opened....')
        file = open('%s.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        _log.info('JsonExportPipeline.spider_closed....')
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        _log.info('JsonExportPipeline.process_item....')
        self.exporter.export_item(item)
        return item
Example #7
class SaveItemToJson(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file=file)
        print(self.exporter)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #8
class JsonPipeline(object):
  """Save Pipeline output to JSON."""
  def __init__(self, spider_name):
    self.file = open("output/{}_recipes.json".format(spider_name), 'wb')
    self.file.write(
        ('{"date_scraped": "%s", "recipes": ' % datetime.datetime.now()).encode('utf-8')
    )
    self.exporter = JsonItemExporter(self.file, encoding='utf-8',
                                     ensure_ascii=False)
    self.exporter.start_exporting()

  @classmethod
  def from_crawler(cls, crawler):
    return cls(
        spider_name=crawler.spider.name
    )

  def close_spider(self, spider):
    self.exporter.finish_exporting()
    self.file.write(b"}")
    self.file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
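
Written this way, the manual prefix from __init__ and the closing brace from close_spider wrap the exporter's JSON array, so the file ends up as one object of the rough shape {"date_scraped": "...", "recipes": [...]}.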
Example #9
class JsonExporterPipeline(object):
    # Use Scrapy's built-in JsonItemExporter to write the items to a JSON file.
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #10
class VisionsJsonPipeline(object):
    def __init__(self):
        self.exporter = None

    def open_spider(self, spider):
        self.file = open('%s.json' % spider.name, 'wb')  # keep a handle so the file can be closed later
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #11
class ExaminationPipeline(object):
    def __init__(self):
        self.file = open(str(uuid.uuid1()) + '.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #12
class JsonExporterPipeline(object):  # writes the items out as JSON; register it in settings.py (e.g. with priority 2) to test it
    # Use Scrapy's built-in JsonItemExporter to write a JSON file.
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf_8', ensure_ascii=False)  # instantiate the JsonItemExporter
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):  # pipelines.py receives each item; write it to the file here
        # Remember to return the item: the next pipeline may still need to process it.
        self.exporter.export_item(item)
        return item
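
As the comments note, none of these pipelines run until they are registered in the project settings. A minimal sketch, assuming the class lives in a hypothetical myproject/pipelines.py:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.JsonExporterPipeline': 2,  # lower numbers run earlier
}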
Example #13
class JsonPipeline(object):
    def __init__(self):
        self.file = open("newsCrawl.json", "wb")
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #14
class JsonExporterPipeline(object):
    def __init__(self):
        self.file = open('json_jobbole.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #15
class JsonExporterPipeline(object):
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #16
class SaveProductToFile(object):
    file = None

    def open_spider(self, spider):
        self.file = open('./walmart_products/%s.json' % spider.filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=4)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #17
class JsonExporterPipeline(object):
    # Use Scrapy's built-in JsonItemExporter to write a JSON file.
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #18
class JsonExporterPipeline(object):
    def __init__(self):
        self.file = open('articleexport.json', 'wb')  # open in binary mode
        self.exporter = JsonItemExporter(
            self.file, encoding="utf-8",
            ensure_ascii=False)  # instantiate a JsonItemExporter
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()  # stop exporting
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)  # hand the item to the exporter
        return item  # return it so the next pipeline can run
Example #19
class JsonPipeline(object):
    # Use Scrapy's built-in JsonItemExporter to write a JSON file.
    def __init__(self):
        self.file = open('article.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #20
class JobJsonExporter(object):
    def __init__(self):
        self.file = open('job_exporter.json', 'wb')  # plain binary open; the exporter handles encoding
        self.exporter = JsonItemExporter(file=self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # Hand the item to the exporter.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):  # was spider_closed, which never fires without signal wiring
        self.exporter.finish_exporting()
        self.file.close()
Example #21
class RrysSpiderPipeline(object):
    def open_spider(self, spider):
        self.file = open(result_json_path, "wb")  # result_json_path is assumed to be defined at module level
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        print(f"spider result is saved to {result_json_path}")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #22
class JsonExporterPipeline(object):
    # Use Scrapy's exporter to write the items out as JSON.
    def __init__(self):
        self.file = open('baike.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #23
class JinyongPipeline(object):
    def open_spider(self, spider):
        # Optional hook; called when the spider is opened.
        # Writes the output to jingyongBook.json.
        self.file = open('jingyongBook.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Optional hook; called when the spider is closed.
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #24
class NeihanbaPipeline(object):
    def process_item(self, item, spider):
        # This pipeline writes nothing until it is enabled in settings.py.
        self.writer.export_item(item)
        return item

    def open_spider(self, spider):
        self.file = open('neihanba.json', 'wb')  # the exporter writes bytes, hence binary mode
        # The exporter needs the file object to write to.
        self.writer = JsonItemExporter(self.file)
        self.writer.start_exporting()

    def close_spider(self, spider):
        # Finish the exporter before closing the file.
        self.writer.finish_exporting()
        self.file.close()
Example #25
class JsonExporterPipeline(object):
    # Key point: the file can be opened directly in __init__, and it must be passed to the exporter.
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')  # open in binary mode
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #26
class JsonExporterPipeline(object):
    # Use Scrapy's built-in JsonItemExporter to write a JSON file.
    def __init__(self):
        self.file = open("articleexport.json", 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):  # was spider_closed, which never fires without signal wiring
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #27
class JsonExporterPipeline(object):
    '''Export items to a JSON file with Scrapy's built-in exporter.'''
    def __init__(self):
        self.file = open('article_export.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):  # was spider_closed, which never fires without signal wiring
        self.exporter.finish_exporting()
        self.file.close()
Example #28
class JsonWriterPipeline:
    def open_spider(self, spider):
        os.makedirs("output/%s" % spider.name, exist_ok=True)

        self.file = open("output/%s/out.json" % spider.name, "w+b")
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #29
class JsonExporterPipeline(object):
    '''Scrapy's built-in way: use JsonItemExporter to write a JSON file.'''
    def __init__(self):
        self.file = open("articleexport.json", "wb")  # create the JSON file in binary mode
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #30
class JsonWithEncodingPipeline(object):
    def __init__(self):
        self.file = open('goodsinfo.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False,
                                         sort_keys=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
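
Note that JsonItemExporter forwards extra keyword arguments such as ensure_ascii, indent and sort_keys to the underlying JSON encoder, so pretty-printed, stably ordered output needs no manual serialization. A short sketch (the file name is illustrative):

file = open('pretty.json', 'wb')
exporter = JsonItemExporter(file, encoding='utf-8', ensure_ascii=False,
                            indent=2, sort_keys=True)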
Example #31
class JsonExportPipeline(object):
    # Use Scrapy's JSON export support to write a JSON file.
    def __init__(self):
        self.file = open('Taobao.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #32
class JsonExporterPipeline(object):
    # Open the output file.
    def __init__(self):  # Scrapy calls this first when it creates the pipeline
        self.file = open('article_export.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):  # was spider_closed, which never fires without signal wiring
        self.exporter.finish_exporting()
        self.file.close()
Example #33
class JsonPipeline(object):
    def __init__(self):
        # Open the file for binary writing.
        self.file = open('doubanTv.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #34
class JsonExporterPipeline(object):
    """
    Use Scrapy's JsonItemExporter to export items to a JSON file.
    """
    def __init__(self):
        self.file = open('article_exporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #35
class ArticleJsonExplorerPipeline(object):
    # Save the data as JSON using the framework's exporter.
    def __init__(self):
        self.file = open('article_explorer', 'wb')
        self.json_explorer = JsonItemExporter(self.file,
                                              encoding='utf-8',
                                              ensure_ascii=False)
        self.json_explorer.start_exporting()

    def process_item(self, item, spider):
        self.json_explorer.export_item(item)
        return item

    def close_spider(self, spider):  # was spider_closed, which never fires without signal wiring
        self.json_explorer.finish_exporting()
        self.file.close()
Example #36
class JsonExporterPipeline(object):
    # Use Scrapy's built-in JSON export support.
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item=item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #37
class JsonExporterPipeline(object):
    # Option 2: use Scrapy's JsonItemExporter; this boilerplate works unchanged for other crawls.
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #38
class JsonExporterPipPipeline(object):
    # Use Scrapy's JSON exporter to write a JSON file (XML and CSV exporters are also available).
    def __init__(self):
        self.file = open("youyuanexporter.json", "wb")
        self.exporter = JsonItemExporter(self.file,
                                         encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #39
class DoubanspiderPipeline(object):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.file = open('top250.json', 'wb')
        self.exporter = JsonItemExporter(self.file,
                                         encoding='utf-8',
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #40
class JobspiderJsonPipeline(object):
    '''
    Store the data in JSON format.
    '''
    def __init__(self):
        self.file = open('51job.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example #41
    def process_item(self, item, spider):

        designer_dir_name = skutils.escape_filename(item['name'])
        designer_dir_path = os.path.join(GlobalState.data_dir, designer_dir_name)
        file_path = os.path.join(designer_dir_path, designer_dir_name)

        # write json file
        with open('%s.json' % file_path, 'w+b') as f:
            exporter = JsonItemExporter(f)
            exporter.start_exporting()
            exporter.export_item(item)
            exporter.finish_exporting()

        # write excel file
        excelutils.write_designer_excel(item, file_path, designer_dir_name)

        return item
Example #42
class JsonWriterPipeline(BaseItemExporter):

  def __init__(self, **kwargs):
    self._configure(kwargs)
    self.files = {} 
    self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)
 
  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def spider_opened(self, spider):
    file = open('item.json', 'wb')  # binary mode; JsonItemExporter handles encoding itself
    self.files[spider] = file
    self.exporter = JsonItemExporter(file)
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close() 

  def process_item(self, item, spider):
    if item['title']:  # and item['image_url']
      # Strip embedded carriage returns and newlines from the free-text fields.
      for field in ('description', 'general_impression', 'subject_of_photo',
                    'composition', 'use_of_camera', 'depth_of_field',
                    'color_lighting', 'focus'):
        item[field] = re.sub(r"[\r\n]", "", item[field])
      self.exporter.export_item(item)
    return item
Example #43
class JsonExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #44
class WikicrawlerPipeline(object):

    def __init__(self):
        self.item_file = open('items.json', 'wb')
        self.exporter = JsonItemExporter(self.item_file)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
    def spider_closed(self):
        self.exporter.finish_exporting()
        self.item_file.close()
Example #45
class SiteMapJsonExportPipeline(object):
	'''Process the SiteMap spider output Items, and write them as JSON to an output file. The output file is taken from the Spider's config (spider.config)'''

	@classmethod
	def from_crawler(cls, crawler):
		''' Boilerplate '''
		pipeline = cls()
		crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
		crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
		return pipeline

	def spider_opened(self, spider):
		self.file = open(spider.config['map_file'], 'wb')
		self.exporter = JsonItemExporter(self.file)
		self.exporter.start_exporting()

	def spider_closed(self, spider):
		self.exporter.finish_exporting()
		self.file.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Example #47
class DuplicatesExportPipeline(object):

    def __init__(self):
        self.category_seen = set()
        self.product_seen = set()
        self.shop_seen = set()
        self.product_price_seen = set()

    def open_spider(self, spider):
        # Creates 4 files for storage scraped items
        self.category_file = open('spider/scraped/category.json', 'wb')
        self.category_exporter = JsonItemExporter(self.category_file, encoding="utf-8")
        self.category_exporter.start_exporting()
        self.product_file = open('spider/scraped/product.json', 'wb')
        self.product_exporter = JsonItemExporter(self.product_file, encoding="utf-8")
        self.product_exporter.start_exporting()
        self.shop_file = open('spider/scraped/shop.json', 'wb')
        self.shop_exporter = JsonItemExporter(self.shop_file, encoding="utf-8")
        self.shop_exporter.start_exporting()
        self.product_price_file = open('spider/scraped/productprice.json', 'wb')
        self.product_price_exporter = JsonItemExporter(self.product_price_file, encoding="utf-8")
        self.product_price_exporter.start_exporting()

    def close_spider(self, spider):
        # Closing exports and scraped item files

        self.category_exporter.finish_exporting()
        self.category_file.close()
        self.product_exporter.finish_exporting()
        self.product_file.close()
        self.shop_exporter.finish_exporting()
        self.shop_file.close()
        self.product_price_exporter.finish_exporting()
        self.product_price_file.close()

    def process_item(self, item, spider):

        if 'id' in item.keys() and 'name' in item.keys() and 'parent_category_id' in item.keys():
            # Drops duplicates in category
            if item['id'] in self.category_seen:
                raise DropItem("Duplicate category item found: %s" % item)
            else:
                self.category_seen.add(item['id'])
                # Exports category item
                self.category_exporter.export_item(item)
                return item

        if 'name' in item.keys() and 'category_id' in item.keys() and 'thumbnail_url' in item.keys() and 'url' in item.keys():
            # Drops duplicates in products

            if item['url'] in self.product_seen:
                raise DropItem("Duplicate product item found: %s" % item)
            else:
                self.product_seen.add(item['url'])
                # Exports category item
                self.product_exporter.export_item(item)
                return item

        if 'name' in item.keys() and 'url' in item.keys() and 'thumbnail_url' in item.keys():
            # Drops duplicates in shops
            if item['url'] in self.shop_seen:
                raise DropItem("Duplicate shop item found: %s" % item)

            else:
                self.shop_seen.add(item['url'])
                # Exports shop item
                self.shop_exporter.export_item(item)
                return item

        if 'shop_id' in item.keys() and 'product_id' in item.keys() and 'price' in item.keys() and 'price_and_shipment' in item.keys():
            # Drops duplicates in product price

            if item['shop_id'] + '-' + item['product_id'] in self.product_price_seen:
                raise DropItem("Duplicate product price item found: %s" % item)
            else:
                self.product_price_seen.add(item['shop_id'] + '-' + item['product_id'])
                # Exports product price item
                self.product_price_exporter.export_item(item)
                return item

        return item
class JsonExportPipeline(object):
    def __init__(self):
        self.files = []
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        if spider.__class__ == MayorsSpider:
            mayor_file = open("data/mayor_candidates.json", "w+b")
            council_file = open("data/city_counsils.json", "w+b")
            self.files.append(mayor_file)
            self.files.append(council_file)
            self.mayor_exporter = JsonItemExporter(mayor_file)
            self.council_exporter = JsonItemExporter(council_file)
            self.mayor_exporter.start_exporting()
            self.council_exporter.start_exporting()
        elif spider.__class__ == RegionCountiesSpider:
            counties_file = open("data/region_counties.json", "w+b")
            self.counties_exporter = JsonItemExporter(counties_file)
            self.counties_exporter.start_exporting()  # was missing; nothing would be written without it
            self.files.append(counties_file)

    def create_exporter(self, filename):
        file = open(filename, "w+b")
        exporter = JsonItemExporter(file)
        exporter.start_exporting()
        self.files.append(file)
        return exporter

    def spider_closed(self, spider):
        if spider.__class__ == MayorsSpider:
            self.mayor_exporter.finish_exporting()
            self.council_exporter.finish_exporting()
        elif spider.__class__ == RegionCountiesSpider:
            self.counties_exporter.finish_exporting()
            for exporter in self.exporters.values():  # itervalues() is Python 2 only
                exporter.finish_exporting()
        for file in self.files:
            file.close()

    def process_item(self, item, spider):
        if item.__class__ == CityCouncil:
            self.council_exporter.export_item(item)
        elif item.__class__ == MayorCandidate:
            self.mayor_exporter.export_item(item)
            self.counties_exporter.export_item(item)
        else:
            if item.__class__ == RegionCandidate:
                filename = "data/region_council_candidates.json"
            elif item.__class__ == RegionCounty:
                filename = "data/region_counties.json"
            elif item.__class__ == CityCouncilCandidate:
                filename = "data/city_council_candidates.json"
            elif item.__class__ == CityCounty:
                filename = "data/city_counties.json"
            exporter_name = item.__class__.__name__
            if exporter_name not in self.exporters:
                self.exporters[exporter_name] = self.create_exporter(filename)
            self.exporters[exporter_name].export_item(item)
        return item
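
The per-class bookkeeping in spider_opened and process_item above can be collapsed into one lazily created exporter per item type. A compact sketch of that pattern (file paths are illustrative):

class PerTypeJsonExportPipeline(object):
    def open_spider(self, spider):
        self.files = []
        self.exporters = {}

    def process_item(self, item, spider):
        name = item.__class__.__name__
        if name not in self.exporters:
            # Lazily create one exporter (and file) per item class.
            file = open('data/%s.json' % name.lower(), 'w+b')
            exporter = JsonItemExporter(file)
            exporter.start_exporting()
            self.files.append(file)
            self.exporters[name] = exporter
        self.exporters[name].export_item(item)
        return item

    def close_spider(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files:
            file.close()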