示例#1
0
    def open_spider(self, spider):
        """Open per-type output files and start a JSON-lines exporter for each.

        Creates ``data/<spider>_questions.json`` and ``data/<spider>_answers.json``
        in append mode and registers a started JsonLinesItemExporter for each
        under ``self.exporter['question']`` / ``self.exporter['answer']``.
        """
        # print() works on both Python 2 and 3; the original used the
        # Python-2-only print statement.
        print('open spider')

        # NOTE(review): assumes the 'data/' directory already exists — confirm.
        self.files['question'] = open(
            'data/%s_questions.json' % spider.name, 'a+b')
        self.files['answer'] = open(
            'data/%s_answers.json' % spider.name, 'a+b')

        self.exporter['question'] = JsonLinesItemExporter(
            self.files['question'])
        self.exporter['answer'] = JsonLinesItemExporter(self.files['answer'])
        # dict.values() works on Python 2 and 3; itervalues() is Python-2 only.
        for exporter in self.exporter.values():
            exporter.start_exporting()
示例#2
0
 def spider_opened(self, spider):
     """Open today's per-spider output file and begin JSON-lines export."""
     today = datetime.date.today().isoformat()
     out_path = '%s/%s/%s.json' % (settings.DATA_DIR, spider.name, today)
     out_file = open(out_path, 'w+b')
     self.files[spider] = out_file
     self.exporter = JsonLinesItemExporter(out_file)
     self.exporter.start_exporting()
示例#3
0
 def open_spider(self, spider):
     """For feed-mode spiders, open the feed output file for appending."""
     if not FeedSpider.is_feed_op(spider):
         return
     spider.make_sure_path_exists(spider.get_output_dir_path())
     file_name = spider.get_feed_output_file_path()
     self.file = open(file_name, 'a')
     self.item_exporter = JsonLinesItemExporter(self.file)
     log.msg('FeedWriterPipeline, opened file %s to append.' % file_name)
示例#4
0
 def get_exporter(self, spider, item):
     """Return (creating and starting on first use) the exporter for the
     file this item belongs to."""
     filename = self.get_filename(spider, item)
     try:
         return self.exporters[filename]
     except KeyError:
         exporter = JsonLinesItemExporter(self.get_file(spider, item))
         exporter.start_exporting()
         self.exporters[filename] = exporter
         return exporter
示例#5
0
 def spider_opened(self, spider):
     """Pick the output file based on which marker files exist, then export.

     ``check_item1.txt`` / ``check_item2.txt`` in the script directory act as
     progress markers: items1 is written first, items2 on a later run, and
     nothing once both markers exist.
     """
     path = os.path.abspath("dir.txt").replace("dir.txt", "")
     # For checking usage
     if os.path.isfile(path + 'check_item1.txt'):
         if os.path.isfile(path + 'check_item2.txt'):
             print('check_item2 is already exist.')
         else:
             self._start_exporting(spider, '%s_items2.json' % spider.name)
             print('check_item1 is already exist.')
     else:
         self._start_exporting(spider, '%s_items1.json' % spider.name)

 def _start_exporting(self, spider, filename):
     # Open the output file, remember it for later closing, and begin the
     # JSON-lines export (shared by both marker branches above).
     file = open(filename, 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file)
     self.exporter.start_exporting()
示例#6
0
 def open_spider(self, spider):
     """Open one append-mode .jsonl file per exported item class and start a
     JSON-lines exporter for each."""
     self.files = {}
     self.exporters = {}
     for name in items.__all__:
         path = os.path.join(self.path, name + '.jsonl')
         output = open(path, 'a+')
         self.files[name] = output
         self.exporters[name] = JsonLinesItemExporter(output)
     # dict.values() works on Python 2 and 3; itervalues() is Python-2 only.
     for e in self.exporters.values():
         e.start_exporting()
示例#7
0
    def spider_opened(self, spider):
        """Open one output file per Amazon item type and set up its exporter.

        Files are written under OUTPUT_PATH, suffixed with the spider's uid.
        """
        out_dir = settings.get('OUTPUT_PATH')

        self.reviews_file = open(
            out_dir + '/reviews' + spider.uid + '.json', 'w')
        self.products_file = open(
            out_dir + '/products' + spider.uid + '.json', 'w')
        self.customers_file = open(
            out_dir + '/customers' + spider.uid + '.json', 'w')

        # set up a separate exporter for each item type
        self.exporters = {
            'AmznReviewItem': JsonLinesItemExporter(self.reviews_file),
            'AmznProductItem': JsonLinesItemExporter(self.products_file),
            'AmznCustomerItem': JsonLinesItemExporter(self.customers_file),
        }
        # start_exporting() was missing: it is part of the BaseItemExporter
        # lifecycle contract (a no-op for JsonLinesItemExporter, but required
        # for consistency and for swapping in other exporter classes).
        for exporter in self.exporters.values():
            exporter.start_exporting()
示例#8
0
 def get_exporter(self, item):
     """Return (creating and starting on first use) the exporter for this
     item's class.

     unused_genotype_data items go to a JSON-lines file; everything else
     goes to CSV.
     """
     cls = item.__class__
     if cls not in self.exporters:
         # NOTE(review): these file handles are never stored anywhere, so
         # they cannot be flushed/closed explicitly — confirm that
         # finish_exporting/close is handled elsewhere.
         if cls is items.unused_genotype_data:  # identity, not ==, for classes
             exporter = JsonLinesItemExporter(
                 open(_class_to_file(cls), 'w+b'))
         else:
             exporter = CsvItemExporter(
                 open(_class_to_file(cls), 'w+b'))
         self.exporters[cls] = exporter
         exporter.start_exporting()
     return self.exporters[cls]
示例#9
0
    def process_item(self, item, spider):
        """
        Writes the item to output
        """
        # Lazily create one (file, exporter) pair per item class per spider.
        settings = spider.crawler.settings
        per_spider = self.xporters[spider.name]
        item_cls = item.__class__
        if item_cls not in per_spider:
            dirpath = path.join(settings.get('IO_PATH', 'io'),
                                settings['DATA_SET'])
            _mkdir_p(dirpath)
            filename = '%s.json' % item.export_filename
            file_h = open(path.join(dirpath, filename), 'w')
            xporter = JsonLinesItemExporter(file=file_h)
            xporter.start_exporting()
            per_spider[item_cls] = (file_h, xporter)

        per_spider[item_cls][1].export_item(item)
        return item
示例#10
0
 def spider_opened(self, spider):
     """Open the spider's output file and start a field-limited exporter."""
     out_file = open(_get_spider_output_filename(spider), 'wb')
     self.files[spider] = out_file
     exporter = JsonLinesItemExporter(out_file)
     exporter.fields_to_export = _get_fields_to_check(ProductItem)
     exporter.start_exporting()
     self.exporter = exporter
示例#11
0
 def spider_opened(self, spider):
     """Open the per-spider products file under the results folder and start
     the JSON-lines exporter (Windows-style path, as in the original)."""
     out_path = currpath + '\\results\\' + '%s_products.json' % spider.name
     out_file = open(out_path, 'w+b')
     self.files[spider] = out_file
     self.exporter = JsonLinesItemExporter(out_file)
     self.exporter.start_exporting()
示例#12
0
 def open_spider(self, spider):
     """Append-open data/<spider>.jsonlines and start the exporter."""
     out_name = 'data/' + spider.name + '.jsonlines'
     self.file = open(out_name, 'a')
     self.exporter = JsonLinesItemExporter(self.file)
     self.exporter.start_exporting()
示例#13
0
 def spider_opened(self, spider):
     """Open <spider>_<proj>.<suffix> for writing and start the exporter."""
     out_name = '{0}_{1}.{2}'.format(spider.name, self.proj, self.suffix)
     out_file = open(out_name, 'w+b')
     self.files[spider] = out_file
     self.exporter = JsonLinesItemExporter(out_file)
     self.exporter.start_exporting()
示例#14
0
 def open_spider(self, spider):
     """Open a timestamped per-spider output file and start the exporter."""
     # print() works on both Python 2 and 3; the original used the
     # Python-2-only print statement.
     print("===open_spider===")
     out_name = 'data/%s_products_%s.json' % (
         spider.name, datetime.now().strftime("%Y%m%d%H%M%S"))
     file = open(out_name, 'w+b')
     self.files[spider] = file
     # ensure_ascii=False keeps non-ASCII characters readable in the output.
     self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
     self.exporter.start_exporting()
示例#15
0
 def open_spider(self, spider):
     """Start a JSON-lines exporter over the already-open self.file."""
     exporter = JsonLinesItemExporter(self.file)
     exporter.start_exporting()
     self.exporter = exporter
示例#16
0
 def spider_opened(self, spider):
     """Open the fixed output file and start the JSON-lines exporter."""
     out_file = open('2_reports.json', 'w+b')
     self.exporter = JsonLinesItemExporter(out_file)
     self.exporter.start_exporting()
示例#17
0
 def process_item(self, item, spider):
     """Write one item as a UTF-8 JSON line to self.file; pass item through."""
     # NOTE(review): a fresh exporter is built for every item — presumably
     # harmless for JsonLinesItemExporter, but hoisting it into the pipeline's
     # open hook would be cheaper; confirm before changing.
     JsonLinesItemExporter(self.file, encoding='utf-8').export_item(item)
     return item
示例#18
0
 def spider_opened(self, spider):
     """Open <spider>_items.json for writing and begin JSON-lines export."""
     out_file = open('%s_items.json' % spider.name, 'w+b')
     self.files[spider] = out_file
     self.exporter = JsonLinesItemExporter(out_file)
     self.exporter.start_exporting()
示例#19
0
    def open_spider(self, spider):
        """Create the scrape output directory and one started exporter per
        entity type.

        The directory depends on spider.scrapeMode (FIXERRORS / FULL /
        INCREMENTAL / anything else = test run). Ten JSON-lines output files
        are opened inside it, plus scrapeInfo.txt recording the start time.
        """
        self.startTime = datetime.datetime.now()
        nowStr = self.startTime.strftime("%Y-%m-%d %H:%M")
        # NOTE(review): nowStr contains ':', which is illegal in Windows
        # directory names — presumably this only runs on POSIX; confirm.
        if spider.scrapeMode == "FIXERRORS":
            scrapeDir = spider.fixpath + "_FIX"
        else:
            # Map the scrape mode to its parent folder; anything unknown is
            # treated as a test scrape (same fallback as the original chain).
            typeDir = {"FULL": "FullScrapes",
                       "INCREMENTAL": "IncrementalScrapes"}.get(
                           spider.scrapeMode, "TestScrapes")
            if not os.path.exists(typeDir):
                os.makedirs(typeDir)
            scrapeDir = typeDir + "/" + nowStr

        if not os.path.exists(scrapeDir):
            os.makedirs(scrapeDir)

        spider.setScrapePath(scrapeDir)

        # One (file, started exporter) pair per output entity; the helper
        # removes the original's ten copies of the open/export boilerplate.
        self.tendersfile, self.tenderExporter = \
            self._open_exporter(scrapeDir, "tenders.json")
        self.procuringEntitiesfile, self.procurerExporter = \
            self._open_exporter(scrapeDir, 'organisations.json')
        self.tenderBiddersFile, self.biddersExporter = \
            self._open_exporter(scrapeDir, 'tenderBidders.json')
        self.tenderAgreementsFile, self.agreementExporter = \
            self._open_exporter(scrapeDir, 'tenderAgreements.json')
        self.tenderDocumentationFile, self.documentationExporter = \
            self._open_exporter(scrapeDir, 'tenderDocumentation.json')
        self.tenderCPVCodeFile, self.cpvCodeExporter = \
            self._open_exporter(scrapeDir, 'tenderCPVCode.json')
        self.whiteListFile, self.whiteListExporter = \
            self._open_exporter(scrapeDir, 'whiteList.json')
        self.blackListFile, self.blackListExporter = \
            self._open_exporter(scrapeDir, 'blackList.json')
        self.complaintFile, self.complaintExporter = \
            self._open_exporter(scrapeDir, 'complaints.json')
        self.bidderResultFile, self.bidderResultExporter = \
            self._open_exporter(scrapeDir, 'bidderResult.json')

        # Text mode here: the original opened this file 'wb' and then wrote
        # str, which raises TypeError on Python 3.
        self.infoFile = open(scrapeDir + "/" + 'scrapeInfo.txt', 'w')
        self.infoFile.write("StartTime: " + nowStr + "\n")

    def _open_exporter(self, scrapeDir, filename):
        # Open scrapeDir/filename for binary writing and return the file plus
        # a started JsonLinesItemExporter over it.
        out_file = open(scrapeDir + "/" + filename, 'wb')
        exporter = JsonLinesItemExporter(out_file)
        exporter.start_exporting()
        return out_file, exporter
示例#20
0
 def spider_opened(self, spider):
     """Open <spider>.linejson and start a field-limited JSON-lines exporter."""
     # NOTE(review): the file handle is only reachable through the exporter —
     # confirm it is flushed/closed when the spider closes.
     out_file = open(spider.name + ".linejson", "w")
     self.jsonlines_exporter = JsonLinesItemExporter(
         out_file, fields_to_export=self.fields_to_export)
     self.jsonlines_exporter.start_exporting()
示例#21
0
 def spider_opened(self, spider):
     self.file = open('scraped/%s.json' % spider.external_id, 'w')
     self.exporter = JsonLinesItemExporter(self.file)
     self.exporter.start_exporting()
示例#22
0
 def spider_opened(self, spider):
     """Open articles.json as UTF-8 text and start a non-ASCII-safe exporter."""
     # codecs.open gives a UTF-8 text stream; ensure_ascii=False writes real
     # non-ASCII characters instead of \uXXXX escapes.
     out_file = codecs.open('articles.json', 'w+', encoding='utf-8')
     self.files[spider] = out_file
     self.exporter = JsonLinesItemExporter(out_file, ensure_ascii=False)
     self.exporter.start_exporting()
 def _get_exporter(self, **kwargs):
     """Build a JSON-lines exporter over self.output with the given options."""
     exporter = JsonLinesItemExporter(self.output, **kwargs)
     return exporter
示例#24
0
 def spider_opened(self, spider):
     """Open data/<spider>.json and start the JSON-lines exporter."""
     out_file = open(os.path.join('data', '%s.json' % spider.name), 'w+b')
     self.files[spider] = out_file
     self.exporter = JsonLinesItemExporter(out_file)
     self.exporter.start_exporting()
示例#25
0
 def spider_opened(self, spider):
     """Open <spider>Items.json and start an exporter that keeps CJK readable."""
     out_file = open('%sItems.json' % spider.name, 'w+b')
     self.files[spider] = out_file
     # json.dumps escapes Chinese text to ASCII by default; ensure_ascii=False
     # makes the exporter emit the actual Chinese characters.
     self.exporter = JsonLinesItemExporter(out_file, ensure_ascii=False)
     self.exporter.start_exporting()
示例#26
0
文件: pipelines.py 项目: etongle/dc
 def open_spider(self, spider):
     self.file = open('test.json', 'w+b')
     self.exporter = JsonLinesItemExporter(self.file)