def open_spider(self, spider):
    """Open per-item-type output files and start a JSON-lines exporter for each.

    Creates ``data/<spider>_questions.json`` and ``data/<spider>_answers.json``
    in binary append mode and registers one JsonLinesItemExporter per file.
    """
    # print as a function call so this runs on both Python 2 and 3
    # (the original used a Python-2-only print statement).
    print('open spider')
    self.files['question'] = open(
        ''.join(['data/', spider.name, '_questions', '.json']), 'a+b')
    self.files['answer'] = open(
        ''.join(['data/', spider.name, '_answers', '.json']), 'a+b')
    self.exporter['question'] = JsonLinesItemExporter(
        self.files['question'])
    self.exporter['answer'] = JsonLinesItemExporter(self.files['answer'])
    # .values() works on Python 2 and 3; .itervalues() is Python-2-only.
    for exporter in self.exporter.values():
        exporter.start_exporting()
def spider_opened(self, spider):
    """Open today's dated JSON output file for *spider* and begin exporting."""
    output_path = '%s/%s/%s.json' % (
        settings.DATA_DIR, spider.name, datetime.date.today().isoformat())
    output = open(output_path, 'w+b')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """For feed-operation spiders, open the feed output file in append mode."""
    if not FeedSpider.is_feed_op(spider):
        return
    spider.make_sure_path_exists(spider.get_output_dir_path())
    file_name = spider.get_feed_output_file_path()
    self.file = open(file_name, 'a')
    self.item_exporter = JsonLinesItemExporter(self.file)
    log.msg('FeedWriterPipeline, opened file %s to append.' % file_name)
def get_exporter(self, spider, item):
    """Return the exporter for this item's output file, creating it on first use."""
    filename = self.get_filename(spider, item)
    try:
        return self.exporters[filename]
    except KeyError:
        pass
    output = self.get_file(spider, item)
    exporter = JsonLinesItemExporter(output)
    exporter.start_exporting()
    self.exporters[filename] = exporter
    return exporter
def spider_opened(self, spider):
    """Pick the output file based on marker files, then start exporting.

    ``check_item1.txt`` / ``check_item2.txt`` next to ``dir.txt`` act as
    progress markers: with neither present we write ``<name>_items1.json``;
    with only the first present we write ``<name>_items2.json``.

    NOTE(review): when BOTH markers exist no exporter is created at all, so
    a later ``process_item`` that uses ``self.exporter`` would fail — confirm
    this is the intended terminal state.
    """
    path = os.path.abspath("dir.txt").replace("dir.txt", "")
    # For checking usage
    if os.path.isfile(path + 'check_item1.txt'):
        if os.path.isfile(path + 'check_item2.txt'):
            print('check_item2 is already exist.')
        else:
            self._start_exporter(spider, '%s_items2.json' % spider.name)
        print('check_item1 is already exist.')
    else:
        self._start_exporter(spider, '%s_items1.json' % spider.name)

def _start_exporter(self, spider, filename):
    """Open *filename*, remember the handle, and begin JSON-lines exporting."""
    output = open(filename, 'w+b')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Open one append-mode .jsonl file and exporter per declared item type.

    Iterates ``items.__all__`` so every item class declared by the project
    gets its own output file under ``self.path``.
    """
    self.files = {}
    self.exporters = {}
    for name in items.__all__:
        path = os.path.join(self.path, name + '.jsonl')
        output = open(path, 'a+')
        self.files[name] = output
        self.exporters[name] = JsonLinesItemExporter(output)
    # .values() works on Python 2 and 3; .itervalues() is Python-2-only.
    for e in self.exporters.values():
        e.start_exporting()
def spider_opened(self, spider):
    """Open per-item-type output files and an exporter for each item class.

    Output filenames include ``spider.uid`` so concurrent runs do not
    collide.  The original never called ``start_exporting()``; it is part of
    the exporter contract (a no-op for JsonLinesItemExporter), so it is now
    invoked for consistency with the rest of the pipeline code.
    """
    out_dir = settings.get('OUTPUT_PATH')
    self.reviews_file = open(out_dir + '/reviews' + spider.uid + '.json', 'w')
    self.products_file = open(out_dir + '/products' + spider.uid + '.json', 'w')
    self.customers_file = open(out_dir + '/customers' + spider.uid + '.json', 'w')
    # set up a separate exporter for each item type
    self.exporters = {
        'AmznReviewItem': JsonLinesItemExporter(self.reviews_file),
        'AmznProductItem': JsonLinesItemExporter(self.products_file),
        'AmznCustomerItem': JsonLinesItemExporter(self.customers_file),
    }
    for exporter in self.exporters.values():
        exporter.start_exporting()
def get_exporter(self, item):
    """Return the cached exporter for *item*'s class, creating it on demand.

    ``unused_genotype_data`` items are written as JSON lines; every other
    item class is written as CSV.  NOTE(review): the file handle opened here
    is never stored anywhere it can be closed — verify shutdown handling.
    """
    cls = item.__class__
    if cls in self.exporters:
        return self.exporters[cls]
    output = open(_class_to_file(cls), 'w+b')
    if cls == items.unused_genotype_data:
        exporter = JsonLinesItemExporter(output)
    else:
        exporter = CsvItemExporter(output)
    self.exporters[cls] = exporter
    exporter.start_exporting()
    return exporter
def process_item(self, item, spider):
    """Write *item* to its per-class JSON-lines file and pass it through."""
    settings = spider.crawler.settings
    spider_exporters = self.xporters[spider.name]
    item_cls = item.__class__
    if item_cls not in spider_exporters:
        # first item of this class for this spider: create its output file
        dirpath = path.join(settings.get('IO_PATH', 'io'),
                            settings['DATA_SET'])
        _mkdir_p(dirpath)
        filename = '%s.json' % item.export_filename
        file_h = open(path.join(dirpath, filename), 'w')
        xporter = JsonLinesItemExporter(file=file_h)
        xporter.start_exporting()
        spider_exporters[item_cls] = (file_h, xporter)
    spider_exporters[item_cls][1].export_item(item)
    return item
def spider_opened(self, spider):
    """Open the spider's output file and start a field-restricted exporter."""
    # 'fname' in the original was actually a file object, not a name
    output = open(_get_spider_output_filename(spider), 'wb')
    self.files[spider] = output
    exporter = JsonLinesItemExporter(output)
    exporter.fields_to_export = _get_fields_to_check(ProductItem)
    exporter.start_exporting()
    self.exporter = exporter
def spider_opened(self, spider):
    """Open <currpath>/results/<spider>_products.json and start exporting.

    Uses ``os.path.join`` instead of hard-coded backslashes so the path is
    correct on non-Windows platforms too (on Windows the resulting path is
    identical to the original's).
    """
    import os  # local import: top-of-file import block not visible here
    output = open(
        os.path.join(currpath, 'results', '%s_products.json' % spider.name),
        'w+b')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Open data/<spider>.jsonlines in append mode and begin exporting."""
    output_path = 'data/' + spider.name + '.jsonlines'
    self.file = open(output_path, 'a')
    self.exporter = JsonLinesItemExporter(self.file)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open <spider>_<proj>.<suffix> for writing and start a line exporter."""
    output_name = '{0}_{1}.{2}'.format(spider.name, self.proj, self.suffix)
    output = open(output_name, 'w+b')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Open a timestamped products file and start a non-ASCII-friendly exporter."""
    # print as a function call so this runs on both Python 2 and 3
    # (the original used a Python-2-only print statement).
    print("===open_spider===")
    output = open('data/%s_products_%s.json'
                  % (spider.name, datetime.now().strftime("%Y%m%d%H%M%S")),
                  'w+b')
    self.files[spider] = output
    # ensure_ascii=False keeps non-ASCII text readable in the output file
    self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Begin JSON-lines exporting to the already-open ``self.file`` handle."""
    exporter = JsonLinesItemExporter(self.file)
    exporter.start_exporting()
    self.exporter = exporter
def spider_opened(self, spider):
    """Open the fixed-name report file '2_reports.json' and start exporting."""
    output = open('2_reports.json', 'w+b')
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def process_item(self, item, spider):
    """Append *item* as one UTF-8 JSON line to ``self.file`` and pass it on.

    NOTE(review): a fresh exporter is constructed for every item; hoisting
    it into the pipeline's open hook would avoid the per-item allocation —
    confirm before changing.
    """
    line_exporter = JsonLinesItemExporter(self.file, encoding='utf-8')
    line_exporter.export_item(item)
    return item
def spider_opened(self, spider):
    """Open <spider>_items.json for writing and begin JSON-lines exporting."""
    output = open('%s_items.json' % spider.name, 'w+b')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Create the scrape directory for this run and open all output files.

    The directory depends on ``spider.scrapeMode`` (FIXERRORS / FULL /
    INCREMENTAL / test).  Ten JSON-lines output files and their exporters
    are created table-driven instead of the original copy-paste; every
    attribute name (``self.tendersfile``, ``self.tenderExporter``, ...) is
    preserved so close/flush code elsewhere keeps working.

    NOTE(review): ``nowStr`` contains ':' which is invalid in Windows
    directory names — presumably this runs on a POSIX system; confirm.
    The info file is opened 'wb' and written a str, which only works on
    Python 2; keep in mind if porting.
    """
    self.startTime = datetime.datetime.now()
    nowStr = self.startTime.strftime("%Y-%m-%d %H:%M")
    if spider.scrapeMode == "FIXERRORS":
        scrapeDir = spider.fixpath + "_FIX"
    else:
        if spider.scrapeMode == "FULL":
            typeDir = "FullScrapes/"
        elif spider.scrapeMode == "INCREMENTAL":
            typeDir = "IncrementalScrapes/"
        else:
            typeDir = "TestScrapes/"
        if not os.path.exists(typeDir):
            os.makedirs(typeDir)
        scrapeDir = typeDir + nowStr
    if not os.path.exists(scrapeDir):
        os.makedirs(scrapeDir)
    spider.setScrapePath(scrapeDir)
    # (file attribute, exporter attribute, output filename)
    outputs = [
        ('tendersfile', 'tenderExporter', 'tenders.json'),
        ('procuringEntitiesfile', 'procurerExporter', 'organisations.json'),
        ('tenderBiddersFile', 'biddersExporter', 'tenderBidders.json'),
        ('tenderAgreementsFile', 'agreementExporter', 'tenderAgreements.json'),
        ('tenderDocumentationFile', 'documentationExporter',
         'tenderDocumentation.json'),
        ('tenderCPVCodeFile', 'cpvCodeExporter', 'tenderCPVCode.json'),
        ('whiteListFile', 'whiteListExporter', 'whiteList.json'),
        ('blackListFile', 'blackListExporter', 'blackList.json'),
        ('complaintFile', 'complaintExporter', 'complaints.json'),
        ('bidderResultFile', 'bidderResultExporter', 'bidderResult.json'),
    ]
    for file_attr, exporter_attr, filename in outputs:
        output = open(scrapeDir + "/" + filename, 'wb')
        setattr(self, file_attr, output)
        exporter = JsonLinesItemExporter(output)
        exporter.start_exporting()
        setattr(self, exporter_attr, exporter)
    self.infoFile = open(scrapeDir + "/" + 'scrapeInfo.txt', 'wb')
    self.infoFile.write("StartTime: " + nowStr + "\n")
def spider_opened(self, spider):
    """Open <spider>.linejson and start a field-restricted line exporter.

    The file handle is kept on ``self`` so it can be closed when the
    spider closes; the original opened it inline and leaked the handle.
    """
    self.jsonlines_file = open(spider.name + ".linejson", "w")
    self.jsonlines_exporter = JsonLinesItemExporter(
        self.jsonlines_file, fields_to_export=self.fields_to_export)
    self.jsonlines_exporter.start_exporting()
def spider_opened(self, spider):
    """Open scraped/<external_id>.json for writing and begin exporting."""
    output_path = 'scraped/%s.json' % spider.external_id
    self.file = open(output_path, 'w')
    self.exporter = JsonLinesItemExporter(self.file)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open articles.json as a UTF-8 text file and begin exporting.

    ensure_ascii=False together with the codecs UTF-8 wrapper keeps
    non-ASCII characters literal in the output instead of \\u-escaped.
    """
    output = codecs.open('articles.json', 'w+', encoding='utf-8')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)
    self.exporter.start_exporting()
def _get_exporter(self, **kwargs):
    """Build a JSON-lines exporter over ``self.output``, forwarding options."""
    exporter = JsonLinesItemExporter(self.output, **kwargs)
    return exporter
def spider_opened(self, spider):
    """Open data/<spider>.json for writing and begin JSON-lines exporting."""
    json_path = os.path.join('data', '%s.json' % spider.name)
    output = open(json_path, 'w+b')
    self.files[spider] = output
    self.exporter = JsonLinesItemExporter(output)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open <spider>Items.json and export with non-ASCII output enabled."""
    output = open('%sItems.json' % spider.name, 'w+b')
    self.files[spider] = output
    # json.dumps defaults to ASCII-escaping non-ASCII text; passing
    # ensure_ascii=False keeps Chinese (and other non-ASCII) characters
    # readable in the output file.
    self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Open test.json for writing and start the JSON-lines exporter.

    The original never called ``start_exporting()``; the Scrapy exporter
    contract requires it before ``export_item``, and every sibling pipeline
    in this code calls it, so it is added here for correctness/consistency.
    """
    self.file = open('test.json', 'w+b')
    self.exporter = JsonLinesItemExporter(self.file)
    self.exporter.start_exporting()