示例#1
0
class DmozPipeline(object):
    """Item pipeline that persists every processed item on disk.

    The item's content goes to a flat ``PDF`` or ``HTML`` directory below
    ``DATA_PATH``; the item's serialized fields (with ``content`` replaced
    by the location of that file) are dumped as JSON into a directory tree
    below ``DATA_PATH/Top`` that follows the item's category.
    """

    def __init__(self):
        # Only used for its field serialization in process_item().
        self.exporter = BaseItemExporter()

    def process_item(self, domain, item):
        """Store the item, serialized as JSON, in a file within a directory
        hierarchy corresponding to its place in the ontology.

        Args:
            domain: Not used by this method.
            item: Mapping-like item. The keys ``category``, ``name``,
                ``type`` and ``content`` are read; ``content`` is
                overwritten with the path of the file the content was
                written to.

        Returns:
            The same ``item`` object, with ``content`` replaced as
            described above.
        """
        # The path is of the form: cat/subcat/leaf
        filedir = os.path.join(DATA_PATH, 'Top', item['category'])

        # Store the contents in files apart (to optimize the json loadings)
        pdfdir = os.path.join(DATA_PATH, 'PDF')
        htmldir = os.path.join(DATA_PATH, 'HTML')
        for directory in (pdfdir, htmldir):
            if not os.path.isdir(directory):
                os.makedirs(directory)

        # Replace evil characters for an underscore
        # cf. http://www.linfo.org/file_name.html
        rawname = re.sub(r'[ /.$%]+', '_', item['name'])

        # Truncate the JSON path if it exceeds the permitted maximum;
        # slicing is a no-op when the path is already short enough.
        filename = os.path.join(filedir, rawname)[:MAX_FILENAME_LENGTH]

        if item['type'] == 'pdf':
            # PDF content is written untouched, in binary mode.
            content_str = os.path.join(pdfdir, rawname + ".pdf")
            content_file = open(content_str, 'wb')
        else:
            # Anything else is written as UTF-8 encoded text.
            content_str = os.path.join(htmldir, rawname)
            content_file = codecs.open(content_str, 'w', 'utf-8')
        # The context manager closes the handle even when write() raises.
        with content_file:
            content_file.write(item['content'])

        # From here on the item carries the location of its content instead
        # of the content itself; every occurrence of HOME in that path is
        # swapped for the literal text '$HOME'.
        item['content'] = content_str.replace(HOME, '$HOME')

        if not os.path.isdir(filedir):
            os.makedirs(filedir)
        # _get_serialized_fields() yields (name, value) pairs, hence dict().
        itemdict = dict(self.exporter._get_serialized_fields(item))
        with open(filename, 'w') as json_file:
            json.dump(itemdict, json_file)
        return item
 def _get_exporter(self, **kwargs):
     """Build a new BaseItemExporter, forwarding every keyword argument to it."""
     exporter = BaseItemExporter(**kwargs)
     return exporter
示例#3
0
 def finish_exporting(self):
     """Delegate to BaseItemExporter.finish_exporting; adds no behavior of its own."""
     parent_finish = BaseItemExporter.finish_exporting
     parent_finish(self)
示例#4
0
 def start_exporting(self):
     """Delegate to BaseItemExporter.start_exporting; adds no behavior of its own."""
     parent_start = BaseItemExporter.start_exporting
     parent_start(self)
示例#5
0
	def __init__(self):
		"""Create a default-configured BaseItemExporter and keep it as ``self.exporter``."""
		exporter = BaseItemExporter()
		self.exporter = exporter
示例#6
-1
 def __init__(self, file, **kwargs):
     """Initialise the base exporter, then remember the output file.

     Every keyword argument is forwarded untouched to
     BaseItemExporter.__init__; ``file`` is stored as ``self.file``.
     """
     base_init = BaseItemExporter.__init__
     base_init(self, **kwargs)
     self.file = file