class DmozPipeline(object):
    """Scrapy pipeline that persists each crawled DMOZ item to disk.

    The item's raw content (PDF bytes or HTML text) is written to a
    shared PDF/ or HTML/ directory, and the item's serialized fields are
    dumped as JSON into a directory tree mirroring the ontology path
    (Top/<category>/...).
    """

    def __init__(self):
        # Exporter instance kept only for its field-serialization helper
        # (presumably scrapy's BaseItemExporter — imported at module level).
        self.exporter = BaseItemExporter()

    def process_item(self, domain, item):
        """Store the item, serialized as json, in a file within a
        directory hierarchy corresponding to its place in the ontology.

        Side effects: creates directories under DATA_PATH as needed and
        writes two files (content + JSON metadata). Mutates
        item['content'] to hold the content file's path with the HOME
        prefix replaced by the literal '$HOME'. Returns the item so the
        next pipeline stage receives it.
        """
        # the path is of the form: cat/subcat/leaf
        filedir = os.path.join(DATA_PATH, 'Top', item['category'])
        # Store the contents in separate files (to optimize the json loadings)
        pdfdir = os.path.join(DATA_PATH, 'PDF')
        htmldir = os.path.join(DATA_PATH, 'HTML')
        if not os.path.isdir(pdfdir):
            os.makedirs(pdfdir)
        if not os.path.isdir(htmldir):
            os.makedirs(htmldir)
        # replace evil characters with an underscore
        # cf. http://www.linfo.org/file_name.html
        rawname = re.sub('[ /.$%]+', '_', item['name'])
        filename = os.path.join(filedir, rawname)
        # truncate the filename if it exceeds the permitted maximum...
        if len(filename) > MAX_FILENAME_LENGTH:
            filename = filename[:MAX_FILENAME_LENGTH]
        if item['type'] == 'pdf':
            content_str = os.path.join(pdfdir, rawname + ".pdf")
            # BUGFIX: use 'with' so the handle is closed even if write() raises
            # (the original leaked the file object on error).
            with open(content_str, 'wb') as content_file:
                content_file.write(item['content'])
        else:
            content_str = os.path.join(htmldir, rawname)
            with codecs.open(content_str, 'w', 'utf-8') as content_file:
                content_file.write(item['content'])
        item['content'] = content_str.replace(HOME, '$HOME')
        if not os.path.isdir(filedir):
            os.makedirs(filedir)
        # BUGFIX: 'out' no longer shadows the builtin 'file', and the
        # context manager guarantees the JSON file is closed on error.
        itemdict = dict(self.exporter._get_serialized_fields(item))
        with open(filename, 'w') as out:
            json.dump(itemdict, out)
        return item
def _get_exporter(self, **kwargs):
    """Build and return a fresh BaseItemExporter.

    All keyword arguments are forwarded untouched to the exporter's
    constructor.
    """
    exporter = BaseItemExporter(**kwargs)
    return exporter
def finish_exporting(self):
    """Signal the end of an export run.

    Forwards to BaseItemExporter.finish_exporting as an unbound call —
    presumably this class subclasses (or wraps) BaseItemExporter and
    deliberately bypasses any override; confirm against the class
    definition, which is not visible in this chunk.
    """
    BaseItemExporter.finish_exporting(self)
def start_exporting(self):
    """Signal the start of an export run.

    Forwards to BaseItemExporter.start_exporting as an unbound call,
    mirroring finish_exporting; see that method's note about the
    unseen class hierarchy.
    """
    BaseItemExporter.start_exporting(self)
def __init__(self):
    """Create the pipeline's field-serialization helper.

    NOTE(review): this appears to duplicate DmozPipeline.__init__ from
    elsewhere in the file — confirm which class this fragment belongs to.
    """
    # Exporter used only for _get_serialized_fields; no file is bound here.
    self.exporter = BaseItemExporter()
def __init__(self, file, **kwargs):
    """Initialize the exporter and remember the output file handle.

    :param file: open file-like object the exporter will write to
        (the name shadows the Python 2 builtin but is part of the
        caller-visible signature, so it is kept).
    :param kwargs: forwarded to BaseItemExporter.__init__.
    """
    # Base __init__ runs first so self.file (set below) wins if the
    # base also touches that attribute — do not reorder.
    BaseItemExporter.__init__(self, **kwargs)
    self.file = file