def extract_pdf(path, languages=None):
    """Convert a PDF to XML with `pdftohtml` and extract each page.

    The external `pdftohtml` binary (located via the ``PDFTOHTML_BIN``
    config key) writes an XML rendition of the document into a scratch
    directory. Every ``<page>`` element directly under the XML root is then
    handed to ``extract_page`` together with the scratch directory and the
    requested languages (per the original docstring, that step runs OCR on
    images found in the file).

    :param path: location of the PDF file on disk.
    :param languages: passed through unchanged to ``extract_page``.
    :returns: a dict of the form ``{'pages': [...]}`` with one entry per
        page, in document order.
    :raises IngestorException: if no XML output file exists after the
        conversion command has run.
    """
    work_dir = make_tempdir()
    try:
        xml_path = os.path.join(work_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        command = [
            get_config('PDFTOHTML_BIN'),
            '-xml', '-hidden', '-q', '-nodrm',
            path, xml_path,
        ]
        # The exit status is not inspected; success is judged solely by
        # whether the output file was produced.
        subprocess.call(command)
        if not os.path.exists(xml_path):
            raise IngestorException("Could not convert PDF to XML: %s" % path)

        with open(xml_path, 'r') as handle:
            markup = string_value(handle.read())
            # Drop the encoding declaration before handing the text to lxml.
            markup = markup.replace('encoding="UTF-8"', '')
            lenient = etree.XMLParser(recover=True, remove_comments=True)
            root = etree.fromstring(markup, parser=lenient)
            log.debug("Parsed XML: %r", path)

        return {
            'pages': [
                extract_page(path, work_dir, node, languages)
                for node in root.findall('./page')
            ]
        }
    finally:
        # The scratch directory is removed on success and on failure alike.
        remove_tempdir(work_dir)
def ingest(self, meta, local_path):
    """Explode an Outlook PST file with `readpst` and ingest its contents.

    The `readpst` binary (overridable through the ``READPST_BIN``
    environment variable) unpacks ``local_path`` into a scratch directory.
    Each file found there becomes a child of ``meta``: the components of
    its directory path relative to the scratch directory are added as
    keywords, and its foreign ID is the parent's foreign ID joined with
    that relative directory and the file name.

    :param meta: metadata of the PST file; children are derived from it.
    :param local_path: location of the PST file on disk.
    """
    scratch = make_tempdir()
    try:
        readpst = os.environ.get('READPST_BIN', 'readpst')
        # See readpst(1) for the meaning of the individual switches.
        command = [readpst, '-D', '-e', '-8', '-b', '-o', scratch, local_path]
        log.debug('Converting Outlook PST file: %r', ' '.join(command))
        subprocess.call(command)

        for folder, _subfolders, names in os.walk(scratch):
            relative = os.path.relpath(string_value(folder),
                                       string_value(scratch))
            for name in names:
                name = string_value(name)
                child = meta.make_child()
                for keyword in relative.split(os.path.sep):
                    child.add_keyword(keyword)
                foreign_id = os.path.join(string_value(meta.foreign_id),
                                          string_value(relative),
                                          name)
                child.foreign_id = string_value(foreign_id)
                extracted = os.path.join(string_value(folder), name)
                ingest_file(self.collection_id, child, extracted, move=True)
    finally:
        # Always discard the scratch directory, even if ingestion failed.
        remove_tempdir(scratch)
def ingest(self, meta, local_path):
    """Unzip a bundle into a scratch directory and ingest that directory.

    :param meta: metadata of the bundle; forwarded to
        ``self.ingest_directory``.
    :param local_path: location of the ZIP archive on disk.
    """
    unpack_dir = make_tempdir()
    try:
        log.info("Unpacking bundle: %r", meta.file_name)
        with ZipFile(local_path, 'r') as archive:
            archive.extractall(unpack_dir)
        self.ingest_directory(meta, unpack_dir)
    finally:
        # The scratch directory is removed whether or not ingestion worked.
        remove_tempdir(unpack_dir)
def ingest(self, meta, local_path):
    """Unpack an archive into a scratch directory and ingest its contents.

    Work-around: multi-part archives are unpacked by first changing into
    the directory that contains the file. The previous working directory
    is always restored, including when creating or removing the scratch
    directory fails.

    :param meta: metadata of the archive; ``meta.file_name`` names the
        scratch directory and ``meta.foreign_id`` is used as the base path
        of the ingested files.
    :param local_path: location of the archive on disk.
    """
    prev_cwd = os.getcwd()
    # A bare file name has an empty dirname, and os.chdir('') raises;
    # fall back to the current directory in that case.
    os.chdir(os.path.dirname(local_path) or os.curdir)
    try:
        temp_dir = make_tempdir(meta.file_name)
        try:
            log.info("Descending into package: %r", meta.file_name)
            self.unpack(meta, local_path, temp_dir)
            ingest_directory(self.collection_id, meta, temp_dir,
                             base_path=meta.foreign_id, move=True)
        except rarfile.NeedFirstVolume:
            # Deliberately not an error: this file is skipped. Leave a
            # trace in the log instead of passing silently.
            log.debug("Skipping package, first volume needed: %r",
                      meta.file_name)
        finally:
            remove_tempdir(temp_dir)
    finally:
        # Restore the working directory no matter what happened above.
        os.chdir(prev_cwd)
def ingest(self, meta, local_path):
    """Explode an Outlook PST file with `readpst` and ingest its contents.

    The `readpst` binary (overridable through the ``READPST_BIN``
    environment variable) unpacks ``local_path`` into a scratch directory.
    Each file found there becomes a child of ``meta``: the components of
    its directory path relative to the scratch directory are added as
    keywords, and its foreign ID is the parent's foreign ID joined with
    that relative directory and the file name.

    :param meta: metadata of the PST file; children are derived from it.
    :param local_path: location of the PST file on disk.
    """
    work_dir = make_tempdir()
    try:
        bin_path = os.environ.get('READPST_BIN', 'readpst')
        # See readpst(1) for the meaning of the individual switches.
        args = [bin_path, '-D', '-e', '-o', work_dir, local_path]
        log.debug('Converting Outlook PST file: %r', ' '.join(args))
        subprocess.call(args)
        for dirpath, _dirnames, filenames in os.walk(work_dir):
            reldir = os.path.relpath(dirpath, work_dir)
            for filename in filenames:
                child = meta.make_child()
                for kw in reldir.split(os.path.sep):
                    child.add_keyword(kw)
                child.foreign_id = os.path.join(meta.foreign_id, reldir,
                                                filename)
                # Ingest with the child's metadata. Passing the parent
                # `meta` here would throw away the keywords and foreign ID
                # that were just assigned to the child.
                ingest_file(self.source_id, child,
                            os.path.join(dirpath, filename), move=True)
    finally:
        # Always discard the scratch directory, even if ingestion failed.
        remove_tempdir(work_dir)
def emit_bundle(self, meta, directory, files):
    """Pack a group of files into an uncompressed ZIP bundle and ingest it.

    A child of ``meta`` describes the bundle: it gets ``self.BUNDLE_MIME``
    as MIME type, a file name of the form ``<meta.file_name>.<extension>``
    and, when the parent has a foreign ID, a source path derived from it.
    The archive is assembled in a scratch directory, which is removed
    afterwards.

    :param meta: metadata of the parent the bundle is derived from.
    :param directory: directory that contains the files to bundle.
    :param files: file names relative to ``directory``; each is stored
        under that same name inside the archive.
    """
    bundle = meta.make_child()
    if meta.foreign_id:
        bundle.source_path = os.path.join(meta.foreign_id,
                                          self.BUNDLE_EXTENSION)
    bundle.mime_type = self.BUNDLE_MIME
    bundle.file_name = '%s.%s' % (meta.file_name, self.BUNDLE_EXTENSION)
    log.info("Creating bundle: %r", bundle.file_name)

    staging_dir = make_tempdir()
    try:
        archive_path = os.path.join(staging_dir, bundle.file_name)
        with ZipFile(archive_path, 'w', ZIP_STORED) as archive:
            for member in files:
                archive.write(os.path.join(directory, member), member)
        ingest_file(self.collection_id, bundle, archive_path, move=True)
    finally:
        # The scratch directory is removed on success and on failure alike.
        remove_tempdir(staging_dir)