def ingest_attachment(self, name, mime_type, body):
    """Store an attachment in the work path and hand it on as a child.

    Nothing happens when ``safe_string(name)`` is ``None`` and ``body`` is
    ``None`` or empty. Otherwise the body is written to a file named by
    ``safe_filename`` (text bodies are encoded as UTF-8 first), a bytes
    ``mime_type`` is decoded as UTF-8, and the file is passed to
    ``self.manager.handle_child``.
    """
    body_missing = body is None or not len(body)
    label = safe_string(name)
    if label is None and body_missing:
        # Hello, Outlook.
        return

    file_name = safe_filename(name, default='attachment')
    display_name = label or file_name
    child_id = join_path(self.result.id, display_name)
    target = join_path(self.work_path, file_name)

    with open(target, 'wb') as out:
        payload = body
        if isinstance(payload, str):
            payload = payload.encode('utf-8')
        if payload is not None:
            out.write(payload)

    if isinstance(mime_type, bytes):
        mime_type = mime_type.decode('utf-8')

    self.manager.handle_child(self.result, target,
                              id=child_id,
                              file_name=display_name,
                              mime_type=mime_type)
def document_to_pdf(self, file_path, temp_dir):
    """Convert an office document to PDF and return the PDF's path.

    Delegates to ``self.unoconv_to_pdf`` when ``is_unoconv_available()``
    is true. Otherwise runs ``soffice`` headlessly with its output
    directory under ``temp_dir`` and returns the first entry found there.

    Raises:
        ProcessingException: if the output directory is empty afterwards.
    """
    if self.is_unoconv_available():
        return self.unoconv_to_pdf(file_path, temp_dir)

    profile_dir = join_path(temp_dir, 'soffice_instance')
    out_dir = join_path(temp_dir, 'soffice_output')
    make_directory(out_dir)
    log.info('Converting [%s] to PDF...', self.result)

    command = [
        'soffice',
        '-env:UserInstallation=file://{}'.format(profile_dir),
        '--nofirststartwizard',
        '--norestore',
        '--nologo',
        '--nodefault',
        '--nolockcheck',
        '--invisible',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', out_dir,
        file_path
    ]
    self.exec_command(*command)

    # Whatever soffice wrote first into the output directory is the result.
    produced = os.listdir(out_dir)
    if produced:
        return join_path(out_dir, produced[0])

    raise ProcessingException(
        "Failed to convert to PDF: {}".format(file_path))
def ingest_attachment(self, name, mime_type, body, temp_dir):
    """Write an attachment into ``temp_dir`` and hand it on as a child.

    Args:
        name: attachment name; ``safe_filename`` derives the on-disk name
            (falling back to ``'attachment'``) and ``stringify`` the
            display name.
        mime_type: passed through unchanged to ``handle_child``.
        body: attachment content. Text is encoded as UTF-8; ``None``
            leaves an empty file.
        temp_dir: directory in which the file is created.
    """
    file_name = safe_filename(name, default='attachment')
    name = stringify(name) or file_name
    file_path = join_path(temp_dir, file_name)
    # The body is always bytes by the time it is written, so the file has
    # to be opened in binary mode: text mode rejects bytes on Python 3.
    with open(file_path, 'wb') as fh:
        if isinstance(body, six.text_type):
            body = body.encode('utf-8')
        if body is not None:
            fh.write(body)
    self.manager.handle_child(self.result, file_path,
                              id=join_path(self.result.id, name),
                              file_name=name,
                              mime_type=mime_type)
def ingest(self, file_path):
    """Flag the result as a workbook and emit one CSV child per table.

    Each name from ``self.get_tables`` is dumped by ``self.dump_table``
    to a CSV file in ``self.work_path``, then passed to
    ``self.manager.handle_child`` with a ``text/csv`` MIME type.
    """
    result = self.result
    result.flag(result.FLAG_WORKBOOK)
    for table in self.get_tables(file_path):
        csv_file = safe_filename(table, extension='csv')
        target = join_path(self.work_path, csv_file)
        self.dump_table(file_path, table, target)
        self.manager.handle_child(self.result, target,
                                  id=join_path(self.result.id, table),
                                  title=table,
                                  file_name=csv_file,
                                  mime_type='text/csv')
def ingest(self, file_path):
    """Split an mbox file into messages and hand each on as a child.

    The result gets ``self.DEFAULT_MIME`` and the directory flag. Each
    message is written to ``Message_<n>.eml`` in a temporary directory
    (numbered from 1) and passed to ``self.manager.handle_child`` with
    MIME type ``multipart/mixed``.
    """
    mbox = mailbox.mbox(file_path)
    try:
        self.result.mime_type = self.DEFAULT_MIME
        self.result.flag(self.result.FLAG_DIRECTORY)

        with self.create_temp_dir() as temp_dir:
            for i, msg in enumerate(mbox, 1):
                msg_name = 'Message_%s.eml' % i
                msg_path = join_path(temp_dir, msg_name)
                child_id = join_path(self.result.id, str(i))

                # The file is opened in binary mode, so it needs bytes.
                # Python 3 messages offer as_bytes(); as_string() gives
                # text there and must be encoded before writing.
                as_bytes = getattr(msg, 'as_bytes', None)
                data = msg.as_string() if as_bytes is None else as_bytes()
                if not isinstance(data, bytes):
                    data = data.encode('utf-8')

                with open(msg_path, 'wb') as fh:
                    fh.write(data)
                self.manager.handle_child(self.result, msg_path,
                                          id=child_id,
                                          mime_type='multipart/mixed')
    finally:
        # Release the file handle held by the mailbox object.
        mbox.close()
def ingest(self, file_path):
    """Convert the input to PDF with ``tiff2pdf`` and extract from it.

    The result is flagged as PDF, ``tiff.pdf`` is written into
    ``self.work_path``, the output is verified with
    ``self.assert_outfile`` and passed to
    ``self.pdf_alternative_extract``.
    """
    result = self.result
    result.flag(result.FLAG_PDF)
    target = join_path(self.work_path, 'tiff.pdf')
    command = ('tiff2pdf', file_path,
               '-x', '300',
               '-y', '300',
               '-o', target)
    self.exec_command(*command)
    self.assert_outfile(target)
    self.pdf_alternative_extract(target)
def _document_to_pdf(self, file_path, result, work_path):
    """Convert an office document to PDF via the conversion service.

    The file is POSTed to ``self.SERVICE_URL`` and the streamed response
    is written to ``<basename>.pdf`` inside ``work_path``. One attempt is
    made per value yielded by ``service_retries()``, calling ``backoff``
    after each failed request.

    Args:
        file_path: path of the document to convert.
        result: provides ``file_name`` and ``mime_type`` for the upload
            (falling back to ``'data'`` and ``DEFAULT``).
        work_path: directory that receives the PDF.

    Returns:
        Path of the written PDF.

    Raises:
        ProcessingException: on an HTTP 400 response (carrying the
            response text), or once all attempts are used up.
    """
    log.info('Converting [%s] to PDF...', result.file_name)
    out_path = os.path.basename(file_path)
    out_path = join_path(work_path, '%s.pdf' % out_path)
    file_name = result.file_name or 'data'
    mime_type = result.mime_type or DEFAULT
    for attempt in service_retries():
        try:
            # Separate names for the input and output handles: reusing one
            # name left the input file open, because only the rebound
            # (output) handle was ever closed.
            with open(file_path, 'rb') as in_fh:
                files = {'file': (file_name, in_fh, mime_type)}
                res = requests.post(self.SERVICE_URL,
                                    files=files,
                                    timeout=(5, 305),
                                    stream=True)
                res.raise_for_status()
                with open(out_path, 'wb') as out_fh:
                    for chunk in res.iter_content(chunk_size=None):
                        out_fh.write(chunk)
                return out_path
        except RequestException as exc:
            if isinstance(exc, HTTPError):
                # A 400 is reported with the response text and not retried.
                if exc.response.status_code == 400:
                    raise ProcessingException(exc.response.text)
            log.error("Conversion failed: %s", exc)
            backoff(failures=attempt)
    raise ProcessingException("Document could not be converted to PDF.")
def ingest(self, file_path):
    """Convert the input to PDF with ``convert`` and extract from it.

    Writes ``image.pdf`` into ``self.work_path``, verifies it with
    ``self.assert_outfile``, flags the result as an image and passes the
    PDF to ``self.pdf_alternative_extract``.
    """
    target = join_path(self.work_path, 'image.pdf')
    command = ('convert', file_path,
               '-density', '300',
               '-define', 'pdf:fit-page=A4',
               target)
    self.exec_command(*command)
    self.assert_outfile(target)
    result = self.result
    result.flag(result.FLAG_IMAGE)
    self.pdf_alternative_extract(target)
def unoconv_to_pdf(self, file_path, temp_dir):
    """Convert an office document to PDF through the unoconv service.

    The document is POSTed via ``self.unoconv_client`` and the streamed
    response is written to ``<basename>.pdf`` inside ``temp_dir``. Up to
    three attempts are made, sleeping ``3**try_num`` seconds after each
    request failure.

    Returns:
        Path of the written PDF.

    Raises:
        ConfigurationException: if ``is_unoconv_available()`` is false.
        ProcessingException: if the response body is empty, or if every
            attempt fails with a ``RequestException``.
    """
    if not self.is_unoconv_available():
        raise ConfigurationException("UNOSERVICE_URL is missing.")

    log.info('Converting [%s] to PDF...', self.result)
    file_name = os.path.basename(file_path)
    out_path = join_path(temp_dir, '%s.pdf' % file_name)
    for try_num in range(3):
        try:
            with open(file_path, 'rb') as in_fh:
                data = {'format': 'pdf', 'doctype': 'document'}
                files = {'file': (file_name, in_fh, self.UNO_MIME)}
                # http://docs.python-requests.org/en/latest/user/advanced/#chunk-encoded-requests
                res = self.unoconv_client.post(self.get_unoconv_url(),
                                               data=data,
                                               files=files,
                                               timeout=300.0,
                                               stream=True)
            length = 0
            # iter_content yields bytes, so the PDF must be written in
            # binary mode; text mode fails on Python 3.
            with open(out_path, 'wb') as out_fh:
                for chunk in res.iter_content(chunk_size=None):
                    length += len(chunk)
                    out_fh.write(chunk)

            # An empty body is reported at once; it is not retried.
            if length == 0:
                raise ProcessingException("Could not convert to PDF.")
            return out_path
        except RequestException as exc:
            log.exception(exc)
            time.sleep(3**try_num)
    raise ProcessingException("Could not convert to PDF.")
def ingest(self, file_path):
    """Convert the input to PDF with ``ddjvu`` and extract from it.

    The result is flagged as PDF, ``page.pdf`` is written into
    ``self.work_path``, verified with ``self.assert_outfile`` and passed
    to ``self.pdf_alternative_extract``.
    """
    result = self.result
    result.flag(result.FLAG_PDF)
    target = join_path(self.work_path, 'page.pdf')
    command = ('ddjvu', '-format=pdf', '-quality=100', '-skip',
               file_path, target)
    self.exec_command(*command)
    self.assert_outfile(target)
    self.pdf_alternative_extract(target)
def ingest(self, file_path):
    """Convert the input to PDF with ``ddjvu`` and extract from it.

    ``page.pdf`` is written into a directory from
    ``self.create_temp_dir()``; verification and extraction both run
    while that context is still open.
    """
    with self.create_temp_dir() as scratch:
        target = join_path(scratch, 'page.pdf')
        command = ('ddjvu', '-format=pdf', '-quality=100', '-skip',
                   file_path, target)
        self.exec_command(*command)
        self.assert_outfile(target)
        self.pdf_alternative_extract(target)
def dump_table(self, file_path, table_name, temp_dir):
    """Export one table to a CSV file in ``temp_dir`` and return its path.

    Runs the ``mdb-export`` command located by ``self.find_command`` with
    its standard output redirected into the CSV file. The command's exit
    status is not inspected.
    """
    csv_path = join_path(temp_dir,
                         safe_filename(table_name, extension='csv'))
    command = [self.find_command('mdb-export'),
               '-b', 'strip',
               file_path,
               table_name]
    with open(csv_path, 'w') as sink:
        self.subprocess.call(command, stdout=sink)
    return csv_path
def ingest(self, file_path):
    """Hand every entry of a directory on as a child.

    Entries whose decoded name is in ``self.SKIP_ENTRIES`` are ignored.

    Raises:
        ProcessingException: if ``file_path`` is not a directory.
    """
    if not os.path.isdir(file_path):
        raise ProcessingException("Not a directory.")

    self.result.flag(self.result.FLAG_DIRECTORY)
    for raw_entry in os.listdir(file_path):
        entry = decode_path(raw_entry)
        if entry not in self.SKIP_ENTRIES:
            self.manager.handle_child(self.result,
                                      join_path(file_path, entry),
                                      file_name=entry,
                                      id=join_path(self.result.id, entry))
def unpack(self, file_path, temp_dir):
    """Unpack a single-file archive into ``temp_dir``.

    The target name is the result's file name (or ``'extracted'``) with
    every matching suffix from ``self.EXTENSIONS`` stripped, checked in
    iteration order. The work itself is done by ``self.unpack_file``.
    """
    stem = self.result.file_name or 'extracted'
    for extension in self.EXTENSIONS:
        suffix = '.' + extension
        if stem.endswith(suffix):
            # suffix always has at least the dot, so the slice is safe.
            stem = stem[:-len(suffix)]
    self.unpack_file(file_path, join_path(temp_dir, stem))
def ingest(self, file_path):
    """Hand every entry of a directory on as a child.

    The result is flagged as a directory first. If the decoded path is
    ``None`` or not a directory, the method returns without error.
    Entries whose decoded name is in ``self.SKIP_ENTRIES`` are ignored.
    """
    self.result.flag(self.result.FLAG_DIRECTORY)
    directory = decode_path(file_path)
    usable = directory is not None and os.path.isdir(directory)
    if not usable:
        return

    for raw_entry in os.listdir(directory):
        entry = decode_path(raw_entry)
        if entry not in self.SKIP_ENTRIES:
            self.manager.handle_child(self.result,
                                      join_path(directory, entry),
                                      file_name=entry,
                                      id=join_path(self.result.id, entry))
def ingest(self, file_path):
    """Flag the result as a workbook and emit one CSV child per table.

    Each name from ``self.get_tables`` is dumped into a temporary
    directory by ``self.dump_table`` and passed to
    ``self.manager.handle_child`` while that directory still exists.
    """
    result = self.result
    result.flag(result.FLAG_WORKBOOK)
    with self.create_temp_dir() as scratch:
        for table in self.get_tables(file_path):
            dumped = self.dump_table(file_path, table, scratch)
            self.manager.handle_child(self.result, dumped,
                                      id=join_path(self.result.id, table),
                                      title=table,
                                      mime_type='text/csv')
def csv_child_iter(self, iter, name):
    """Write rows to a CSV file in a temp directory and emit it as a child.

    Args:
        iter: iterable of rows, each passed to ``csv.writer.writerow``.
        name: sheet name; used for the file name, child id and title.
    """
    with self.create_temp_dir() as scratch:
        csv_name = safe_filename(name, extension='csv')
        csv_path = join_path(scratch, csv_name)
        total = 0
        with io.open(csv_path, 'w', newline='', encoding='utf-8') as sink:
            writer = csv.writer(sink, quoting=csv.QUOTE_ALL)
            for total, record in enumerate(iter, 1):
                writer.writerow(record)

        log.info("Generated [%s]: %s, %s rows", name, csv_name, total)
        self.manager.handle_child(self.result, csv_path,
                                  id=join_path(self.result.id, name),
                                  title=name,
                                  file_name=csv_name,
                                  mime_type='text/csv')
def csv_child_iter(self, iter, name):
    """Write rows to a CSV file in the work path and emit it as a child.

    A sheet without rows is logged and skipped; the (empty) file written
    for it is left in place.

    Args:
        iter: iterable of rows, each passed to ``csv.writer.writerow``.
        name: sheet name; ``stringify(name)`` (or ``'sheet'``) is used for
            the child id and title.
    """
    csv_name = safe_filename(name, default='sheet.csv', extension='csv')
    csv_path = join_path(self.work_path, csv_name)
    total = 0
    with io.open(csv_path, 'w', newline='', encoding='utf-8') as sink:
        writer = csv.writer(sink, quoting=csv.QUOTE_ALL)
        for total, record in enumerate(iter, 1):
            writer.writerow(record)

    label = stringify(name) or 'sheet'
    if not total:
        log.warning("Skip [%s]: no rows", label)
        return

    log.info("Generated [%s]: %s, %s rows", label, csv_name, total)
    self.manager.handle_child(self.result, csv_path,
                              id=join_path(self.result.id, label),
                              title=label,
                              file_name=csv_name,
                              mime_type='text/csv')
def ensure_path(self, base_dir, name, encoding='utf-8'):
    """Return a path for ``name`` under ``base_dir``, or ``None`` to skip it.

    Args:
        base_dir: directory that must contain the returned path.
        name: member name; bytes are decoded with ``encoding``, ignoring
            undecodable bytes.
        encoding: codec used for byte names.

    Returns:
        The output path after its parent directory has been created, or
        ``None`` when the path lies outside ``base_dir``, already exists,
        or turns out to be a directory.
    """
    if isinstance(name, bytes):
        name = name.decode(encoding, 'ignore')

    out_path = join_path(base_dir, name)
    # Compare against the base *with* a trailing separator: a bare prefix
    # test would also accept sibling directories such as "<base>_other".
    # NOTE(review): this only blocks ".." segments if join_path returns a
    # normalised path -- confirm against its implementation.
    base_prefix = os.path.join(base_dir, '')
    if not out_path.startswith(base_prefix):
        return

    if os.path.exists(out_path):
        return

    out_dir = os.path.dirname(out_path)
    make_directory(out_dir)
    # Creating the parent can bring out_path itself into existence when
    # the name denotes a directory (e.g. ends in a separator).
    if os.path.isdir(out_path):
        return

    return out_path
def ingest(self, file_path):
    """Read image metadata and, for large enough images, extract via PDF.

    EXIF data is read with ``self.extract_exif`` and the result is flagged
    as an image. Images at least ``MIN_WIDTH`` by ``MIN_HEIGHT`` are
    converted to PDF with ``convert`` in a temporary directory and passed
    to ``self.pdf_alternative_extract``.

    Raises:
        ProcessingException: if the image cannot be opened or is rejected
            as too large.
    """
    # Image data is binary; text mode breaks decoding on Python 3.
    with open(file_path, 'rb') as fh:
        try:
            img = Image.open(fh)
        except DecompressionBombWarning as dce:
            # Format the message here: the exception would otherwise
            # carry the raw "%s" template and the cause as separate args.
            raise ProcessingException("Image too large: %s" % dce)
        except IOError as ioe:
            raise ProcessingException("Cannot open image: %s" % ioe)

        # Read everything needed from the image while its file is open.
        self.extract_exif(img)
        width, height = img.width, img.height

    self.result.flag(self.result.FLAG_IMAGE)

    if width >= self.MIN_WIDTH and height >= self.MIN_HEIGHT:
        with self.create_temp_dir() as temp_dir:
            pdf_path = join_path(temp_dir, 'image.pdf')
            self.exec_command('convert', file_path,
                              '-density', '300',
                              '-define', 'pdf:fit-page=A4',
                              pdf_path)
            self.assert_outfile(pdf_path)
            self.pdf_alternative_extract(pdf_path)
def unoconv_to_pdf(self, file_path, retry=5):
    """Convert an office document to PDF through the unoconv service.

    The document is POSTed to ``self.get_unoconv_url()`` and the streamed
    response is written to ``<basename>.pdf`` inside ``self.work_path``.

    Args:
        file_path: path of the document to convert.
        retry: maximum number of attempts; attempt ``n`` is followed by a
            ``2**n`` second sleep when it fails.

    Returns:
        Path of the written PDF.

    Raises:
        ConfigurationException: if ``is_unoconv_available()`` is false.
        ProcessingException: on an HTTP 400 response (carrying the
            response text), or once all attempts are used up.
    """
    if not self.is_unoconv_available():
        raise ConfigurationException("UNOSERVICE_URL is missing.")

    log.info('Converting [%s] to PDF...', self.result)
    out_path = os.path.basename(file_path)
    out_path = join_path(self.work_path, '%s.pdf' % out_path)
    file_name = self.result.file_name or 'data'
    mime_type = self.result.mime_type or DEFAULT
    attempt = 1
    while attempt <= retry:
        try:
            with open(file_path, 'rb') as in_fh:
                files = {'file': (file_name, in_fh, mime_type)}
                res = requests.post(self.get_unoconv_url(),
                                    files=files,
                                    timeout=(5, 305),
                                    stream=True)
        except RequestException as exc:
            log.warning("Conversion failed: %s", exc)
            time.sleep(2**attempt)
            attempt += 1
            continue

        if res.status_code == 400:
            raise ProcessingException(res.text)

        # Any other error status carries an error page, not a PDF: count
        # it as a failed attempt instead of saving it as the result.
        if res.status_code >= 400:
            log.warning("Conversion failed: HTTP %s", res.status_code)
            res.close()
            time.sleep(2**attempt)
            attempt += 1
            continue

        with open(out_path, 'wb') as out_fh:
            for chunk in res.iter_content(chunk_size=None):
                out_fh.write(chunk)
        return out_path

    raise ProcessingException("Document could not be converted to PDF.")
def unpack_members(self, pack, temp_dir):
    """Extract every member of ``pack`` into ``temp_dir``.

    Members are skipped when the target already exists, does not start
    with ``temp_dir``, or is a directory. A member that fails to unpack is
    logged at debug level and does not stop the others.

    Args:
        pack: archive object providing ``namelist()`` and ``open(name)``.
        temp_dir: directory that receives the extracted files.
    """
    # Some archives come with non-Unicode file names, this
    # attempts to avoid that issue by naming the destination
    # explicitly.
    members = pack.namelist()
    raw_names = [n for n in members if isinstance(n, six.binary_type)]
    # The names are bytes, so the separator must be bytes as well.
    encoding = guess_encoding(b'\n'.join(raw_names))
    log.debug('Detected filename encoding: %s', encoding)

    for name in members:
        file_name = name
        if isinstance(name, six.binary_type):
            file_name = name.decode(encoding, 'ignore')

        out_path = join_path(temp_dir, file_name)
        if os.path.exists(out_path):
            continue
        # NOTE(review): a bare prefix test also matches sibling
        # directories sharing the prefix -- confirm join_path normalises.
        if not out_path.startswith(temp_dir):
            continue

        out_dir = os.path.dirname(out_path)
        make_directory(out_dir)
        if os.path.isdir(out_path):
            continue

        try:
            in_fh = pack.open(name)
            try:
                log.debug("Unpack: %s -> %s", self.result, file_name)
                # Member data is binary; a text-mode target fails on
                # Python 3 and would be swallowed by the handler below.
                with open(out_path, 'wb') as out_fh:
                    shutil.copyfileobj(in_fh, out_fh)
            finally:
                in_fh.close()
        except Exception as ex:
            # TODO: should this be a fatal error?
            log.debug("Failed to unpack [%s]: %s", file_name, ex)