Exemplo n.º 1
0
    def ingest_attachment(self, name, mime_type, body):
        """Write an e-mail attachment to the work path and hand it on.

        Attachments for which ``safe_string(name)`` yields ``None`` and
        which carry no content are dropped without creating a file.

        :param name: attachment name as given by the message.
        :param mime_type: declared MIME type; bytes are decoded as UTF-8.
        :param body: attachment payload; text is encoded as UTF-8.
        """
        label = safe_string(name)
        is_empty = body is None or not len(body)
        if label is None and is_empty:
            # Nameless, empty parts carry nothing worth ingesting.
            return

        file_name = safe_filename(name, default='attachment')
        name = label or file_name
        foreign_id = join_path(self.result.id, name)
        file_path = join_path(self.work_path, file_name)

        with open(file_path, 'wb') as out:
            payload = body.encode('utf-8') if isinstance(body, str) else body
            if payload is not None:
                out.write(payload)

        if isinstance(mime_type, bytes):
            mime_type = mime_type.decode('utf-8')

        child_args = {
            'id': foreign_id,
            'file_name': name,
            'mime_type': mime_type,
        }
        self.manager.handle_child(self.result, file_path, **child_args)
Exemplo n.º 2
0
    def document_to_pdf(self, file_path, temp_dir):
        """Convert an office document to PDF and return the PDF path.

        Delegates to ``unoconv_to_pdf`` when ``is_unoconv_available()``
        is true; otherwise runs ``soffice`` headlessly with its own
        user-installation directory under ``temp_dir``.

        :raises ProcessingException: if the output directory is empty
            after the command has run.
        """
        if self.is_unoconv_available():
            return self.unoconv_to_pdf(file_path, temp_dir)

        profile_dir = join_path(temp_dir, 'soffice_instance')
        out_dir = join_path(temp_dir, 'soffice_output')
        make_directory(out_dir)
        log.info('Converting [%s] to PDF...', self.result)

        command = [
            'soffice',
            '-env:UserInstallation=file://{}'.format(profile_dir),
            '--nofirststartwizard',
            '--norestore',
            '--nologo',
            '--nodefault',
            '--nolockcheck',
            '--invisible',
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', out_dir,
            file_path,
        ]
        self.exec_command(*command)

        # Whatever entry is listed first in the output directory is
        # taken to be the converted document.
        produced = os.listdir(out_dir)
        if produced:
            return join_path(out_dir, produced[0])

        raise ProcessingException(
            "Failed to convert to PDF: {}".format(file_path))
Exemplo n.º 3
0
 def ingest_attachment(self, name, mime_type, body, temp_dir):
     """Write an attachment into ``temp_dir`` and pass it on as a child.

     :param name: attachment name; a sanitised form is used on disk.
     :param mime_type: MIME type forwarded unchanged to the manager.
     :param body: attachment payload; text is encoded as UTF-8 and
         ``None`` yields an empty file.
     :param temp_dir: directory in which the file is created.
     """
     file_name = safe_filename(name, default='attachment')
     name = stringify(name) or file_name
     file_path = join_path(temp_dir, file_name)
     # The payload is always bytes by the time it is written, so the
     # handle must be binary: a text-mode handle rejects bytes on
     # Python 3 and may translate line endings on some platforms.
     with open(file_path, 'wb') as fh:
         if isinstance(body, six.text_type):
             body = body.encode('utf-8')
         if body is not None:
             fh.write(body)
     self.manager.handle_child(self.result, file_path,
                               id=join_path(self.result.id, name),
                               file_name=name,
                               mime_type=mime_type)
Exemplo n.º 4
0
 def ingest(self, file_path):
     """Export every table of the file to CSV and ingest each as a child.

     The result is flagged as a workbook; each table returned by
     ``get_tables`` is dumped into the work path and handed to the
     manager with the table name as title.
     """
     self.result.flag(self.result.FLAG_WORKBOOK)
     for table in self.get_tables(file_path):
         out_name = safe_filename(table, extension='csv')
         out_path = join_path(self.work_path, out_name)
         self.dump_table(file_path, table, out_path)
         child_args = {
             'id': join_path(self.result.id, table),
             'title': table,
             'file_name': out_name,
             'mime_type': 'text/csv',
         }
         self.manager.handle_child(self.result, out_path, **child_args)
Exemplo n.º 5
0
 def ingest(self, file_path):
     """Split an mbox file into single messages and ingest each one.

     The result is flagged as a directory with ``DEFAULT_MIME`` as its
     MIME type. Every message is written to ``Message_<n>.eml`` in a
     temporary directory (numbering starts at 1) and handed to the
     manager as a ``multipart/mixed`` child.
     """
     mbox = mailbox.mbox(file_path)
     self.result.mime_type = self.DEFAULT_MIME
     self.result.flag(self.result.FLAG_DIRECTORY)
     try:
         with self.create_temp_dir() as temp_dir:
             for i, msg in enumerate(mbox, 1):
                 msg_name = 'Message_%s.eml' % i
                 msg_path = join_path(temp_dir, msg_name)
                 child_id = join_path(self.result.id, str(i))
                 # The handle is binary, so the message has to be
                 # serialised to bytes. as_string() returns text on
                 # Python 3; fall back to it only where as_bytes()
                 # does not exist (Python 2, where str is bytes).
                 serialize = getattr(msg, 'as_bytes', msg.as_string)
                 with open(msg_path, 'wb') as fh:
                     fh.write(serialize())
                 self.manager.handle_child(self.result,
                                           msg_path,
                                           id=child_id,
                                           mime_type='multipart/mixed')
     finally:
         # Release the file handle held by the mailbox object.
         mbox.close()
Exemplo n.º 6
0
 def ingest(self, file_path):
     """Convert a TIFF to PDF with ``tiff2pdf`` and extract from the PDF.

     The PDF is written to ``tiff.pdf`` in the work path using 300 for
     both the ``-x`` and ``-y`` options.
     """
     self.result.flag(self.result.FLAG_PDF)
     target = join_path(self.work_path, 'tiff.pdf')
     resolution = ['-x', '300', '-y', '300']
     command = ['tiff2pdf', file_path] + resolution + ['-o', target]
     self.exec_command(*command)
     self.assert_outfile(target)
     self.pdf_alternative_extract(target)
Exemplo n.º 7
0
 def _document_to_pdf(self, file_path, result, work_path):
     """Convert an office document to PDF via the conversion service.

     The file is uploaded to ``SERVICE_URL`` and the streamed response
     is stored as ``<basename>.pdf`` in ``work_path``. Failed requests
     are retried once per value yielded by ``service_retries()``, with
     ``backoff()`` called between attempts.

     :param file_path: path of the document to convert.
     :param result: supplies ``file_name`` and ``mime_type`` for the
         upload; ``'data'`` and ``DEFAULT`` are used when unset.
     :param work_path: directory that receives the PDF.
     :returns: path of the written PDF.
     :raises ProcessingException: when the service answers HTTP 400
         (carrying the response text) or when all attempts fail.
     """
     log.info('Converting [%s] to PDF...', result.file_name)
     out_path = os.path.basename(file_path)
     out_path = join_path(work_path, '%s.pdf' % out_path)
     file_name = result.file_name or 'data'
     mime_type = result.mime_type or DEFAULT
     for attempt in service_retries():
         # Input and output handles need distinct names: sharing one
         # name meant the cleanup closed the output file and left the
         # input file open.
         with open(file_path, 'rb') as in_fh:
             try:
                 files = {'file': (file_name, in_fh, mime_type)}
                 res = requests.post(self.SERVICE_URL,
                                     files=files,
                                     timeout=(5, 305),
                                     stream=True)
                 res.raise_for_status()
                 with open(out_path, 'wb') as out_fh:
                     for chunk in res.iter_content(chunk_size=None):
                         out_fh.write(chunk)
                 return out_path
             except RequestException as exc:
                 # A 400 is reported straight away instead of retried.
                 if isinstance(exc, HTTPError) and \
                         exc.response.status_code == 400:
                     raise ProcessingException(exc.response.text)
                 log.error("Conversion failed: %s", exc)
                 backoff(failures=attempt)
     raise ProcessingException("Document could not be converted to PDF.")
Exemplo n.º 8
0
 def ingest(self, file_path):
     """Render an image into a PDF with ``convert`` and extract from it.

     The PDF is written to ``image.pdf`` in the work path; the result
     is flagged as an image once the output file has been verified.
     """
     target = join_path(self.work_path, 'image.pdf')
     command = [
         'convert', file_path,
         '-density', '300',
         '-define', 'pdf:fit-page=A4',
         target,
     ]
     self.exec_command(*command)
     self.assert_outfile(target)
     self.result.flag(self.result.FLAG_IMAGE)
     self.pdf_alternative_extract(target)
Exemplo n.º 9
0
    def unoconv_to_pdf(self, file_path, temp_dir):
        """Convert an office document to PDF through the unoconv service.

        The document is posted to ``get_unoconv_url()`` and the streamed
        response is stored as ``<basename>.pdf`` in ``temp_dir``. Up to
        three attempts are made; after a failed request the method
        sleeps for 1, 3 and 9 seconds respectively.

        :returns: path of the written PDF.
        :raises ConfigurationException: if unoconv is not available.
        :raises ProcessingException: if the response body is empty or
            every attempt failed.
        """
        if not self.is_unoconv_available():
            raise ConfigurationException("UNOSERVICE_URL is missing.")

        log.info('Converting [%s] to PDF...', self.result)
        file_name = os.path.basename(file_path)
        out_path = join_path(temp_dir, '%s.pdf' % file_name)
        for try_num in range(3):
            try:
                with open(file_path, 'rb') as fh:
                    data = {'format': 'pdf', 'doctype': 'document'}
                    files = {'file': (file_name, fh, self.UNO_MIME)}
                    # http://docs.python-requests.org/en/latest/user/advanced/#chunk-encoded-requests
                    res = self.unoconv_client.post(self.get_unoconv_url(),
                                                   data=data,
                                                   files=files,
                                                   timeout=300.0,
                                                   stream=True)
                length = 0
                # The response chunks are bytes, so the PDF has to be
                # written through a binary handle.
                with open(out_path, 'wb') as out_fh:
                    for chunk in res.iter_content(chunk_size=None):
                        length += len(chunk)
                        out_fh.write(chunk)

                if length == 0:
                    raise ProcessingException("Could not convert to PDF.")
                return out_path
            except RequestException as exc:
                log.exception(exc)
                time.sleep(3**try_num)
        raise ProcessingException("Could not convert to PDF.")
Exemplo n.º 10
0
 def ingest(self, file_path):
     """Convert the file to PDF with ``ddjvu`` and extract from the PDF.

     The PDF is written to ``page.pdf`` in the work path.
     """
     self.result.flag(self.result.FLAG_PDF)
     target = join_path(self.work_path, 'page.pdf')
     options = ['-format=pdf', '-quality=100', '-skip']
     command = ['ddjvu'] + options + [file_path, target]
     self.exec_command(*command)
     self.assert_outfile(target)
     self.pdf_alternative_extract(target)
Exemplo n.º 11
0
 def ingest(self, file_path):
     """Convert the file to PDF with ``ddjvu`` and extract from the PDF.

     The PDF lives in a temporary directory that only exists for the
     duration of the extraction.
     """
     options = ['-format=pdf', '-quality=100', '-skip']
     with self.create_temp_dir() as scratch:
         target = join_path(scratch, 'page.pdf')
         command = ['ddjvu'] + options + [file_path, target]
         self.exec_command(*command)
         self.assert_outfile(target)
         self.pdf_alternative_extract(target)
Exemplo n.º 12
0
 def dump_table(self, file_path, table_name, temp_dir):
     """Export one table to a CSV file in ``temp_dir`` via ``mdb-export``.

     The command's standard output is redirected into the CSV file.

     :returns: path of the CSV file.
     """
     csv_name = safe_filename(table_name, extension='csv')
     csv_path = join_path(temp_dir, csv_name)
     command = [self.find_command('mdb-export'), '-b', 'strip',
                file_path, table_name]
     with open(csv_path, 'w') as sink:
         self.subprocess.call(command, stdout=sink)
     return csv_path
Exemplo n.º 13
0
    def ingest(self, file_path):
        """Ingest every entry of a directory as a child of the result.

        Entries listed in ``SKIP_ENTRIES`` are ignored.

        :raises ProcessingException: if ``file_path`` is not a directory.
        """
        if not os.path.isdir(file_path):
            raise ProcessingException("Not a directory.")

        self.result.flag(self.result.FLAG_DIRECTORY)

        for entry in os.listdir(file_path):
            entry = decode_path(entry)
            if entry not in self.SKIP_ENTRIES:
                self.manager.handle_child(
                    self.result,
                    join_path(file_path, entry),
                    file_name=entry,
                    id=join_path(self.result.id, entry))
Exemplo n.º 14
0
 def unpack(self, file_path, temp_dir):
     """Unpack ``file_path`` into ``temp_dir`` under a suffix-free name.

     The target name is the result's file name (``'extracted'`` when
     unset) with each matching ``.<ext>`` from ``EXTENSIONS`` removed,
     checked in the order the extensions are listed.
     """
     base_name = self.result.file_name or 'extracted'
     for suffix in ('.' + ext for ext in self.EXTENSIONS):
         if base_name.endswith(suffix):
             base_name = base_name[:-len(suffix)]
     self.unpack_file(file_path, join_path(temp_dir, base_name))
Exemplo n.º 15
0
    def ingest(self, file_path):
        """Ingest every entry of a directory as a child of the result.

        The result is flagged as a directory first. If the decoded path
        is ``None`` or is not a directory, nothing further happens.
        Entries listed in ``SKIP_ENTRIES`` are ignored.
        """
        self.result.flag(self.result.FLAG_DIRECTORY)
        directory = decode_path(file_path)

        if directory is None:
            return
        if not os.path.isdir(directory):
            return

        for entry in os.listdir(directory):
            entry = decode_path(entry)
            if entry not in self.SKIP_ENTRIES:
                self.manager.handle_child(
                    self.result,
                    join_path(directory, entry),
                    file_name=entry,
                    id=join_path(self.result.id, entry))
Exemplo n.º 16
0
 def ingest(self, file_path):
     """Export every table to CSV in a temp dir and ingest each as a child.

     The result is flagged as a workbook; the table name is used as
     the child's title.
     """
     self.result.flag(self.result.FLAG_WORKBOOK)
     with self.create_temp_dir() as scratch:
         for table in self.get_tables(file_path):
             exported = self.dump_table(file_path, table, scratch)
             child_args = {
                 'id': join_path(self.result.id, table),
                 'title': table,
                 'mime_type': 'text/csv',
             }
             self.manager.handle_child(self.result, exported, **child_args)
Exemplo n.º 17
0
    def csv_child_iter(self, iter, name):
        """Write the rows of ``iter`` to a CSV file and ingest it as a child.

        The file is created in a temporary directory as UTF-8 with every
        field quoted, and is handed to the manager while that directory
        still exists.

        :param iter: iterable of rows accepted by ``csv.writer``.
        :param name: sheet name, used for the file name, title and id.
        """
        with self.create_temp_dir() as scratch:
            out_name = safe_filename(name, extension='csv')
            out_path = join_path(scratch, out_name)
            row_count = 0
            with io.open(out_path, 'w', newline='',
                         encoding='utf-8') as sink:
                writer = csv.writer(sink, quoting=csv.QUOTE_ALL)
                for row_count, row in enumerate(iter, 1):
                    writer.writerow(row)

            log.info("Generated [%s]: %s, %s rows", name, out_name, row_count)

            child_args = {
                'id': join_path(self.result.id, name),
                'title': name,
                'file_name': out_name,
                'mime_type': 'text/csv',
            }
            self.manager.handle_child(self.result, out_path, **child_args)
Exemplo n.º 18
0
    def csv_child_iter(self, iter, name):
        """Write the rows of ``iter`` to a CSV file and ingest it as a child.

        The file is created in the work path as UTF-8 with every field
        quoted. When ``iter`` yields no rows a warning is logged and no
        child is created.

        :param iter: iterable of rows accepted by ``csv.writer``.
        :param name: sheet name; ``'sheet'`` is used when it is empty.
        """
        out_name = safe_filename(name, default='sheet.csv', extension='csv')
        out_path = join_path(self.work_path, out_name)
        row_count = 0
        with io.open(out_path, 'w', newline='', encoding='utf-8') as sink:
            writer = csv.writer(sink, quoting=csv.QUOTE_ALL)
            for row_count, row in enumerate(iter, 1):
                writer.writerow(row)

        name = stringify(name) or 'sheet'
        if not row_count:
            log.warning("Skip [%s]: no rows", name)
            return

        log.info("Generated [%s]: %s, %s rows", name, out_name, row_count)

        child_args = {
            'id': join_path(self.result.id, name),
            'title': name,
            'file_name': out_name,
            'mime_type': 'text/csv',
        }
        self.manager.handle_child(self.result, out_path, **child_args)
Exemplo n.º 19
0
    def ensure_path(self, base_dir, name, encoding='utf-8'):
        """Return a writable path for ``name`` below ``base_dir``, or None.

        ``None`` is returned when the resulting path would lie outside
        ``base_dir``, when something already exists at that path, or
        when the path is a directory. The parent directory is created
        as a side effect.

        :param base_dir: directory that must contain the result.
        :param name: member name; bytes are decoded with ``encoding``,
            ignoring undecodable bytes.
        :param encoding: encoding used for byte names.
        """
        if isinstance(name, bytes):
            name = name.decode(encoding, 'ignore')

        out_path = join_path(base_dir, name)
        # Compare normalised absolute paths: a plain prefix test on the
        # raw string accepts names such as '../../etc/passwd', and also
        # sibling directories that merely share a prefix with base_dir.
        root = os.path.abspath(base_dir)
        target = os.path.abspath(out_path)
        inside = target.startswith(root.rstrip(os.sep) + os.sep)
        if target != root and not inside:
            return
        if os.path.exists(out_path):
            return

        out_dir = os.path.dirname(out_path)
        make_directory(out_dir)
        if os.path.isdir(out_path):
            return

        return out_path
Exemplo n.º 20
0
    def ingest(self, file_path):
        """Read an image, extract its EXIF data and convert it to PDF.

        The result is flagged as an image. Images at least ``MIN_WIDTH``
        by ``MIN_HEIGHT`` are additionally rendered to a PDF with
        ``convert`` and passed to ``pdf_alternative_extract``.

        :raises ProcessingException: if the image cannot be opened.
        """
        # Image files are binary data; a text-mode handle attempts to
        # decode them on Python 3.
        with open(file_path, 'rb') as fh:
            try:
                img = Image.open(fh)
            except DecompressionBombWarning as dce:
                # Interpolate the cause; passing it as a second argument
                # leaves the '%s' placeholder unformatted.
                raise ProcessingException("Image too large: %s" % dce)
            except IOError as ioe:
                raise ProcessingException("Cannot open image: %s" % ioe)

        # NOTE(review): img is used after its file handle is closed;
        # confirm extract_exif() needs nothing beyond what was read
        # when the image was opened.
        self.extract_exif(img)
        self.result.flag(self.result.FLAG_IMAGE)

        if img.width >= self.MIN_WIDTH and img.height >= self.MIN_HEIGHT:
            with self.create_temp_dir() as temp_dir:
                pdf_path = join_path(temp_dir, 'image.pdf')
                self.exec_command('convert', file_path, '-density', '300',
                                  '-define', 'pdf:fit-page=A4', pdf_path)
                self.assert_outfile(pdf_path)
                self.pdf_alternative_extract(pdf_path)
Exemplo n.º 21
0
    def unoconv_to_pdf(self, file_path, retry=5):
        """Convert an office document to PDF through the unoconv service.

        The document is posted to ``get_unoconv_url()`` and the streamed
        response is stored as ``<basename>.pdf`` in the work path. A
        request that raises ``RequestException`` is retried after
        sleeping ``2**attempt`` seconds, for at most ``retry`` attempts.

        :returns: path of the written PDF.
        :raises ConfigurationException: if unoconv is not available.
        :raises ProcessingException: on an HTTP 400 response (carrying
            the response text) or when every attempt failed.
        """
        if not self.is_unoconv_available():
            raise ConfigurationException("UNOSERVICE_URL is missing.")

        log.info('Converting [%s] to PDF...', self.result)
        pdf_name = '%s.pdf' % os.path.basename(file_path)
        out_path = join_path(self.work_path, pdf_name)
        file_name = self.result.file_name or 'data'
        mime_type = self.result.mime_type or DEFAULT

        for attempt in range(1, retry + 1):
            with open(file_path, 'rb') as source:
                try:
                    upload = {'file': (file_name, source, mime_type)}
                    res = requests.post(self.get_unoconv_url(),
                                        files=upload,
                                        timeout=(5, 305),
                                        stream=True)
                except RequestException as exc:
                    log.warning("Conversion failed: %s", exc)
                    time.sleep(2**attempt)
                    continue

            if res.status_code == 400:
                raise ProcessingException(res.text)

            with open(out_path, 'wb') as sink:
                for chunk in res.iter_content(chunk_size=None):
                    sink.write(chunk)
            return out_path

        raise ProcessingException("Document could not be converted to PDF.")
Exemplo n.º 22
0
    def unpack_members(self, pack, temp_dir):
        """Extract every member of ``pack`` into ``temp_dir``.

        Members are skipped when their path would lie outside
        ``temp_dir``, when something already exists at the target, or
        when the target is a directory. A failure to extract a single
        member is logged and does not stop the remaining members.

        :param pack: archive object providing ``namelist()`` and
            ``open(name)``.
        :param temp_dir: directory that receives the extracted files.
        """
        # Some archives come with non-Unicode file names, this
        # attempts to avoid that issue by naming the destination
        # explicitly.
        names = pack.namelist()
        names = [n for n in names if isinstance(n, six.binary_type)]
        # Only byte names are left, so the separator must be bytes too.
        encoding = guess_encoding(b'\n'.join(names))
        log.debug('Detected filename encoding: %s', encoding)

        root = os.path.abspath(temp_dir)
        root_prefix = root.rstrip(os.sep) + os.sep

        for name in pack.namelist():
            file_name = name
            if isinstance(name, six.binary_type):
                file_name = name.decode(encoding, 'ignore')

            out_path = join_path(temp_dir, file_name)
            # Compare normalised absolute paths: a plain prefix test on
            # the raw string lets member names containing '..' write
            # outside temp_dir.
            target = os.path.abspath(out_path)
            if target != root and not target.startswith(root_prefix):
                continue
            if os.path.exists(out_path):
                continue

            out_dir = os.path.dirname(out_path)
            make_directory(out_dir)
            if os.path.isdir(out_path):
                continue

            try:
                in_fh = pack.open(name)
                try:
                    log.debug("Unpack: %s -> %s", self.result, file_name)
                    # Member contents are raw bytes, so copy them
                    # through a binary handle.
                    with open(out_path, 'wb') as out_fh:
                        shutil.copyfileobj(in_fh, out_fh)
                finally:
                    in_fh.close()
            except Exception as ex:
                # TODO: should this be a fatal error?
                log.debug("Failed to unpack [%s]: %s", file_name, ex)