Пример #1
0
    def get_attachments(self, tmpdir: str, filename: str,
                        parameters: dict) -> List[AttachedFile]:
        """
        Extract the files embedded into a .docx document.

        Archive members located under ``word/media/`` and ``word/embeddings/``
        are examined:

        * ``.emf`` members are skipped;
        * ``.bin`` members are opened as OLE containers: a ``CONTENTS`` stream
          that starts with ``%PDF-`` is saved as ``<member name>.pdf``,
          otherwise an ``Ole10Native`` stream (if present) is decoded with
          ``__parse_ole_contents``;
        * every other member is taken as is.

        Diagrams returned by ``__extract_diagrams`` are appended afterwards
        with ``need_content_analysis=False``.

        :param tmpdir: directory where file is located
        :param filename: name of the file from which attachments are extracted
        :param parameters: dict with different parameters for extracting
            (not read by this method)
        :return: list of AttachedFile objects produced by
            ``_content2attach_file``; an empty list if the file is not a
            ``.docx`` or if any error occurs during extraction
        """
        _, ext = splitext_(filename)
        if ext != '.docx':
            # The annotated return type is a list, so never fall through to
            # an implicit ``None`` for unsupported extensions.
            return []

        result = []
        with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
            files = zfile.namelist()
            attachments = [
                file for file in files if file.startswith("word/media/")
            ]
            attachments += [
                file for file in files
                if file.startswith("word/embeddings/")
            ]
            try:
                for attachment in attachments:
                    original_name = os.path.split(attachment)[-1]
                    if original_name.endswith('.emf'):
                        continue
                    if not original_name.endswith('.bin'):
                        result.append(
                            (original_name, zfile.read(attachment)))
                        continue

                    # ``.bin`` members are OLE containers wrapping the real file
                    ole = olefile.OleFileIO(zfile.read(attachment))
                    try:
                        if ole.exists("CONTENTS"):
                            # extracting PDF-files
                            data = ole.openstream('CONTENTS').read()
                            if data[0:5] == b'%PDF-':
                                result.append(
                                    (os.path.splitext(original_name)[0] +
                                     '.pdf', data))
                        elif ole.exists("\x01Ole10Native"):
                            # extracting files in other formats
                            data = ole.openstream("\x01Ole10Native").read()
                            original_name, contents = self.__parse_ole_contents(
                                data)
                            result.append((original_name, contents))
                    finally:
                        # release the OLE file object even if parsing fails
                        ole.close()

                attached_files = self._content2attach_file(content=result,
                                                           tmpdir=tmpdir)
                diagram_attachments = self.__extract_diagrams(zfile)
                attached_files += self._content2attach_file(
                    content=diagram_attachments,
                    tmpdir=tmpdir,
                    need_content_analysis=False)
                assert len(attached_files) == 0 or isinstance(
                    attached_files[0], AttachedFile)
                return attached_files
            except Exception as error:
                # best-effort extraction: report the problem, return nothing
                print(error)
                return []
Пример #2
0
    def parse_file(self, tmp_dir: str, filename: str,
                   parameters: Dict[str, str]) -> [UnstructuredDocument, bool]:
        """
        Read the file with the first reader from ``self.readers`` that accepts it.

        :param tmp_dir: directory where the file is located
        :param filename: name of the file (with extension) inside tmp_dir
        :param parameters: parsing parameters; the "document_type" entry is
            looked up here and the whole dict is forwarded to the reader
        :return: the pair returned by ``reader.read`` - the parsed document and
            the value this method calls ``need_analyze_attachments``
        :raises BadFileFormatException: if none of the readers accepts the file
        """
        _, extension = splitext_(filename)
        file_path = os.path.join(tmp_dir, filename)
        mime = get_file_mime_type(file_path)
        document_type = parameters.get("document_type")

        for reader in self.readers:
            accepted = reader.can_read(path=file_path,
                                       mime=mime,
                                       extension=extension,
                                       document_type=document_type)
            if not accepted:
                continue

            document, need_analyze_attachments = reader.read(
                path=file_path,
                document_type=document_type,
                parameters=parameters)
            # sanity checks on what the reader produced
            assert len(document.lines) == 0 or isinstance(
                document.lines[0], LineWithMeta)
            assert isinstance(document, UnstructuredDocument)  # TODO remove
            return document, need_analyze_attachments

        # no reader matched the file
        internal_message = "no one can read file: name = {}, extension = {}, mime = {}, document type = {}".format(
            filename, extension, mime, document_type)
        api_message = "Unsupported file format {} of the input file {}".format(
            mime, filename)
        raise BadFileFormatException(msg=internal_message,
                                     msg_api=api_message)
Пример #3
0
 def do_converting(self, tmp_dir: str, filename: str) -> str:
     """
     Convert the file with the first suitable converter and make the result read-only.

     :param tmp_dir: directory where the file is located
     :param filename: name of the file (with extension) inside tmp_dir
     :return: name of the converted file, or the original name when no
         converter from ``self.converters`` accepts the file
     """
     stem, suffix = splitext_(filename)
     mime = get_file_mime_type(os.path.join(tmp_dir, filename))

     for converter in self.converters:
         if not converter.can_convert(extension=suffix, mime=mime):
             continue
         # only the first matching converter is applied
         filename = converter.do_convert(tmp_dir, stem, suffix)
         break

     # read permission for owner, group and others; no write/execute bits
     read_only_mode = S_IREAD | S_IRGRP | S_IROTH
     os.chmod(os.path.join(tmp_dir, filename), read_only_mode)
     return filename
Пример #4
0
    def can_extract(self, mime: str, filename: str) -> bool:
        """
        Tell whether this extractor is able to process the given file.

        :param mime: mime type of the file
        :param filename: file name including its extension
        :return: True only when the mime type belongs to
            ``recognized_mimes.docx_like_format`` and the extension is '.docx'
        """
        if mime not in recognized_mimes.docx_like_format:
            return False

        _, extension = splitext_(filename)
        return extension == '.docx'
Пример #5
0
    def get_attachments(self, tmpdir: str, filename: str,
                        parameters: dict) -> List[List[Union[str, bytes]]]:
        """
        Extract the files embedded into a .docx document.

        Archive members located under ``word/media/`` and ``word/embeddings/``
        are examined:

        * ``.emf`` members are skipped;
        * ``.bin`` members are opened as OLE containers: a ``CONTENTS`` stream
          that starts with ``%PDF-`` is saved as ``<member name>.pdf``,
          otherwise an ``Ole10Native`` stream (if present) is decoded with
          ``__parse_ole_contents``;
        * every other member is taken as is.

        If an error occurs in the middle of extraction, it is printed and the
        attachments collected so far are returned.

        :param tmpdir: directory where file is located
        :param filename: name of the file from which attachments are extracted
        :param parameters: dict with different parameters for extracting
            (not read by this method)
        :return: list of lists (name of original file and binary file content);
            empty if the file is not a ``.docx``
        """
        result = []
        _, ext = splitext_(filename)
        if ext != '.docx':
            return result

        with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
            files = zfile.namelist()
            attachments = [
                file for file in files if file.startswith("word/media/")
            ]
            attachments += [
                file for file in files
                if file.startswith("word/embeddings/")
            ]
            try:
                for attachment in attachments:
                    namefile = os.path.split(attachment)[-1]
                    if namefile.endswith('.emf'):
                        continue
                    if not namefile.endswith('.bin'):
                        result.append([namefile, zfile.read(attachment)])
                        continue

                    # ``.bin`` members are OLE containers wrapping the real file
                    ole = olefile.OleFileIO(zfile.read(attachment))
                    try:
                        if ole.exists("CONTENTS"):
                            # extracting PDF-files
                            data = ole.openstream('CONTENTS').read()
                            if data[0:5] == b'%PDF-':
                                result.append([
                                    os.path.splitext(namefile)[0] + '.pdf',
                                    data
                                ])
                        elif ole.exists("\x01Ole10Native"):
                            # extracting files in other formats
                            data = ole.openstream("\x01Ole10Native").read()
                            namefile, contents = self.__parse_ole_contents(
                                data)
                            result.append([namefile, contents])
                    finally:
                        # release the OLE file object even if parsing fails
                        ole.close()
            except Exception as error:
                # best-effort extraction: report the problem, keep what we have
                print(error)
        return result
    def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[List[Union[str, bytes]]]:
        """
        Extract media files embedded into an .xlsx workbook.

        Every archive member whose name starts with ``xl/media/`` is returned.
        All members are inspected: the first entry of the archive is no longer
        discarded, and an archive without entries yields an empty list instead
        of failing on unpacking.

        :param tmpdir: directory where file is located
        :param filename: name of the file from which attachments are extracted
        :param parameters: dict with different parameters for extracting
            (not read by this method)
        :return: list of lists (base name of the member and its binary content);
            empty if the file is not an ``.xlsx``
        """
        attachments = []
        _, ext = splitext_(filename)
        if ext != '.xlsx':
            return attachments

        with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
            for member in zfile.namelist():
                if member.startswith("xl/media/"):
                    namefile = os.path.split(member)[-1]
                    attachments.append([namefile, zfile.read(member)])

        return attachments
Пример #7
0
    def get_attachments(self, tmpdir: str, filename: str,
                        parameters: dict) -> List[AttachedFile]:
        """
        Extract media files embedded into an .xlsx workbook.

        Every archive member whose name starts with ``xl/media/`` is collected
        as a (base name, binary content) pair and the pairs are passed to
        ``_content2attach_file``. All members are inspected: the first entry of
        the archive is no longer discarded, and an archive without entries no
        longer fails on unpacking.

        :param tmpdir: directory where file is located
        :param filename: name of the file from which attachments are extracted
        :param parameters: dict with different parameters for extracting
            (not read by this method)
        :return: result of ``_content2attach_file`` for the collected pairs
            (called with an empty list if the file is not an ``.xlsx``)
        """
        attachments = []
        _, ext = splitext_(filename)
        if ext == '.xlsx':
            with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
                attachments = [
                    (os.path.split(member)[-1], zfile.read(member))
                    for member in zfile.namelist()
                    if member.startswith("xl/media/")
                ]
        return self._content2attach_file(content=attachments, tmpdir=tmpdir)
Пример #8
0
 def do_converting(self, tmp_dir: str, filename: str, parameters: Optional[dict] = None) -> str:
     """
     Convert the file with the first suitable converter and make the result read-only.

     Converters whose ``can_convert`` does not list a ``parameters`` argument
     are still supported: they are called without it and a warning is issued.

     :param tmp_dir: directory where the file is located
     :param filename: name of the file (with extension) inside tmp_dir
     :param parameters: optional dict forwarded to ``can_convert`` of the
         converters that accept it
     :return: name of the converted file, or the original name when no
         converter from ``self.converters`` accepts the file
     """
     stem, suffix = splitext_(filename)
     mime = get_file_mime_type(os.path.join(tmp_dir, filename))

     for converter in self.converters:
         if "parameters" not in inspect.getfullargspec(converter.can_convert).args:
             # legacy converter: warn, then call with the old signature
             notice = ("!WARNING! you converter requires an update\n" +
                       "Please specify parameters argument in method can_convert in {}\n".format(
                           type(converter).__name__) +
                       " This parameters would be mandatory in the near future")
             warnings.warn(notice)
             suitable = converter.can_convert(extension=suffix, mime=mime)
         else:
             suitable = converter.can_convert(extension=suffix, mime=mime, parameters=parameters)

         if not suitable:
             continue
         # only the first matching converter is applied
         filename = converter.do_convert(tmp_dir, stem, suffix)
         break

     # read permission for owner, group and others; no write/execute bits
     read_only_mode = S_IREAD | S_IRGRP | S_IROTH
     os.chmod(os.path.join(tmp_dir, filename), read_only_mode)
     return filename
Пример #9
0
    def parse_file(self, tmp_dir: str, filename: str,
                   parameters: Dict[str, str]) -> UnstructuredDocument:
        """
        Read the file with the first reader from ``self.readers`` that accepts it.

        Readers whose ``can_read`` does not list a ``parameters`` argument are
        still supported: they are called without it and a warning is issued.

        :param tmp_dir: directory where the file is located
        :param filename: name of the file (with extension) inside tmp_dir
        :param parameters: parsing parameters; the "document_type" entry is
            looked up here and the whole dict is forwarded to the reader
        :return: document returned by ``reader.read``
        :raises BadFileFormatException: if none of the readers accepts the file
        """
        _, extension = splitext_(filename)
        file_path = os.path.join(tmp_dir, filename)
        mime = get_file_mime_type(file_path)
        document_type = parameters.get("document_type")

        # arguments shared by both the current and the legacy can_read call
        common_args = dict(path=file_path,
                           mime=mime,
                           extension=extension,
                           document_type=document_type)

        for reader in self.readers:
            if "parameters" not in inspect.getfullargspec(reader.can_read).args:
                # legacy reader: warn, then call with the old signature
                notice = (
                    "!WARNING! you reader requires an update\n" +
                    "Please specify parameters argument in method can_read in {}\n"
                    .format(reader) +
                    " This parameters would be mandatory in the near future")
                warnings.warn(notice)
                accepted = reader.can_read(**common_args)
            else:
                accepted = reader.can_read(**common_args,
                                           parameters=parameters)

            if not accepted:
                continue

            document = reader.read(path=file_path,
                                   document_type=document_type,
                                   parameters=parameters)
            # sanity checks on what the reader produced
            assert len(document.lines) == 0 or isinstance(
                document.lines[0], LineWithMeta)
            assert isinstance(document, UnstructuredDocument)  # TODO remove
            return document

        # no reader matched the file
        internal_message = "no one can read file: name = {}, extension = {}, mime = {}, document type = {}".format(
            filename, extension, mime, document_type)
        api_message = "Unsupported file format {} of the input file {}".format(
            mime, filename)
        raise BadFileFormatException(msg=internal_message,
                                     msg_api=api_message)
 def can_extract(self, mime: str, filename: str) -> bool:
     """
     Tell whether this extractor is able to process the given file.

     :param mime: mime type of the file
     :param filename: file name including its extension
     :return: True only when the mime type belongs to
         ``recognized_mimes.excel_like_format`` and the extension is '.xlsx'
     """
     if mime not in recognized_mimes.excel_like_format:
         return False

     _, extension = splitext_(filename)
     return extension == '.xlsx'