def text(self):
    """
    Return the raw plain text of the file at `self.path`.

    The extraction strategy is chosen from `self.libmagic_file_type`.

    Returns:
        str: The text content, or None when the file type is
        'inode/x-empty'.
    """

    mime = self.libmagic_file_type

    # An empty file has nothing to extract.
    if mime == 'inode/x-empty':
        return None

    # Plaintext is read directly from disk.
    if mime == 'text/plain':
        with open(self.path, 'r') as handle:
            content = handle.read()
        return content

    # HTML and PDF have dedicated extractors; every other type is handed
    # to the DOCX extractor.
    if mime == 'text/html':
        extract = utils.html_text
    elif mime == 'application/pdf':
        extract = utils.pdf_text
    else:
        extract = utils.docx_text

    return extract(self.path)
    def text(self):
        """
        Pull the raw plain text out of the file at `self.path`.

        Dispatches on `self.libmagic_file_type`.

        Returns:
            str: The text content, or None when the file type is
            'inode/x-empty'.
        """

        kind = self.libmagic_file_type

        # Empty file: nothing to read.
        if kind == 'inode/x-empty':
            return None

        # Plaintext: return the file contents as-is.
        if kind == 'text/plain':
            with open(self.path, 'r') as source:
                return source.read()

        # HTML/XML.
        if kind == 'text/html':
            return utils.html_text(self.path)

        # PDF.
        if kind == 'application/pdf':
            return utils.pdf_text(self.path)

        # Any other type is handed to the DOCX extractor.
        return utils.docx_text(self.path)
# Example #3
0
def test_extract_text(mock_osp):
    """
    Text stored in a PDF should be returned by `pdf_text`.
    """

    expected = 'text'

    # Add a PDF file holding the expected content.
    pdf_path = mock_osp.add_file(content=expected, ftype='pdf')

    # Compare after stripping surrounding whitespace from the result.
    extracted = pdf_text(pdf_path)
    assert extracted.strip() == expected
def test_extract_text(mock_osp):
    """
    `pdf_text` should give back the content written into a PDF.
    """

    content = 'text'

    # Add a PDF file with known content.
    fixture_path = mock_osp.add_file(content=content, ftype='pdf')

    # The extracted text, stripped of surrounding whitespace, should match.
    result = pdf_text(fixture_path).strip()
    assert result == content