def text(self):

    """
    Return the plain text of the file at `self.path`, choosing an
    extractor from the libmagic file type.

    Returns:
        The extracted text, or None when libmagic reports the file as
        empty ('inode/x-empty'). Plain-text files are read directly;
        every other type is delegated to a `utils` helper (presumably
        returning str - confirm against `utils`).
    """

    mime_type = self.libmagic_file_type

    # Empty file: there is nothing to extract.
    if mime_type == 'inode/x-empty':
        return None

    # Plain text: read the file contents as-is.
    if mime_type == 'text/plain':
        with open(self.path, 'r') as handle:
            return handle.read()

    # HTML markup.
    if mime_type == 'text/html':
        return utils.html_text(self.path)

    # PDF document.
    if mime_type == 'application/pdf':
        return utils.pdf_text(self.path)

    # Any other type is handed to the docx extractor.
    return utils.docx_text(self.path)
def test_extract_text(mock_osp):

    """
    pdf_text() should return the text content of a PDF file.
    """

    # Register a PDF fixture file whose content is 'text'.
    pdf_path = mock_osp.add_file(content='text', ftype='pdf')

    # The extracted text, ignoring surrounding whitespace, should match
    # the content the file was created with.
    extracted = pdf_text(pdf_path)
    assert extracted.strip() == 'text'