def _update(self, data, mime_type=None):
    """Store new content and refresh every attribute derived from it.

    Returns immediately, changing nothing (not even ``mime_type``), when the
    MD5 digest of ``data`` equals ``self.digest``. Otherwise ``digest``,
    ``data`` and ``size`` are replaced, and the PDF rendition, extracted
    text, extra metadata, language and page count are recomputed
    synchronously through ``converter``.

    A ``ConversionError`` never propagates: its traceback is printed and the
    affected attribute falls back to an empty value (``None`` for ``pdf``,
    ``u""`` for ``text``, ``{}`` for ``extra_metadata``).

    :param data: raw content; must be accepted by ``hashlib.md5`` and
        ``len``.
    :param mime_type: MIME type of ``data``. When falsy, the current
        ``self.mime_type`` is kept unchanged.
    """
    new_digest = hashlib.md5(data).hexdigest()
    if new_digest == self.digest:
        # Same content as before: nothing derived needs recomputing.
        return

    self.digest = new_digest
    self.data = data
    self.size = len(data)
    if mime_type:
        self.mime_type = mime_type
    # TODO Else: use a sniffer

    # TODO: This should be asynchronous
    if self.mime_type == "application/pdf":
        # Already a PDF: the content is its own rendition.
        self.pdf = self.data
    else:
        try:
            self.pdf = converter.to_pdf(self.digest, self.data, self.mime_type)
        except ConversionError:
            # Drop any rendition of the previous content; otherwise a stale
            # PDF would be kept alongside the new data, unlike `text` and
            # `extra_metadata`, which are reset on failure.
            self.pdf = None
            traceback.print_exc()

    try:
        self.text = converter.to_text(self.digest, self.data, self.mime_type)
    except ConversionError:
        self.text = u""
        traceback.print_exc()

    try:
        self.extra_metadata = converter.get_metadata(
            self.digest, self.data, self.mime_type
        )
    except ConversionError:
        self.extra_metadata = {}
        traceback.print_exc()

    # NOTE(review): when no text could be extracted, `language` is left
    # untouched and may still describe the previous content -- confirm this
    # is intended.
    if self.text:
        self.language = guessLanguageName(self.text)

    # Fall back to a single page when the metadata has no "PDF:Pages" entry.
    self.page_num = self.extra_metadata.get("PDF:Pages", 1)
def XXXtest_excel_to_text(self):
    """Smoke-run text extraction on the ``test.xls`` fixture.

    The extracted text is not inspected; the conversion only has to
    complete without raising.
    """
    # NOTE(review): with the XXX prefix the name no longer starts with
    # `test`, so default test discovery presumably skips it -- confirm it
    # is disabled on purpose.
    xls_bytes = self.read_file("test.xls")
    converter.to_text("", xls_bytes, "application/excel")
def XXXtest_wordx_to_text(self):
    """Smoke-run text extraction on the ``test.docx`` fixture.

    The extracted text is not inspected; the conversion only has to
    complete without raising.
    """
    # NOTE(review): with the XXX prefix the name no longer starts with
    # `test`, so default test discovery presumably skips it -- confirm it
    # is disabled on purpose.
    # NOTE(review): a .docx fixture is passed with the "application/msword"
    # MIME type -- confirm this pairing is what the converter expects.
    docx_bytes = self.read_file("test.docx")
    converter.to_text("", docx_bytes, "application/msword")
def test_pdf_to_text(self):
    """Smoke-run text extraction on the ``onepage.pdf`` fixture.

    The extracted text is not inspected; the test passes as long as the
    conversion completes without raising.
    """
    pdf_bytes = self.read_file("onepage.pdf")
    converter.to_text("", pdf_bytes, "application/pdf")