예제 #1
0
파일: dm.py 프로젝트: sfermigier/yaka-crm
  def _update(self, data, mime_type=None):
    """Store new content and refresh every attribute derived from it.

    Nothing happens when the MD5 digest of ``data`` equals ``self.digest``.
    Otherwise ``digest``, ``data``, ``size`` and (when given) ``mime_type``
    are replaced, and ``pdf``, ``text``, ``extra_metadata``, ``language`` and
    ``page_num`` are recomputed through ``converter``.

    Conversions are best-effort: on ``ConversionError`` the traceback is
    printed and the affected attribute is reset to an empty value (``None``
    for ``pdf``, ``u""`` for ``text``, ``{}`` for ``extra_metadata``) so that
    no result computed from the previous content is left behind.

    :param data: raw content; it is hashed with ``hashlib.md5`` and measured
      with ``len``.
    :param mime_type: MIME type of ``data``. When falsy, the current
      ``self.mime_type`` is kept.
    """
    new_digest = hashlib.md5(data).hexdigest()
    if new_digest == self.digest:
      # Same content as before: the derived attributes are still valid.
      return

    self.digest = new_digest
    self.data = data
    self.size = len(data)
    if mime_type:
      self.mime_type = mime_type
    # TODO Else: use a sniffer

    # TODO: This should be asynchronous
    if self.mime_type != "application/pdf":
      try:
        self.pdf = converter.to_pdf(self.digest, self.data, self.mime_type)
      except ConversionError:
        # Do not keep the PDF rendition of the previous content: digest and
        # data already describe the new content at this point.
        self.pdf = None
        traceback.print_exc()
    else:
      # The content already is a PDF, so it serves as its own rendition.
      self.pdf = self.data

    try:
      self.text = converter.to_text(self.digest, self.data, self.mime_type)
    except ConversionError:
      self.text = u""
      traceback.print_exc()

    try:
      self.extra_metadata = converter.get_metadata(self.digest, self.data, self.mime_type)
    except ConversionError:
      self.extra_metadata = {}
      traceback.print_exc()

    if self.text:
      self.language = guessLanguageName(self.text)
    # NOTE(review): when no text could be extracted, `language` keeps its
    # previous value -- confirm whether it should be reset as well.

    # Falls back to a single page when the metadata has no "PDF:Pages" entry.
    self.page_num = self.extra_metadata.get("PDF:Pages", 1)
예제 #2
0
 def XXXtest_excel_to_text(self):
     """Feed the ``test.xls`` fixture to ``converter.to_text``.

     The fixture is loaded with ``self.read_file`` and converted using the
     "application/excel" MIME type and an empty string as first argument.
     NOTE(review): the ``XXX`` prefix presumably keeps the test runner from
     collecting this method -- confirm before renaming.
     """
     xls_data = self.read_file("test.xls")
     extracted = converter.to_text("", xls_data, "application/excel")
예제 #3
0
 def XXXtest_wordx_to_text(self):
     """Feed the ``test.docx`` fixture to ``converter.to_text``.

     The fixture is loaded with ``self.read_file`` and converted using the
     "application/msword" MIME type and an empty string as first argument.
     NOTE(review): a .docx fixture is paired with the legacy Word MIME type
     here -- verify this is what the converter expects. The ``XXX`` prefix
     presumably keeps the test runner from collecting this method.
     """
     docx_data = self.read_file("test.docx")
     extracted = converter.to_text("", docx_data, "application/msword")
예제 #4
0
 def test_pdf_to_text(self):
     """Convert the ``onepage.pdf`` fixture to text with ``converter.to_text``.

     The fixture is loaded with ``self.read_file`` and converted using the
     "application/pdf" MIME type.
     """
     blob = self.read_file("onepage.pdf")
     # The first argument is an empty string -- presumably the digest/cache
     # key of the content; confirm against ``converter.to_text``.
     text = converter.to_text("", blob, "application/pdf")