Пример #1
0
 def test_corrupted_doc(self):
     """Exploring the corrupted .doc fixture must yield no handles."""
     doc_path = os.path.join(
         test_data_path, "msoffice/corrupted/test.trunc.doc")
     source = Source.from_handle(FilesystemHandle.make_handle(doc_path))
     with SourceManager() as sm:
         found = list(source.handles(sm))
         self.assertEqual(
             found, [],
             "unrecognised CDFV2 document should be empty and wasn't")
Пример #2
0
 def test_alternative_trimming(self):
     """Exploring alternative.eml as a MailSource must give one handle."""
     eml_path = os.path.join(test_data_path, "alternative.eml")
     mail = MailSource(FilesystemHandle.make_handle(eml_path))
     with SourceManager() as sm:
         handle_count = len(list(mail.handles(sm)))
         self.assertEqual(handle_count, 1, "text/plain trimming failed")
Пример #3
0
 def test_libreoffice_size(self):
     """Each .html handle found in html-explosion.ods must be small.

     The size reported for every followed .html handle has to be below
     1048576.
     """
     ods_path = os.path.join(
         test_data_path, "libreoffice/html-explosion.ods")
     source = Source.from_handle(FilesystemHandle.make_handle(ods_path))
     with SourceManager() as sm:
         for handle in source.handles(sm):
             # Only the HTML outputs are of interest here.
             if not handle.name.endswith(".html"):
                 continue
             resource = handle.follow(sm)
             size = resource.get_size().value
             self.assertLess(size, 1048576,
                             "LibreOffice HTML output was too big")
Пример #4
0
# Locate the directory containing this script. When the code is pasted into
# an interactive session __file__ is not defined, so fall back to the
# current working directory instead.
try:
    cwd = Path(__file__).parent.absolute()
except NameError:
    cwd = Path().absolute()
fpath = cwd / '../data/files/document.docx'
# Alternative fixture: cwd / '../data/files/cpr-examples.odt'

# Set this to True to force the content to be extracted again even when an
# earlier run in the same session has already left `content` defined.
reload_content = False
try:
    content
except NameError:
    # Nothing has been extracted yet, so a load is required.
    reload_content = True

if reload_content:
    h = FilesystemHandle.make_handle(fpath)
    content = get_content_from_handle(h)

# newrule = CPRRule(modulus_11=True, ignore_irrelevant=False,
#                   examine_context=True)
# newrule.extract_surrounding_words = MethodType(extract_surrounding_words_fixed, newrule)

rules = [
    (CPRSimple(modulus_11=True, ignore_irrelevant=False,
               examine_context=True), "simple w. context"),
    (CPRComplicated(modulus_11=True,
                    ignore_irrelevant=False,
                    examine_context=True), "'accepted' w. context"),
    (CPRRule(modulus_11=True, ignore_irrelevant=False,
             examine_context=True), "current w. context"),
    # (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True),
Пример #5
0
 def test_pdf_source(self):
     """Pass a PDFSource for pdf/embedded-cpr.pdf to run_rule."""
     pdf_path = os.path.join(test_data_path, "pdf/embedded-cpr.pdf")
     pdf_handle = FilesystemHandle.make_handle(pdf_path)
     self.run_rule(PDFSource(pdf_handle))
Пример #6
0
 def test_libreoffice_source(self):
     """Pass a LibreOfficeSource for embedded-cpr.odt to run_rule."""
     odt_path = os.path.join(
         test_data_path, "libreoffice/embedded-cpr.odt")
     odt_handle = FilesystemHandle.make_handle(odt_path)
     self.run_rule(LibreOfficeSource(odt_handle))
Пример #7
0
import os.path
import unittest

from os2datascanner.engine2.model.core import SourceManager
from os2datascanner.engine2.model.file import FilesystemHandle
from os2datascanner.engine2.conversions.types import OutputType
from os2datascanner.engine2.conversions.registry import convert


# Directory containing this module; the fixture paths below are built
# relative to it.
here_path = os.path.dirname(__file__)
# Module-level handles for the PNG and HTML fixtures, created once at import
# time and followed by the test case's setUp.
image_handle = FilesystemHandle.make_handle(
        os.path.join(here_path, "data/ocr/cpr.png"))
html_handle = FilesystemHandle.make_handle(
        os.path.join(here_path, "data/html/simple.html"))



class Engine2ConversionTest(unittest.TestCase):
    """Conversion tests run against the followed fixture handles."""

    def setUp(self):
        """Create a SourceManager and follow both fixture handles with it."""
        manager = SourceManager()
        self._sm = manager

        self._ir = image_handle.follow(manager)
        self._hr = html_handle.follow(manager)

    def tearDown(self):
        """Clear the SourceManager created in setUp."""
        self._sm.clear()

    def test_last_modified(self):
        """The LastModified conversion of the image must have a value."""
        last_modified = convert(self._ir, OutputType.LastModified)
        self.assertIsNotNone(last_modified.value)
import os.path
import unittest

from os2datascanner.engine2.model.core import SourceManager
from os2datascanner.engine2.model.file import FilesystemHandle


# Directory containing this module; the fixture paths below are built
# relative to it.
here_path = os.path.dirname(__file__)
# Module-level handles for the .doc and .docx fixtures used by the MIME
# tests in this module.
doc_handle = FilesystemHandle.make_handle(
        os.path.join(
                here_path, "data", "msoffice", "test.doc"))
docx_handle = FilesystemHandle.make_handle(
        os.path.join(
                here_path, "data", "msoffice", "test.docx"))


class Engine2MIMETests(unittest.TestCase):
    def test_doc_mime(self):
        """Guessed and computed types of test.doc must be application/msword.

        The guess is taken from the handle itself; the computed type comes
        from the resource obtained by following the handle.
        """
        guessed = doc_handle.guess_type()
        self.assertEqual(
                guessed, "application/msword",
                ".doc MIME guess is incorrect")
        with SourceManager() as sm:
            computed = doc_handle.follow(sm).compute_type()
            self.assertEqual(
                    computed, "application/msword",
                    ".doc MIME computation is incorrect")

    def test_docx_mime(self):
        self.assertEqual(
                docx_handle.guess_type(),
Пример #9
0
 def test_xlsx(self):
     """Pass a handle for msoffice/test.xlsx to run_rule_on_handle."""
     xlsx_path = os.path.join(test_data_path, "msoffice/test.xlsx")
     self.run_rule_on_handle(FilesystemHandle.make_handle(xlsx_path))
Пример #10
0
 def test_ods(self):
     """Pass a handle for libreoffice/test.ods to run_rule_on_handle."""
     ods_path = os.path.join(test_data_path, "libreoffice/test.ods")
     self.run_rule_on_handle(FilesystemHandle.make_handle(ods_path))
Пример #11
0
 def test_docx(self):
     """Pass a handle for msoffice/embedded-cpr.docx to run_rule_on_handle."""
     docx_path = os.path.join(test_data_path, "msoffice/embedded-cpr.docx")
     self.run_rule_on_handle(FilesystemHandle.make_handle(docx_path))
Пример #12
0
 def test_odt(self):
     """Pass a handle for embedded-cpr.odt to run_rule_on_handle."""
     odt_path = os.path.join(
         test_data_path, "libreoffice/embedded-cpr.odt")
     self.run_rule_on_handle(FilesystemHandle.make_handle(odt_path))
Пример #13
0
 def handle(self, **kwargs):
     """Print the guess_responsible_party result for each given path.

     The paths are read from kwargs['FILE']; one "path: guesses" line is
     printed per path.
     """
     with SourceManager() as sm:
         paths = kwargs['FILE']
         for path in paths:
             file_handle = FilesystemHandle.make_handle(path)
             guesses = guess_responsible_party(file_handle, sm)
             print("{0}: {1}".format(path, guesses))