def test_scanned_pdf(self):
        """Verify the HTTP request headers sent for a scanned PDF.

        The extractor is configured with the OCR languages 'eng+rus' and fed
        'scanned.pdf'. The headers recorded by the mock HTTP connection must
        describe a TIFF attachment named 'scanned.pdf.tiff' and carry the
        configured OCR languages.
        """
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])
        mock_injector = test_helper.MockInjector()

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected_headers = {
            'Content-Disposition': 'attachment; filename=scanned.pdf.tiff',
            'Content-type': 'image/tiff',
            'X-Tika-OCRLanguage': 'eng+rus'
        }

        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older versions.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        config[helper.DATA_ROOT] = 'local_data'
        config[helper.WORKER_ID] = 1
        config[helper.OCR_LANGUAGES] = 'eng+rus'
        config[helper.INJECTOR] = mock_injector
        extractor.setup(config)

        path = self.get_test_file('scanned.pdf')

        doc = document.get_document(path)
        doc.meta['Content-Type'] = 'application/pdf'

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual_headers = mock_injector.http_connection.request_headers

        self.assertEqual(expected_headers, actual_headers)
    def test_simple(self):
        """Verify that the mocked response text ends up in ``doc.text``.

        The mock injector is primed with a UTF-8 byte string as its response
        text; after consuming 'document.pdf' the document's text must equal
        that payload decoded as UTF-8.
        """
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected = (
            b'This is an unstructured document containing the \nidentifier '
            b'"193.34.2.1" (ip address), stored as a PDF document.')

        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older versions.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        config[helper.DATA_ROOT] = 'local_data'
        config[helper.WORKER_ID] = 1
        config[helper.INJECTOR] = test_helper.MockInjector(
            response_text=expected)
        extractor.setup(config)

        path = self.get_test_file('document.pdf')

        doc = document.get_document(path)
        doc.meta['Content-Type'] = 'application/pdf'

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual = doc.text

        self.assertEqual(expected.decode('utf-8'), actual)
示例#3
0
    def test_simple(self):
        """Verify that the mocked response text ends up in ``doc.text``.

        The comparison is done on UTF-8 encoded bytes: the expected text is
        encoded before being handed to the mock injector, and ``doc.text`` is
        encoded before the assertion.
        """
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected = (
            u'This is an unstructured document containing the \nidentifier '
            u'"193.34.2.1" (ip address), stored as a PDF document.'
        ).encode('utf-8')

        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older versions.
        # The file only needs to stay open while it is being parsed.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        config[helper.INJECTOR] = test_helper.MockInjector(
            response_text=expected)
        extractor.setup(config)

        path = self.get_test_file('document.pdf')

        doc = document.get_document(path)

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual = doc.text.encode('utf-8')

        self.assertEqual(expected, actual)
示例#4
0
 def test_simple(self):
   """Check that the name supplied via the mock injector is stored on the
   document as a 'per' entity at offset 10."""
   injector = test_helper.MockInjector(ner_entities=[(10, u'Hans Petter')])
   settings = {'code_root': '.', helper.INJECTOR: injector}

   subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
   subscriber.setup(settings)

   # 'Hans Petter' starts at character offset 10 (note the double space).
   doc = document.get_document('dummy')
   doc.text = u'Dette  er Hans Petter.'
   subscriber.consume(doc, None)

   person = {
       'entity_id': u'Hans_Petter',
       'type': u'per',
       'value': u'Hans Petter'
   }
   self.assertEqual([(10, person)], doc.entities.get_all())
示例#5
0
    def test_simple(self):
        """Check that an 'image/jpeg' Content-Type in the mocked response
        results in the document's doctype being 'picture'."""
        subscriber = file_meta.Subscriber(test_helper.get_mock_pipeline([]))

        # The mock injector is given the JSON payload as UTF-8 bytes.
        payload = json.dumps({u'Content-Type': u'image/jpeg'})
        settings = {
            'code_root': '.',
            'host': 'mock',
            helper.INJECTOR: test_helper.MockInjector(payload.encode('utf-8')),
        }
        subscriber.setup(settings)

        doc = document.get_document('mock.txt')
        subscriber.consume(doc, StringIO(u'mock'))

        self.assertEqual(u'picture', doc.doctype)
示例#6
0
    def _init(self):
        """Run one document with a single entity through the index_text
        subscriber and return the mock elastic helper's ``_bulk`` attribute."""
        pipeline = test_helper.get_mock_pipeline([])
        mock_injector = test_helper.MockInjector('{}')

        settings = {
            'tag': 'default',
            'context_size': 14,
            helper.INJECTOR: mock_injector,
        }
        subscriber = index_text.Subscriber(pipeline)
        subscriber.setup(settings)

        # 'mock-value' starts at character offset 5 of the text.
        doc = document.get_document('mock.txt')
        doc.text = 'abcd mock-value efgh'
        doc.entities.add(5, 'mock-type', 'mock-value')

        subscriber.consume(doc, None)
        subscriber.stop()

        return mock_injector.elastic_helper._bulk
示例#7
0
 def test_run(self):
     """Check that running the boot entry point marks the injected worker
     as called."""
     inject = test_helper.MockInjector()
     gransk.boot.run.run(inject, ['mock'])
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual(True, inject.worker.called)
示例#8
0
 def setUp(self):
     """Configure the UI module with mock collaborators and keep a test
     client and the mock pipeline on the test case."""
     ui = gransk.boot.ui
     injector = test_helper.MockInjector()
     ui.setup({}, MockPipelineMod.pipe, MockRunMod, injector)

     # Set the module-level 'test' flag before the client is created.
     ui._globals['test'] = True

     self.app = ui.app.test_client()
     self.pipe = MockPipelineMod.pipe