def test_scanned_pdf(self):
        """Verify the request headers recorded when a scanned PDF is consumed.

        The extractor is configured with a mock injector and 'eng+rus' as OCR
        languages. After 'scanned.pdf' is consumed, the headers captured on
        the mock injector's HTTP connection must equal the expected TIFF
        attachment headers.
        """
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])
        mock_injector = test_helper.MockInjector()

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected_headers = {
            'Content-Disposition': 'attachment; filename=scanned.pdf.tiff',
            'Content-type': 'image/tiff',
            'X-Tika-OCRLanguage': 'eng+rus'
        }

        # safe_load: yaml.load() without an explicit Loader is deprecated,
        # fails on PyYAML >= 6 and may construct arbitrary Python objects.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        config[helper.DATA_ROOT] = 'local_data'
        config[helper.WORKER_ID] = 1
        config[helper.OCR_LANGUAGES] = 'eng+rus'
        config[helper.INJECTOR] = mock_injector
        extractor.setup(config)

        path = self.get_test_file('scanned.pdf')

        doc = document.get_document(path)
        doc.meta['Content-Type'] = 'application/pdf'

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual_headers = mock_injector.http_connection.request_headers

        self.assertEqual(expected_headers, actual_headers)
    def test_simple(self):
        """Verify that the injector's response text ends up as document text.

        The mock injector is primed with the expected bytes; after
        'document.pdf' is consumed, ``doc.text`` must equal those bytes
        decoded as UTF-8.
        """
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected = (
            b'This is an unstructured document containing the \nidentifier '
            b'"193.34.2.1" (ip address), stored as a PDF document.')

        # safe_load: yaml.load() without an explicit Loader is deprecated,
        # fails on PyYAML >= 6 and may construct arbitrary Python objects.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        config[helper.DATA_ROOT] = 'local_data'
        config[helper.WORKER_ID] = 1
        config[helper.INJECTOR] = test_helper.MockInjector(
            response_text=expected)
        extractor.setup(config)

        path = self.get_test_file('document.pdf')

        doc = document.get_document(path)
        doc.meta['Content-Type'] = 'application/pdf'

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual = doc.text

        self.assertEqual(expected.decode('utf-8'), actual)
예제 #3
0
    def test_config(self):
        """Check each entity pattern in config.yml against its own test values.

        For every entity type found under ``helper.ENTITIES`` each 'test'
        value is embedded in filler text and consumed; exactly one entity
        with the matching type and value is expected afterwards.
        """
        # safe_load: yaml.load() without an explicit Loader is deprecated,
        # fails on PyYAML >= 6 and may construct arbitrary Python objects.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        _find_entities = find_entities.Subscriber(
            test_helper.get_mock_pipeline([]))
        _find_entities.setup(config)

        doc = document.get_document('dummy')

        for entity_type, pattern_conf in config.get(helper.ENTITIES,
                                                    {}).items():
            # A single test value may be given as a scalar; normalise to list.
            if not isinstance(pattern_conf['test'], list):
                pattern_conf['test'] = [pattern_conf['test']]

            for test in pattern_conf['test']:
                doc.text = 'dum dum {} dum'.format(test)
                _find_entities.consume(doc, None)
                entities = doc.entities.get_all()

                self.assertEqual(1,
                                 len(entities),
                                 msg='regex for %s found nothing' %
                                 entity_type)
                self.assertEqual(entity_type, entities[0][1]['type'])
                self.assertEqual(test, entities[0][1]['value'])
예제 #4
0
    def test_simple(self):
        """Check that content with a registered magic prefix hits the consumer.

        Two byte prefixes are registered against the same mock consumer. The
        consumed content starts with the first prefix, so the document must
        be flagged as a magic hit and the mock must have received exactly one
        item whose file object yields the full content.
        """
        pipe = test_helper.get_mock_pipeline([])

        mock_mod = test_helper.MockSubscriber()

        pipe.register_magic(b'\xFF\xEE\xDD', ('mock', mock_mod.consume))
        pipe.register_magic(b'\x00\x00\x00', ('mock', mock_mod.consume))

        _magic = magic.Subscriber(pipe)
        _magic.setup(None)

        doc = document.get_document('mock')

        content = b'\xFF\xEE\xDDMOCKMOCKMOCK'

        _magic.consume(doc, BytesIO(content))

        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(True, doc.magic_hit)
        self.assertEqual(1, len(mock_mod.produced))

        expected = content
        actual = mock_mod.produced[0][1].read()

        self.assertEqual(expected, actual)
예제 #5
0
    def test_simple(self):
        """Check that a picture document is copied below the data root.

        After a document of type 'picture' is consumed, the 'pictures'
        directory under the data root must contain exactly the expected file
        name, and the same name must be stored in ``doc.meta['picture']``.
        """
        mock_pipeline = test_helper.get_mock_pipeline([])

        data_root = os.path.join('local_data', 'unittests')

        # Start from a clean directory so the listing below is deterministic.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        _copy_picture = copy_picture.Subscriber(mock_pipeline)
        _copy_picture.setup({
            helper.DATA_ROOT: data_root,
            'workers': 1,
            'tag': 'default',
        })

        doc = document.get_document('mock.jpg')
        doc.meta['type'] = 'picture'

        with open(self.get_test_file('gransk-logo.png'), 'rb') as inp:
            _copy_picture.consume(doc, inp)

        expected = '6913571e-mock.jpg'

        # Derive the directory from data_root instead of repeating the path.
        actual = os.listdir(os.path.join(data_root, 'pictures'))

        self.assertEqual([expected], actual)
        self.assertEqual(expected, doc.meta['picture'])
예제 #6
0
    def test_simple(self):
        """Verify that the injector's response text round-trips via the document.

        The mock injector is primed with UTF-8 encoded text; after
        'document.pdf' is consumed, ``doc.text`` encoded as UTF-8 must equal
        those bytes.
        """
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected = (
            u'This is an unstructured document containing the \nidentifier '
            u'"193.34.2.1" (ip address), stored as a PDF document.'
        ).encode('utf-8')

        # safe_load: yaml.load() without an explicit Loader is deprecated,
        # fails on PyYAML >= 6 and may construct arbitrary Python objects.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp)

        # The config file only needs to stay open while it is being parsed.
        config[helper.INJECTOR] = test_helper.MockInjector(
            response_text=expected)
        extractor.setup(config)

        path = self.get_test_file('document.pdf')

        doc = document.get_document(path)

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual = doc.text.encode('utf-8')

        self.assertEqual(expected, actual)
예제 #7
0
    def test_simple(self):
        """Check that the first result related to 'e1' lists two shared items."""
        mock_pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])
        subscriber = related_entities.Subscriber(mock_pipeline)
        self._init(subscriber)

        actual = subscriber.get_related_to('e1', min_shared=2, min_score=0.2)

        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(2, len(actual[0]['shared']))
예제 #8
0
 def test_bug_2(self):
     """Regression check: the setup()-style snippet must yield two entities."""
     subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
     subscriber.setup({'code_root': '.', 'name_model': 'utils/names.gz'})

     sample = document.get_document('dummy')
     sample.text = (
         " os setup( name='recgonizer', "
         "author='Petter Christian Bjelland', version='0.3',")
     subscriber.consume(sample, None)

     found = sample.entities.get_all()
     self.assertEqual(2, len(found))
예제 #9
0
  def setUp(self):
    """Create a mock pipeline and a configured disk-image subscriber."""
    self.mock_pipe = test_helper.get_mock_pipeline(
        [helper.PROCESS_FILE, helper.TEXT])
    self.detector = unpack_diskimage.Subscriber(self.mock_pipe)

    settings = {
        'max_file_size': 1,
        helper.DATA_ROOT: 'local_data',
        'code_root': '.',
    }
    self.detector.setup(settings)
예제 #10
0
 def test_simple(self):
     """Check that 'Tom Martin' is found as a person entity at offset 10."""
     subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
     subscriber.setup({'code_root': '.', 'name_model': 'utils/names.gz'})

     sample = document.get_document('dummy')
     sample.text = 'Dette  er Tom Martin.'
     subscriber.consume(sample, None)

     person = {
         'entity_id': 'tom_martin',
         'type': 'per',
         'value': 'Tom Martin',
     }
     self.assertEqual([(10, person)], sample.entities.get_all())
예제 #11
0
  def test_size_not_overriden(self):
    """Check that an explicitly set size survives processing.

    The document size is set to 100 while its text is only four characters
    long; after consume() ``doc.meta['size']`` must still be 100.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])

    _process = process.Subscriber(mock_pipeline)
    _process.setup({})

    doc = document.get_document('mock')
    doc.set_size(100)
    doc.text = 'dcba'

    _process.consume(doc, None)

    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(100, doc.meta['size'])
    def test_simple(self):
        """Check that one document related to 'mock' shares two items."""
        mock_pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])
        subscriber = related_documents.Subscriber(mock_pipeline)
        self._init(subscriber)

        actual = subscriber.get_related_to('mock', min_shared=2, min_score=0.2)

        # Only the counts are asserted; the content of the shared entries is
        # not compared.
        self.assertEqual(1, len(actual))
        self.assertEqual(2, len(actual[0]['shared']))
예제 #13
0
  def _run_test(self, filename):
    """Run the type detector on a document built from `filename`.

    Args:
      filename: path handed to ``document.get_document``.

    Returns:
      The document after the detector has consumed it with dummy content.
    """
    mock_pipeline = test_helper.get_mock_pipeline([
        helper.DOCUMENT, helper.PICTURE,
        helper.ARCHIVE, helper.DISKIMAGE])

    detector = detect_type.Subscriber(mock_pipeline)

    # safe_load: yaml.load() without an explicit Loader is deprecated,
    # fails on PyYAML >= 6 and may construct arbitrary Python objects.
    with open('config.yml') as inp:
      detector.setup(yaml.safe_load(inp))

    doc = document.get_document(filename)

    detector.consume(doc, StringIO('dummy'))

    return doc
예제 #14
0
    def test_bug(self):
        """Regression check: only 'Jo Smith' is reported, at offset 62."""
        sample_text = (
            'MT-2009-12-015-W001 – SIMULATED WARRANT\n'
            'Computers assigned to Jo Smith from November 13, 2009 '
            'to December 12, 2009.\n')

        subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
        subscriber.setup({'code_root': '.', 'name_model': 'utils/names.gz'})

        sample = document.get_document('dummy')
        sample.text = sample_text
        subscriber.consume(sample, None)

        person = {
            'entity_id': 'jo_smith',
            'type': 'per',
            'value': 'Jo Smith',
        }
        self.assertEqual([(62, person)], sample.entities.get_all())
예제 #15
0
  def test_simple(self):
    """Check status, size and downstream output after processing a document.

    After consume() the status must differ from the initial 'untouched',
    ``doc.meta['size']`` must be 4 for the four-character text, and the mock
    consumer must have received at least one item.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])

    _process = process.Subscriber(mock_pipeline)
    _process.setup({})

    doc = document.get_document('mock')
    doc.status = 'untouched'
    doc.text = 'abcd'

    _process.consume(doc, None)

    self.assertNotEqual('untouched', doc.status)
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(4, doc.meta['size'])
    self.assertNotEqual(0, len(mock_pipeline.consumer.produced))
예제 #16
0
    def test_base(self):
        """Check the child documents produced from NUL-separated input.

        With 'min_string_length' 4 and 'max_lines' 2, consuming
        'AAAA\\x00BBBB\\x00CCCC' must produce two items, the first of which
        has the path 'mock.00000.child'.
        """
        pipe = test_helper.get_mock_pipeline([helper.RUN_PIPELINE])
        _strings = strings.Subscriber(pipe)
        _strings.setup({'min_string_length': 4, 'max_lines': 2})

        doc = document.get_document('mock')
        doc.set_size(12345)

        _strings.consume(doc, StringIO('AAAA\x00BBBB\x00CCCC'))

        # Two child documents produced.
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(2, len(pipe.consumer.produced))

        expected = 'mock.00000.child'
        actual = pipe.consumer.produced[0][0].path

        self.assertEqual(expected, actual)
예제 #17
0
    def test_simple(self):
        """Check that an 'image/jpeg' response gives doctype 'picture'."""
        subscriber = file_meta.Subscriber(test_helper.get_mock_pipeline([]))

        # The mock injector is primed with a JSON body naming the content type.
        payload = json.dumps({u'Content-Type': u'image/jpeg'}).encode('utf-8')
        settings = {
            'code_root': '.',
            'host': 'mock',
            helper.INJECTOR: test_helper.MockInjector(payload),
        }
        subscriber.setup(settings)

        sample = document.get_document('mock.txt')
        subscriber.consume(sample, StringIO(u'mock'))

        self.assertEqual(u'picture', sample.doctype)
예제 #18
0
 def test_simple(self):
   """Check that the injector-supplied name is reported as a person entity."""
   injector = test_helper.MockInjector(ner_entities=[(10, u'Hans Petter')])
   settings = {'code_root': '.', helper.INJECTOR: injector}

   subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
   subscriber.setup(settings)

   sample = document.get_document('dummy')
   sample.text = u'Dette  er Hans Petter.'
   subscriber.consume(sample, None)

   person = {
       'entity_id': u'Hans_Petter',
       'type': u'per',
       'value': u'Hans Petter',
   }
   self.assertEqual([(10, person)], sample.entities.get_all())
예제 #19
0
    def test_encrypted(self):
        """Check that a password-protected zip produces exactly one item."""
        pipeline = test_helper.get_mock_pipeline(
            [helper.PROCESS_FILE, helper.TEXT])

        unpacker = unpack_archive.Subscriber(pipeline)
        settings = {
            helper.DATA_ROOT: 'local_data',
            helper.TAG: 'test',
            helper.WORKER_ID: 0,
        }
        unpacker.setup(settings)

        archive_path = test_helper.get_test_path('password-protected.zip')
        archive = document.get_document(archive_path)
        archive.docid = '4321'

        with open(archive.path, 'rb') as stream:
            unpacker.consume(archive, stream)

        produced = pipeline.consumer.produced
        self.assertEqual(1, len(produced))
예제 #20
0
    def test_simple(self):
        """Check the text file path recorded after a document's text is stored.

        After consume() ``doc.meta['text_file']`` must equal the expected
        path below the data root.
        """
        mock_pipeline = test_helper.get_mock_pipeline([])

        data_root = os.path.join('local_data', 'unittests')

        # Start from a clean directory so earlier runs cannot interfere.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        _store_text = store_text.Subscriber(mock_pipeline)
        _store_text.setup({helper.DATA_ROOT: data_root, 'workers': 1})

        doc = document.get_document('mock')
        doc.text = 'mock-mock-mock'

        _store_text.consume(doc, None)

        expected = 'local_data/unittests/text/17404a59-mock'
        actual = doc.meta['text_file']

        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(expected, actual)
예제 #21
0
    def _init(self):
        """Index one document with a single entity and return the bulk data.

        Returns:
            The ``_bulk`` attribute of the mock injector's elastic helper
            after the subscriber has consumed the document and been stopped.
        """
        pipeline = test_helper.get_mock_pipeline([])
        mock_injector = test_helper.MockInjector('{}')

        indexer = index_text.Subscriber(pipeline)
        settings = {
            'tag': 'default',
            'context_size': 14,
            helper.INJECTOR: mock_injector,
        }
        indexer.setup(settings)

        sample = document.get_document('mock.txt')
        sample.text = 'abcd mock-value efgh'
        sample.entities.add(5, 'mock-type', 'mock-value')

        indexer.consume(sample, None)
        indexer.stop()

        return mock_injector.elastic_helper._bulk
예제 #22
0
  def test_get_network(self):
    """Check that one- and two-hop networks for 'e1' are non-empty.

    Two documents with overlapping mock entities are pushed through the
    pipeline as finished documents; the network fetched for 'e1' must then
    contain nodes and links for both hop counts.
    """
    pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])

    entity_sub = related_entities.Subscriber(pipeline)
    document_sub = related_documents.Subscriber(pipeline)
    network_sub = entity_network.Subscriber(pipeline)

    settings = {
        helper.DATA_ROOT: 'local_data/network',
        'worker_id': 1,
    }
    data_root = settings[helper.DATA_ROOT]

    # Recreate the data directory so the test starts from an empty state.
    if os.path.exists(data_root):
      shutil.rmtree(data_root)
    os.makedirs(data_root)

    for subscriber in (entity_sub, document_sub, network_sub):
      subscriber.setup(settings)

    first = document.get_document('dummy1.txt')
    for offset, value in enumerate((u'e1', u'e2', u'e3')):
      first.entities.add(offset, 'mock', value)
    pipeline.produce(helper.FINISH_DOCUMENT, first, None)

    second = document.get_document('dummy2.txt')
    second.entities.add(1, 'mock', u'e2')
    second.entities.add(2, 'mock', u'e4')
    pipeline.produce(helper.FINISH_DOCUMENT, second, None)

    one_hop = network_sub.get_for('e1', hops=1)
    two_hop = network_sub.get_for('e1', hops=2)

    for graph in (one_hop, two_hop):
      self.assertNotEqual(0, len(graph['nodes']))
      self.assertNotEqual(0, len(graph['links']))
예제 #23
0
    def test_simple(self):
        """Check that a two-file zip produces two items, the second a .txt."""
        pipeline = test_helper.get_mock_pipeline(
            [helper.PROCESS_FILE, helper.TEXT])

        unpacker = unpack_archive.Subscriber(pipeline)
        settings = {
            helper.DATA_ROOT: 'local_data',
            helper.TAG: 'test',
            helper.WORKER_ID: 0,
        }
        unpacker.setup(settings)

        archive = document.get_document(
            test_helper.get_test_path('two_files.zip'))
        archive.docid = '4321'

        with open(archive.path, 'rb') as stream:
            unpacker.consume(archive, stream)

        self.assertEqual(2, len(pipeline.consumer.produced))

        # Extension of the second produced document's file name.
        second_path = pipeline.consumer.produced[1][0].path
        file_name = second_path.split('/')[-1]
        extension = file_name.split('.')[-1]
        self.assertEqual('txt', extension)
예제 #24
0
    def test_simple(self):
        """Check that only the document with the configured extension is copied.

        Only 'xyz' is listed under ``helper.COPY_EXT``; after one '.xyz' and
        one '.doc' document are consumed, the 'files/xyz' directory must hold
        exactly the expected file.
        """
        pipeline = test_helper.get_mock_pipeline([])
        data_root = os.path.join('local_data', 'unittests')

        # Remove leftovers from earlier runs before copying anything.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        copier = copy_file.Subscriber(pipeline)
        settings = {
            helper.DATA_ROOT: data_root,
            'workers': 1,
            'tag': 'default',
            helper.COPY_EXT: ['xyz'],
        }
        copier.setup(settings)

        copier.consume(document.get_document('mock.xyz'), BytesIO(b'juba.'))
        copier.consume(document.get_document('ignore.doc'), BytesIO(b'mock'))

        target_dir = os.path.join(data_root, 'files', 'xyz')
        self.assertEqual(['39bbf948-mock.xyz'], os.listdir(target_dir))