def _init(self, subscriber):
    """Reset the related-data directory and feed three dummy documents.

    :param subscriber: Subscriber under test; it is set up, fed the
        documents and stopped before this helper returns.
    """
    data_root = 'local_data/related'
    config = {helper.DATA_ROOT: data_root, 'worker_id': 0}
    # Start every run from a clean data directory.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)
    os.makedirs(data_root)
    subscriber.setup(config)

    first = document.get_document('dummy1.txt')
    first.docid = 'mock'
    for offset, (name, entity_id) in enumerate(
            [('mock1', 'e1'), ('mock2', 'e2'), ('mock3', 'e3')]):
        first.entities.add(offset, name, entity_id)
    subscriber.consume(first, None)

    second = document.get_document('dummy2.txt')
    for offset, (name, entity_id) in enumerate(
            [('mock1', 'e1'), ('mock2', 'e2'), ('mock3', 'e4')]):
        second.entities.add(offset, name, entity_id)
    subscriber.consume(second, None)

    third = document.get_document('dummy3.txt')
    third.entities.add(0, 'mock1', 'e1')
    subscriber.consume(third, None)

    subscriber.stop()
def upload():
    """Receive and process an uploaded file.

    Responds with HTTP 400 when the request has no ``file`` part,
    otherwise feeds the file into the pipeline and responds ``ok``.
    """
    _file = request.files.get('file')
    if _file is None:
        # Guard: without this, a request lacking a 'file' part raised
        # AttributeError on _file.filename below (HTTP 500).
        return Response('missing file', status=400)
    doc = document.get_document(
        secure_filename(_file.filename),
        parent=document.get_document('root'))
    doc.tag = 'upload'
    _globals['gransk'].add_file(doc, file_object=_file)
    return Response('ok')
def init(self, config, queue, worker_id, injector):
    """
    Initialize worker and read paths from queue, stopping when queue is empty.

    :param config: Configuration object.
    :param queue: Multiprocessing Queue object.
    :param worker_id: Value identifying this worker.
    :param injector: Object from which to fetch dependencies.
    :type config: ``dict``
    :type queue: ``multiprocessing.Queue``
    :type worker_id: ``int``
    :type injector: ``gransk.core.injector.Injector``
    """
    logger = logging.getLogger('worker')
    # Expose this worker's identity and the injector to every pipeline
    # consumer through the shared config dict.
    config[helper.WORKER_ID] = worker_id
    config[helper.INJECTOR] = injector
    pipe = pipeline.build_pipeline(config)
    mod = gransk.api.Subscriber(pipe)
    mod.setup(config)
    while True:
        try:
            # Wait at most one second for new work; an empty queue is the
            # normal termination signal for the worker.
            path = queue.get(timeout=1)
        except Empty:
            logger.info('[normal stop] worker %d', worker_id)
            break
        try:
            doc = document.get_document(
                path, parent=document.get_document('root'))
            mod.consume(doc)
        except KeyboardInterrupt:
            # Ctrl-C aborts this worker but still falls through to
            # pipe.stop() and the timing report below.
            logger.info('[aborting] worker %d', worker_id)
            break
    pipe.stop()
    # Dump per-consumer timing statistics to a worker-specific CSV file.
    with open(
        os.path.join(config[helper.DATA_ROOT],
                     'time-%s.csv' % worker_id), 'w') as out:
        out.write('%s;%s;%s;%s\n' % ('consumer', 'total', 'consume_count', 'avg'))
        for consumer, (total, consume_count, avg) in pipe.get_time_report():
            out.write('%s;%.2f;%.2f;%.2f\n' % (consumer, total, consume_count, avg))
def test_config(self):
    """Verify every entity regex in config.yml matches its own test value."""
    with open('config.yml') as inp:
        # safe_load: yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the file.
        config = yaml.safe_load(inp.read())
    _find_entities = find_entities.Subscriber(
        test_helper.get_mock_pipeline([]))
    _find_entities.setup(config)
    doc = document.get_document('dummy')
    for entity_type, pattern_conf in config.get(helper.ENTITIES, {}).items():
        # Normalize a single test value to a one-element list.
        if not isinstance(pattern_conf['test'], list):
            pattern_conf['test'] = [pattern_conf['test']]
        for test in pattern_conf['test']:
            doc.text = 'dum dum {} dum'.format(test)
            _find_entities.consume(doc, None)
            entities = doc.entities.get_all()
            self.assertEqual(
                1, len(entities),
                msg='regex for %s found nothing' % entity_type)
            self.assertEqual(entity_type, entities[0][1]['type'])
            self.assertEqual(test, entities[0][1]['value'])
def _callback(self, entry, path, data_stream, doc):
    """Produce a child document for one file system entry in a disk image.

    Entries smaller than ``self.max_size`` are read and fed to the
    pipeline; larger entries are reported as oversized without reading.

    :param entry: File system entry object exposing GetStat()/GetFileObject().
    :param path: Path of the entry inside the image.
    :param data_stream: Name of the data stream to read from the entry.
    :param doc: Parent document representing the disk image itself.
    """
    stat = entry.GetStat()
    newdoc = document.get_document(path, parent=doc)
    newdoc.tag = doc.tag
    # Carry file system timestamps and size over to the child document.
    newdoc.meta['mtime'] = stat.mtime
    newdoc.meta['atime'] = stat.atime
    newdoc.meta['ctime'] = stat.ctime
    newdoc.meta['size'] = stat.size
    if stat.size < self.max_size:
        file_object = None
        try:
            file_object = entry.GetFileObject(data_stream_name=data_stream)
            self.produce(helper.PROCESS_FILE, newdoc, file_object)
            doc.children += 1
        except IOError as err:
            # Unreadable entry: log, record the error on the parent and
            # return early (the finally clause still closes the file).
            LOGGER.debug(u'could not read path "%s": %s' % (path, err))
            doc.meta['diskimage_read_error'] = six.text_type(err)
            return None
        except Exception as err:
            # Any other failure is only recorded; processing continues.
            doc.meta['diskimage_other_read_error'] = six.text_type(err)
        finally:
            if file_object:
                file_object.close()
    else:
        # Too large to read into the pipeline — report instead.
        self.produce(helper.OVERSIZED_FILE, newdoc, None)
def test_simple(self):
    """Strings extracted from the dummy EWF image should match known content."""
    subscriber = ewf_strings.Subscriber(None)
    subscriber.setup({'min_string_length': 12})
    doc = document.get_document('mock')
    image_path = 'gransk/plugins/unpackers/tests/test_data/dummy.E01'
    with open(image_path, 'rb') as inp:
        subscriber.consume(doc, inp)
    expected = u''.join([
        u"IDUMMY FAT12",
        u"Non-system disk",
        u"Press any key to reboot",
        u"DUMMY (",
        u"~1 TRA\"",
        u"FILE-B TXT",
        u". 2",
        u"Mac OS X",
        u"This resource fork intentionally left blank",
        u". 2",
        u"FSEVEN~1",
        u"000000~1",
        u"000000~2",
        u"D3E90FC1-F0EF-427D-B874-2BECB6BEA409",
        u". 0",
        u"FILE-A TXT",
        u"Hi, I'm file A.",
        u"And I'm file B.",
    ])
    actual = doc.text
    self.assertNotEqual(None, actual)
    # Compare with all whitespace stripped from both sides.
    self.assertEqual(
        re.sub(r'\s', u'', expected), re.sub(r'\s', u'', actual))
def _produce_child_doc(self, doc, text, offset):
    """Create a child document carrying *text* and feed it to the pipeline.

    The offset is zero-padded to the digit width of the parent's size so
    child paths sort lexicographically.
    """
    width = max(len('%s' % doc.meta['size']), 1)
    child = document.get_document(
        '%s.%0*d.child' % (doc.path, width, offset), parent=doc)
    child.tag = doc.tag
    child.text = text
    doc.children += 1
    self.produce(helper.RUN_PIPELINE, child, child.text)
def test_simple(self):
    """A consumed picture should be copied into the pictures directory."""
    pipe = test_helper.get_mock_pipeline([])
    data_root = os.path.join('local_data', 'unittests')
    # Start from a clean unit test directory.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)
    subscriber = copy_picture.Subscriber(pipe)
    subscriber.setup({
        helper.DATA_ROOT: data_root,
        'workers': 1,
        'tag': 'default',
    })
    doc = document.get_document('mock.jpg')
    doc.meta['type'] = 'picture'
    with open(self.get_test_file('gransk-logo.png'), 'rb') as inp:
        subscriber.consume(doc, inp)
    expected = '6913571e-mock.jpg'
    self.assertEqual([expected], os.listdir('local_data/unittests/pictures'))
    self.assertEqual(expected, doc.meta['picture'])
def test_scanned_pdf(self):
    """OCR request headers should be sent to Tika for scanned PDFs."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    mock_injector = test_helper.MockInjector()
    extractor = tika_extractor.Subscriber(mock_pipeline)
    expected_headers = {
        'Content-Disposition': 'attachment; filename=scanned.pdf.tiff',
        'Content-type': 'image/tiff',
        'X-Tika-OCRLanguage': 'eng+rus'
    }
    with open('config.yml') as inp:
        # safe_load: yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the file.
        config = yaml.safe_load(inp.read())
    config[helper.DATA_ROOT] = 'local_data'
    config[helper.WORKER_ID] = 1
    config[helper.OCR_LANGUAGES] = 'eng+rus'
    config[helper.INJECTOR] = mock_injector
    extractor.setup(config)
    path = self.get_test_file('scanned.pdf')
    doc = document.get_document(path)
    doc.meta['Content-Type'] = 'application/pdf'
    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)
    actual_headers = mock_injector.http_connection.request_headers
    self.assertEqual(expected_headers, actual_headers)
def test_simple(self):
    """Extracted document text should match the mocked Tika response."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    extractor = tika_extractor.Subscriber(mock_pipeline)
    expected = (
        b'This is an unstructured document containing the \nidentifier '
        b'"193.34.2.1" (ip address), stored as a PDF document.')
    with open('config.yml') as inp:
        # safe_load: yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the file.
        config = yaml.safe_load(inp.read())
    config[helper.DATA_ROOT] = 'local_data'
    config[helper.WORKER_ID] = 1
    config[helper.INJECTOR] = test_helper.MockInjector(
        response_text=expected)
    extractor.setup(config)
    path = self.get_test_file('document.pdf')
    doc = document.get_document(path)
    doc.meta['Content-Type'] = 'application/pdf'
    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)
    actual = doc.text
    self.assertEqual(expected.decode('utf-8'), actual)
def test_simple(self):
    """A magic-byte match should route the payload to the registered consumer."""
    pipe = test_helper.get_mock_pipeline([])
    # Note: the original created a second, immediately-discarded
    # MockSubscriber; one instance is sufficient.
    mock_mod = test_helper.MockSubscriber()
    # Register the same consumer for two different magic headers.
    pipe.register_magic(b'\xFF\xEE\xDD', ('mock', mock_mod.consume))
    pipe.register_magic(b'\x00\x00\x00', ('mock', mock_mod.consume))
    _magic = magic.Subscriber(pipe)
    _magic.setup(None)
    doc = document.get_document('mock')
    content = b'\xFF\xEE\xDDMOCKMOCKMOCK'
    _magic.consume(doc, BytesIO(content))
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(True, doc.magic_hit)
    self.assertEqual(1, len(mock_mod.produced))
    expected = content
    actual = mock_mod.produced[0][1].read()
    self.assertEqual(expected, actual)
def test_simple(self):
    """Strings extracted from the dummy EWF image should match known content."""
    subscriber = ewf_strings.Subscriber(None)
    subscriber.setup({'min_string_length': 12})
    doc = document.get_document('mock')
    image_path = 'gransk/plugins/unpackers/tests/test_data/dummy.E01'
    with open(image_path, 'rb') as inp:
        subscriber.consume(doc, inp)
    expected = ''.join([
        'IDUMMY FAT12',
        'Non-system disk',
        'Press any key to reboot',
        'DUMMY (',
        '~1 TRA"',
        'FILE-B TXT',
        '. 2',
        'Mac OS X',
        'This resource fork intentionally left blank',
        '. 2',
        'FSEVEN~1',
        '000000~1',
        '000000~2',
        'D3E90FC1-F0EF-427D-B874-2BECB6BEA409',
        '. 0',
        'FILE-A TXT',
        "Hi, I'm file A.",
        "And I'm file B.",
    ])
    actual = doc.text
    self.assertNotEqual(None, actual)
    # Compare with all whitespace stripped from both sides.
    self.assertEqual(re.sub(r'\s', '', expected), re.sub(r'\s', '', actual))
def test_simple(self):
    """Extracted text (unicode path) should match the mocked Tika response."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    extractor = tika_extractor.Subscriber(mock_pipeline)
    expected = (
        u'This is an unstructured document containing the \nidentifier '
        u'"193.34.2.1" (ip address), stored as a PDF document.'
    ).encode('utf-8')
    with open('config.yml') as inp:
        # safe_load: yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the file.
        config = yaml.safe_load(inp.read())
    config[helper.INJECTOR] = test_helper.MockInjector(
        response_text=expected)
    extractor.setup(config)
    path = self.get_test_file('document.pdf')
    doc = document.get_document(path)
    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)
    actual = doc.text.encode('utf-8')
    self.assertEqual(expected, actual)
def consume(self, doc, payload):
    """
    Writes payload to disk and unpack archive using 7zip. Then adds all
    unpacked files to the pipeline.

    :param doc: Document object.
    :param payload: File pointer belonging to document.
    :type doc: ``gransk.core.document.Document``
    :type payload: ``file``
    """
    # Child documents inherit the document's tag, falling back to the
    # configured default tag.
    tag = self.config[helper.TAG]
    if doc.tag:
        tag = doc.tag
    filename = os.path.basename(doc.path)
    # Prefix with the first 8 chars of the docid to avoid name clashes.
    unique_filename = '%s-%s' % (doc.docid[0:8], filename)
    unpack_to = os.path.join(self.config[helper.DATA_ROOT],
                             'archives', unique_filename)
    if not os.path.exists(unpack_to):
        os.makedirs(unpack_to)
    # Spool the payload to a temp file so the external 7zip process can
    # read it from disk; worker id keeps paths unique across workers.
    tmp_path = os.path.join(
        self.tmp_root, '%s-%s.%s' % (self.wid, doc.docid[0:8], doc.ext))
    if not os.path.exists(self.tmp_root):
        os.makedirs(self.tmp_root)
    with open(tmp_path, 'wb') as out:
        payload.seek(0)
        out.write(payload.read())
        payload.seek(0)
    cmd = self._get_cmd(tmp_path, unpack_to, doc.meta['Content-Type'])
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if err:
        # Record extraction errors but still process whatever was unpacked.
        doc.meta['archive_error'] = err.decode('utf-8')
    # Feed every unpacked file back into the pipeline as a child document.
    for folder, _, filenames in os.walk(unpack_to):
        for filename in filenames:
            path = os.path.join(folder, filename)
            new_doc = document.get_document(path, parent=doc)
            new_doc.tag = tag
            with open(path, "rb") as file_object:
                self.produce(helper.EXTRACT_META, new_doc, file_object)
                self.produce(helper.PROCESS_FILE, new_doc, file_object)
            doc.children += 1
    # Clean up both the spooled payload and the unpacked tree.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)
    shutil.rmtree(unpack_to)
def test_bug_2(self):
    """Regression: names inside setup()-style code should still be found."""
    # NOTE(review): the exact whitespace of this snippet was reconstructed;
    # the assertion only depends on the two names being detected.
    snippet = """
os
setup(
    name='recgonizer',
    author='Petter Christian Bjelland',
    version='0.3',"""
    config = {'code_root': '.', 'name_model': 'utils/names.gz'}
    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup(config)
    doc = document.get_document('dummy')
    doc.text = snippet
    subscriber.consume(doc, None)
    self.assertEqual(2, len(doc.entities.get_all()))
def test_no_match(self):
    """Non-picture input should leave image dimension metadata unset."""
    subscriber = picture_meta.Subscriber(None)
    subscriber.setup({})
    doc = document.get_document('mock')
    with open(self.get_test_file('document.pdf'), 'rb') as file_object:
        subscriber.consume(doc, file_object)
    self.assertEqual(None, doc.meta.get('img_width'))
    self.assertEqual(None, doc.meta.get('img_height'))
def test_get_network(self):
    """Entity network should return nodes and links for 1- and 2-hop queries."""
    pipe = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])
    entities = related_entities.Subscriber(pipe)
    documents = related_documents.Subscriber(pipe)
    network = entity_network.Subscriber(pipe)
    data_root = 'local_data/network'
    config = {helper.DATA_ROOT: data_root, 'worker_id': 1}
    # Start from a clean data directory.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)
    os.makedirs(data_root)
    for subscriber in (entities, documents, network):
        subscriber.setup(config)
    first = document.get_document('dummy1.txt')
    first.entities.add(0, 'mock', u'e1')
    first.entities.add(1, 'mock', u'e2')
    first.entities.add(2, 'mock', u'e3')
    pipe.produce(helper.FINISH_DOCUMENT, first, None)
    second = document.get_document('dummy2.txt')
    second.entities.add(1, 'mock', u'e2')
    second.entities.add(2, 'mock', u'e4')
    pipe.produce(helper.FINISH_DOCUMENT, second, None)
    one_hop = network.get_for('e1', hops=1)
    two_hop = network.get_for('e1', hops=2)
    self.assertNotEqual(0, len(one_hop['nodes']))
    self.assertNotEqual(0, len(one_hop['links']))
    self.assertNotEqual(0, len(two_hop['nodes']))
    self.assertNotEqual(0, len(two_hop['links']))
def test_simple(self):
    """A simple sentence should yield one person entity at the right offset."""
    config = {'code_root': '.', 'name_model': 'utils/names.gz'}
    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup(config)
    doc = document.get_document('dummy')
    doc.text = 'Dette er Tom Martin.'
    subscriber.consume(doc, None)
    expected = [(10, {
        'entity_id': 'tom_martin',
        'type': 'per',
        'value': 'Tom Martin'
    })]
    self.assertEqual(expected, doc.entities.get_all())
def test_size_not_overriden(self):
    """An explicitly set document size must not be overwritten by consume."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])
    _process = process.Subscriber(mock_pipeline)
    _process.setup({})
    doc = document.get_document('mock')
    doc.set_size(100)
    doc.text = 'dcba'
    _process.consume(doc, None)
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(100, doc.meta['size'])
def test_simple(self):
    """Disk image entries should be produced in the expected order."""
    doc = document.get_document(test_helper.get_test_path('dummy.E01'))
    doc.docid = '4321'
    # The E01 image is binary data: it must be opened in binary mode.
    # Text mode raises decode errors on Python 3 and mangles bytes on
    # Windows.
    with open(doc.path, 'rb') as inp:
        self.detector.consume(doc, inp)
    actual = [doc.path for doc, _ in self.mock_pipe.consumer.produced]
    expected = [
        '/DUMMY (Volume Label Entry)',
        '/test/file-a.txt',
        '/file-b.txt'
    ]
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(expected, actual)
def _run_test(self, filename):
    """Run the type detector on *filename* with dummy content.

    :param filename: Document path whose type should be detected.
    :returns: The consumed document, for assertions by the caller.
    """
    mock_pipeline = test_helper.get_mock_pipeline([
        helper.DOCUMENT, helper.PICTURE, helper.ARCHIVE, helper.DISKIMAGE])
    detector = detect_type.Subscriber(mock_pipeline)
    with open('config.yml') as inp:
        # safe_load: yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the file.
        detector.setup(yaml.safe_load(inp.read()))
    doc = document.get_document(filename)
    detector.consume(doc, StringIO('dummy'))
    return doc
def test_simple(self):
    """Only documents with a configured extension should be copied."""
    pipe = test_helper.get_mock_pipeline([])
    data_root = os.path.join('local_data', 'unittests')
    # Start from a clean unit test directory.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)
    subscriber = copy_file.Subscriber(pipe)
    subscriber.setup({
        helper.DATA_ROOT: data_root,
        'workers': 1,
        'tag': 'default',
        helper.COPY_EXT: ['xyz']
    })
    subscriber.consume(document.get_document('mock.xyz'), BytesIO(b'juba.'))
    subscriber.consume(document.get_document('ignore.doc'), BytesIO(b'mock'))
    actual = os.listdir(os.path.join(data_root, 'files', 'xyz'))
    self.assertEqual(['39bbf948-mock.xyz'], actual)
def test_simple(self):
    """Consume should update status and size, and produce downstream events."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])
    _process = process.Subscriber(mock_pipeline)
    _process.setup({})
    doc = document.get_document('mock')
    doc.status = 'untouched'
    doc.text = 'abcd'
    _process.consume(doc, None)
    self.assertNotEqual('untouched', doc.status)
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(4, doc.meta['size'])
    self.assertNotEqual(0, len(mock_pipeline.consumer.produced))
def test_bug(self):
    """Regression: a name following dates/identifiers should still be found."""
    # Offset 62 = 40 chars of the first line (incl. newline) + 22 chars
    # before 'Jo Smith' on the second line.
    warrant_text = """MT-2009-12-015-W001 – SIMULATED WARRANT
Computers assigned to Jo Smith from November 13, 2009 to December 12, 2009.
"""
    config = {'code_root': '.', 'name_model': 'utils/names.gz'}
    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup(config)
    doc = document.get_document('dummy')
    doc.text = warrant_text
    subscriber.consume(doc, None)
    expected = [(62, {
        'entity_id': 'jo_smith',
        'type': 'per',
        'value': 'Jo Smith'
    })]
    self.assertEqual(expected, doc.entities.get_all())
def test_base(self):
    """NUL-separated strings should be split into child documents."""
    pipe = test_helper.get_mock_pipeline([helper.RUN_PIPELINE])
    _strings = strings.Subscriber(pipe)
    _strings.setup({'min_string_length': 4, 'max_lines': 2})
    doc = document.get_document('mock')
    doc.set_size(12345)
    _strings.consume(doc, StringIO('AAAA\x00BBBB\x00CCCC'))
    # Two child documents produced.
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(2, len(pipe.consumer.produced))
    expected = 'mock.00000.child'
    actual = pipe.consumer.produced[0][0].path
    self.assertEqual(expected, actual)
def test_simple(self):
    """Entities reported by the NER injector should be added to the document."""
    config = {
        'code_root': '.',
        helper.INJECTOR: test_helper.MockInjector(
            ner_entities=[(10, u'Hans Petter')])
    }
    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup(config)
    doc = document.get_document('dummy')
    doc.text = u'Dette er Hans Petter.'
    subscriber.consume(doc, None)
    expected = [(10, {
        'entity_id': u'Hans_Petter',
        'type': u'per',
        'value': u'Hans Petter'
    })]
    self.assertEqual(expected, doc.entities.get_all())
def test_simple(self):
    """A Content-Type from the meta service should set the document type."""
    subscriber = file_meta.Subscriber(test_helper.get_mock_pipeline([]))
    response = json.dumps({u'Content-Type': u'image/jpeg'}).encode('utf-8')
    subscriber.setup({
        'code_root': '.',
        'host': 'mock',
        helper.INJECTOR: test_helper.MockInjector(response)
    })
    doc = document.get_document('mock.txt')
    subscriber.consume(doc, StringIO(u'mock'))
    self.assertEqual(u'picture', doc.doctype)
def test_simple(self):
    """The stored text path should be recorded in document metadata."""
    mock_pipeline = test_helper.get_mock_pipeline([])
    data_root = os.path.join('local_data', 'unittests')
    # Start from a clean unit test directory.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)
    _store_text = store_text.Subscriber(mock_pipeline)
    _store_text.setup({helper.DATA_ROOT: data_root, 'workers': 1})
    doc = document.get_document('mock')
    doc.text = 'mock-mock-mock'
    _store_text.consume(doc, None)
    expected = 'local_data/unittests/text/17404a59-mock'
    actual = doc.meta['text_file']
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(expected, actual)
def test_encrypted(self):
    """A password-protected archive should still yield one produced item."""
    pipe = test_helper.get_mock_pipeline([helper.PROCESS_FILE, helper.TEXT])
    subscriber = unpack_archive.Subscriber(pipe)
    subscriber.setup({
        helper.DATA_ROOT: 'local_data',
        helper.TAG: 'test',
        helper.WORKER_ID: 0
    })
    doc = document.get_document(
        test_helper.get_test_path('password-protected.zip'))
    doc.docid = '4321'
    with open(doc.path, 'rb') as inp:
        subscriber.consume(doc, inp)
    self.assertEqual(1, len(pipe.consumer.produced))
def _init(self):
    """Index a dummy document with one entity and return the bulk payload
    that was handed to the mock Elastic helper."""
    pipe = test_helper.get_mock_pipeline([])
    injector = test_helper.MockInjector('{}')
    subscriber = index_text.Subscriber(pipe)
    subscriber.setup({
        'tag': 'default',
        'context_size': 14,
        helper.INJECTOR: injector
    })
    doc = document.get_document('mock.txt')
    doc.text = 'abcd mock-value efgh'
    doc.entities.add(5, 'mock-type', 'mock-value')
    subscriber.consume(doc, None)
    subscriber.stop()
    return injector.elastic_helper._bulk