def test_blob_locations(client, fakedata, taskmanager):
    """A blob attached to two files is reported at both locations."""
    root = fakedata.init()
    first_dir = fakedata.directory(root, 'dir1')
    second_dir = fakedata.directory(root, 'dir2')

    # The same blob backs one file in each directory.
    shared_blob = fakedata.blob(b'hello world')
    fakedata.file(first_dir, 'foo', shared_blob)
    fakedata.file(second_dir, 'bar', shared_blob)

    taskmanager.run()

    response = CollectionApiClient(client).get_locations(shared_blob.pk)

    # Parent ids are the directory primary key prefixed with '_directory_'.
    expected_locations = [
        {
            'filename': filename,
            'parent_id': f'_directory_{parent.pk}',
            'parent_path': parent_path,
        }
        for filename, parent, parent_path in (
            ('foo', first_dir, '/dir1'),
            ('bar', second_dir, '/dir2'),
        )
    ]
    assert response['locations'] == expected_locations
def test_gpg_digest(gpg_blob, configure_gpg, client, fakedata, taskmanager):
    """Digest the ``gpg_blob`` fixture and expect a truthy ``pgp`` entry.

    NOTE(review): another ``test_gpg_digest`` is defined later in this
    module. That later definition rebinds the name, so this variant is
    shadowed and pytest will not collect it -- confirm which one should stay.
    """
    coll = fakedata.collection()
    fakedata.file(coll.root_directory, 'email', gpg_blob)

    taskmanager.run()

    collection_api = CollectionApiClient(coll, client)
    content = collection_api.get_digest(gpg_blob.pk)['content']
    assert content['pgp']
def test_gpg_digest(gpg_blob, client, fakedata, taskmanager):
    """Digest the ``gpg_blob`` fixture and expect a truthy ``pgp`` entry.

    NOTE(review): an earlier function in this module has the same name;
    this definition replaces it at import time -- confirm that is intended.
    """
    root_dir = fakedata.init()
    fakedata.file(root_dir, 'email', gpg_blob)

    taskmanager.run()

    collection_api = CollectionApiClient(client)
    content = collection_api.get_digest(gpg_blob.pk)['content']
    assert content['pgp']
def test_digest_image_exif(client, fakedata, taskmanager):
    """The digest of the sample image carries a creation date and location."""
    coll = fakedata.collection()

    image_path = TESTDATA / PATH_IMAGE
    with image_path.open('rb') as image_file:
        image_bytes = image_file.read()
    image_blob = fakedata.blob(image_bytes)
    fakedata.file(coll.root_directory, 'bikes.jpg', image_blob)

    taskmanager.run()

    collection_api = CollectionApiClient(coll, client)
    content = collection_api.get_digest(image_blob.pk)['content']

    assert content['date-created'] == '2006-02-11T11:06:37Z'
    assert content['location'] == '33.87546081542969, -116.3016196017795'
def test_tika_digested(fakedata, taskmanager, client):
    """Digest an extension-less document and check its text and dates."""
    root_dir = fakedata.init()

    # The fixture on disk has no extension; it is registered as 'file.doc'.
    doc_path = TESTDATA / './no-extension/file_doc'
    with doc_path.open('rb') as doc_file:
        doc_bytes = doc_file.read()
    doc_blob = fakedata.blob(doc_bytes)
    fakedata.file(root_dir, 'file.doc', doc_blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(doc_blob.pk)['content']

    assert "Colors and Lines to choose" in content['text']
    assert content['date'] == '2016-01-13T11:05:00Z'
    assert content['date-created'] == '2016-01-13T11:00:00Z'
def test_digest_with_broken_dependency(fakedata, taskmanager, client):
    """A broken PDF still gets a digest, with no text and a 'broken' marker."""
    root = fakedata.init()

    broken_pdf_path = TESTDATA / 'disk-files/broken.pdf'
    with broken_pdf_path.open('rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    pdf_blob = fakedata.blob(pdf_bytes)
    # The blob is identified as a PDF before any task has run.
    assert pdf_blob.mime_type == 'application/pdf'
    fakedata.file(root, 'broken.pdf', pdf_blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(pdf_blob.pk)['content']

    # Basic metadata is present even though text extraction failed.
    assert content['md5'] == 'f6e0d13c5c3aaab75b4febced3e72ae0'
    assert content['size'] == 1000
    assert content['text'] is None
    assert content['broken'] == ['tika_http_422']
def test_digest_msg(fakedata, taskmanager, client):
    """Digest an Outlook ``.msg`` file and check its metadata fields."""
    root = fakedata.init()

    msg_path = (
        TESTDATA
        / 'msg-5-outlook/DISEARĂ-Te-așteptăm-la-discuția-despre-finanțarea-culturii.msg'
    )
    with msg_path.open('rb') as msg_stream:
        msg_bytes = msg_stream.read()
    original_blob = fakedata.blob(msg_bytes)
    stored_file = fakedata.file(root, 'the.msg', original_blob)

    taskmanager.run()

    # Reload the file row so the blob looked up below is the one currently
    # stored in the database, not the in-memory value from before the run.
    stored_file.refresh_from_db()

    collection_api = CollectionApiClient(client)
    content = collection_api.get_digest(stored_file.blob.pk)['content']

    assert content['content-type'] == 'application/vnd.ms-outlook'
    assert content['filename'] == 'the.msg'
    assert content['filetype'] == 'email'
    assert content['md5'] == '38385c4487719fa9dd0fb695d3aad0ee'
    assert content['sha1'] == '90548132e18bfc3088e81918bbcaf887a68c6acc'
    assert content['size'] == 19968
def test_pdf_ocr(fakedata, taskmanager, client):
    """Check OCR text in the digest and the OCR PDF download endpoint.

    Registers the ``ocr/one`` directory as OCR source ``ocr1``, digests a
    PDF, then verifies that the digest text contains the expected phrase and
    that the OCR URL streams the PDF bytes stored in that source.
    """
    ocr_source_dir = TESTDATA.parent / 'ocr/one'
    ocr.create_ocr_source('ocr1', ocr_source_dir)

    coll = fakedata.collection()
    pdf_path = TESTDATA / 'disk-files/pdf-for-ocr/mof1_1992_233.pdf'
    with pdf_path.open('rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    pdf_blob = fakedata.blob(pdf_bytes)
    fakedata.file(coll.root_directory, 'mof1_1992_233.pdf', pdf_blob)

    taskmanager.run()

    collection_api = CollectionApiClient(coll, client)
    content = collection_api.get_digest(pdf_blob.pk)['content']
    assert "Hotărlre privind stabilirea cantităţii de gaze" in content['text']

    # Load the expected OCR output straight from the source directory.
    expected_pdf_path = (
        ocr_source_dir / 'foo/bar/f/d/fd41b8f1fe19c151517b3cda2a615fa8.pdf'
    )
    with expected_pdf_path.open('rb') as expected_file:
        expected_pdf_bytes = expected_file.read()

    response = client.get(f'/collections/testdata/{pdf_blob.pk}/ocr/ocr1/')
    streamed_bytes = b''.join(response.streaming_content)
    assert streamed_bytes == expected_pdf_bytes
    assert response['Content-Type'] == 'application/pdf'