def test_parse_with_archive():
    """parse() on a zip of class directories reports only image classes.

    ``class3`` holds no images and the top-level ``f1.jpeg`` has no class
    directory, so only class1/class2 survive; extension matching is
    case-insensitive (JPG, bMp, Jpeg all count).
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('f1.jpeg'),
        ]
        # Context managers instead of manual open()/close() so handles are
        # released even if a write raises.
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                # Archive members are stored relative to the temp dir.
                with cwd(td):
                    for path in files:
                        z.write(path.replace(td, './'))
            meta = parse(fname)
    assert meta == {
        'data_type': 'IMAGES',
        'classes': {
            'class1': 2,
            'class2': 3
        }
    }
def test_notify_data_invalid():
    """parse_archive() on unsupported data notifies users, not admins.

    The archive contains only a .txt inside a class dir and a class-less
    .jpg at the root, which is not a supported dataset layout.
    """
    data = gen_data()
    # NOTE(review): the line breaks inside this expected-log literal were
    # lost in a whitespace-mangled paste; reconstructed one-per-message —
    # verify against the actual notify.send output.
    log = """
    Image dataset unpacked. Parsing...
    This file doesn't contain a supported data format."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class3', 'test.txt'),
            td.child('1.jpg'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                with pytest.raises(InvalidDataFile):
                    parse_archive(Archive(fname))
    # Each notify.send() call carries one log line as its first positional arg.
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
def test_clean_working_dir():
    """clean_working_dir() empties working_dir except the ``exclude`` paths.

    Exclusions may be given relative to working_dir or as absolute paths;
    passing a bare string instead of a list raises ValueError.
    """

    def create_dir(pwd):
        # Fixture tree: one empty dir, one nested dir with files at two
        # depths, and two top-level files.
        pwd.child('empty_dir').mkdir()
        pwd.child('not_empty_dir').mkdir()
        pwd.child('not_empty_dir', 'deep_dir').mkdir()
        open(pwd.child('not_empty_dir', 'file1'), 'w').close()
        open(pwd.child('not_empty_dir', 'file2'), 'w').close()
        open(pwd.child('not_empty_dir', 'deep_dir', 'file5'), 'w').close()
        open(pwd.child('file3'), 'w').close()
        open(pwd.child('file4'), 'w').close()

    # Relative exclude path keeps the file (and, implicitly, its parents).
    with tempdir() as td:
        create_dir(td)
        clean_working_dir(exclude=['not_empty_dir/deep_dir/file5'],
                          working_dir=td)
        assert Path(td).child('not_empty_dir', 'deep_dir', 'file5').exists()
        assert len(Path(td).listdir()) == 1

    # Absolute exclude path behaves the same.
    with tempdir() as td:
        create_dir(td)
        clean_working_dir(
            exclude=[Path(td).child('not_empty_dir', 'deep_dir', 'file5')],
            working_dir=td)
        assert Path(td).child('not_empty_dir', 'deep_dir', 'file5').exists()
        assert len(Path(td).listdir()) == 1

    # No exclusions: the directory is emptied completely.
    with tempdir() as td:
        create_dir(td)
        clean_working_dir(working_dir=td)
        assert Path(td).listdir() == []

    # Multiple exclusions are all preserved.
    with tempdir() as td:
        create_dir(td)
        clean_working_dir(
            exclude=['not_empty_dir/deep_dir/file5', 'not_empty_dir/file1'],
            working_dir=td)
        assert Path(td).child('not_empty_dir', 'deep_dir', 'file5').exists()
        assert Path(td).child('not_empty_dir', 'file1').exists()
        assert len(Path(td).listdir()) == 1

    # Top-level file exclusion.  (The original repeated this exact case
    # twice verbatim; the duplicate was removed.)
    with tempdir() as td:
        create_dir(td)
        clean_working_dir(exclude=['file3'], working_dir=td)
        assert Path(td).child('file3').exists()
        assert len(Path(td).listdir()) == 1

    # ``exclude`` must be a list, not a bare string.
    with pytest.raises(ValueError):
        with tempdir() as td:
            create_dir(td)
            clean_working_dir(exclude='file3', working_dir=td)
def test_notify_archive_csv_valid():
    """User notifications emitted while parsing an archive with a CSV inside."""
    data = gen_data()
    # NOTE(review): the line breaks inside this expected-log literal were
    # lost in a whitespace-mangled paste; reconstructed one-per-message —
    # verify against the actual notify.send output.
    log = """
    Image dataset unpacked. Parsing...
    CSV file .//1.csv unpacked.
    Parsing CSV with whitespace (tab) as delimiter.
    Found 3 fields in first row, assume all the rows have this number of fields.
    Parsing...
    Analyzing data...
    The dataset appears to have a header.
    Found 2 samples."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('1.csv'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        # Overwrite the CSV member with real tab/whitespace-delimited rows
        # (header + 2 samples, CR-separated).
        with open(td.child('1.csv'), 'w') as f:
            f.write('one two free\r1 2 3\r4 5 6')
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
def test_parse_ts_archive():
    """A .ts member inside an archive yields TIMESERIES metadata.

    The .ts file carries 3 inputs | 2 outputs per timestep, two data rows
    and one empty row, which the parser must reflect in the metadata.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.ts'),
            td.child('f1.jpeg'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        # Overwrite the .ts member with real timeseries content.
        with open(td.child('class3', 'test.ts'), 'w') as f:
            f.write('1,2,3|0,1; 2.3,4,1|0,1; 1.1, 0., 0.0|1,0\n\n2,2,2|0,1;')
        with TempFile(suffix='.tar.gz') as fname:
            with tarfile.open(fname, 'w:gz') as z:
                with cwd(td):
                    for path in files:
                        z.add(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
    assert meta == {
        'data_type': 'TIMESERIES',
        'data_rows': 2,
        'empty_rows': 1,
        'min_timesteps': 1,
        'max_timesteps': 3,
        'input_size': 3,
        'output_size': 2,
        'classes': {
            '0': 1,
            '1': 3
        },
        'binary_input': False,
        'binary_output': True,
        'archive_path': './/class3/test.ts'
    }
def test_notify_archive_image_skipped():
    """Hidden and class-less images are counted as skipped in notifications.

    Of the 8 image files, 3 are skipped: ``.f1.jpg`` (leading dot),
    ``.class22/ff2.jpg`` (inside a dotted dir) and ``f1.jpeg`` (no class).
    """
    data = gen_data()
    # NOTE(review): the line breaks inside this expected-log literal were
    # lost in a whitespace-mangled paste; reconstructed one-per-message —
    # verify against the actual notify.send output.
    log = """
    Image dataset unpacked. Parsing...
    8 images found. Skipped 3 images with leading dot or without class.
    """
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class2').child('.class22').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', '.f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', '.class22', 'ff2.jpg'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('f1.jpeg'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        z.write(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
def test_parse_nested_img_archive():
    """Nested class directories become slash-joined class names in metadata."""
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        # Nested class hierarchy up to three levels deep.
        td.child('class1').child('class1B').mkdir()
        td.child('class2').child('class2B').mkdir()
        td.child('class2').child('class2B').child('class2Bi').mkdir()
        td.child('class3').child('class3B').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f1a.jpg'),
            td.child('class1').child('class1B', 'f2.jpg'),
            td.child('class2', 'f5.jpg'),
            td.child('class2').child('class2B').child('class2Bi', 'f3.jpg'),
            td.child('class3', 'f4.jpg'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        z.write(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
    assert meta == {
        'data_type': 'IMAGES',
        'classes': {
            'class1': 2,
            'class1/class1B': 1,
            'class2': 1,
            'class2/class2B/class2Bi': 1,
            'class3': 1
        }
    }
def test_notify_archive_ts_valid():
    """User notifications emitted while parsing an archive with a .ts inside."""
    data = gen_data()
    # NOTE(review): the line breaks inside this expected-log literal were
    # lost in a whitespace-mangled paste; reconstructed one-per-message —
    # verify against the actual notify.send output.
    log = """
    Image dataset unpacked. Parsing...
    Timeseries data .//class3/test.ts unpacked. Parsing...
    First timestep has 3 inputs and 2 outputs. Applying this requirement to the entire file."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.ts'),
            td.child('f1.jpeg'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        # Overwrite the .ts member with real timeseries content.
        with open(td.child('class3', 'test.ts'), 'w') as f:
            f.write('1,2,3|0,1; 2.3,4,1|0,1; 1.1, 0., 0.0|1,0\n\n2,2,2|0,1;')
        with TempFile(suffix='.tar.gz') as fname:
            with tarfile.open(fname, 'w:gz') as z:
                with cwd(td):
                    for path in files:
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
def test_invalid_archive():
    """parse_archive() raises InvalidDataFile with a clear message for
    archives that contain no supported dataset layout."""
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        # Only a .txt inside a class dir and a class-less .jpg at the root:
        # neither forms a supported dataset.
        files = [
            td.child('class3', 'test.txt'),
            td.child('1.jpg'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        z.add(path.replace(td, './'))
            with pytest.raises(InvalidDataFile) as excinfo:
                parse_archive(Archive(fname))
    assert excinfo.value.message == 'This file doesn\'t contain a supported data format.'
def test_parse_csv_archive():
    """A CSV member inside an archive yields full GENERAL (tabular) metadata.

    The CSV has a header row plus 10 CR-separated integer rows; the parser
    must report per-column stats, histograms and last-column class info.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('1.csv'),
        ]
        # Context managers instead of manual open()/close().
        for path in files:
            with open(path, 'w') as fh:
                fh.write(data)
        # Overwrite the CSV member with real whitespace-delimited rows.
        with open(td.child('1.csv'), 'w') as f:
            f.write(
                'one two free\r1 2 3\r4 5 6\r7 8 9\r1 2 3\r3 4 5\r4 5 5\r1 2 3\r0 9 9\r0 8 3\r3 3 3\r'
            )
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        z.add(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
    assert meta == {
        'version': 3,
        'size': 73,
        'archive_path': './/1.csv',
        'data_rows': 10,
        'uniques_per_col': [5, 6, 4],
        'data_type': 'GENERAL',
        'invalid_rows': 0,
        'histogram': [[5, 0, 4, 0, 1], [4, 1, 2, 0, 0, 3], [5, 2, 1, 2]],
        'bins': [[0, 1.4, 2.8, 4.2, 5.6, 7],
                 [2, 3.16667, 4.33333, 5.5, 6.66667, 7.83333, 9],
                 [3, 4.5, 6, 7.5, 9]],
        'dtypes': ['i', 'i', 'i'],
        'classes': [[], [], []],
        'last_column_info': {
            'classes': {
                '9': 2,
                '3': 5,
                '5': 2,
                '6': 1
            },
            'distrib': {
                '9': 0.2,
                '3': 0.5,
                '5': 0.2,
                '6': 0.1
            },
            'min': 3,
            'max': 9,
            'unique': 4
        },
        'names': ['one', 'two', 'free'],
        # Raw string: '\s' is an invalid escape sequence in a plain
        # literal (DeprecationWarning); r'\s+' is the same value.
        # (Key spelling "delimeter" comes from the parser's schema.)
        'delimeter': r'\s+',
        'num_columns': 3,
        'locked': [False, False, False],
        'with_header': True,
        'empty_rows': 0,
        'mean': [2.4, 4.8, 4.9],
        'stdev': [2.22111, 2.69979, 2.42441],
        'max': [7, 9, 9],
        'min': [0, 2, 3]
    }