def test_archive_zip():
    """Check that ``Archive`` lists, reads and sizes the members of a zip file.

    Two temporary files are deflated into a zip. Member names are expected to
    be the file paths without their leading ``/``, member contents are expected
    to equal the written data with carriage returns replaced by newlines, and
    ``get_member_size`` is expected to report 59 for the first file.
    """
    data1 = gen_data(5, 10)
    data2 = gen_data().replace('\n', '\r')
    with TempFile(data1) as f1:
        with TempFile(data2) as f2:
            with TempFile(suffix='.zip') as fname:
                with zipfile.ZipFile(fname, 'w',
                                     compression=zipfile.ZIP_DEFLATED) as z:
                    z.write(f1)
                    z.write(f2)

                with Archive(fname) as archive:
                    members = archive.get_members()
                # The tuple is parenthesised on purpose: the bare
                # ``for f in f1, f2`` form is a SyntaxError on Python 3.
                assert sorted(members) == sorted(
                    [f.lstrip('/') for f in (f1, f2)])

                # Pair sorted member names with the data of the equally
                # sorted source paths; each member is read from a freshly
                # opened archive.
                names = sorted(members)
                expected = sorted(zip([f1, f2], [data1, data2]))
                for index in (0, 1):
                    with Archive(fname) as archive:
                        member = archive.open_member(names[index])
                        assert ''.join(x for x in member) == \
                            expected[index][1].replace('\r', '\n')

                with Archive(fname) as archive:
                    size = archive.get_member_size(f1.lstrip('/'))
                    assert size == 59
def test_zip_write():
    """Check that ``zip_write`` output is smaller and holds the same data."""
    payload = gen_data() + ' ' * 1000
    with TempFile(payload) as source_path, TempFile() as zip_path:
        zip_write(zip_path, source_path)

        # The padded payload is expected to shrink once zipped.
        source_size = os.stat(source_path).st_size
        zipped_size = os.stat(zip_path).st_size
        assert source_size > zipped_size

        with zipfile.ZipFile(zip_path) as zipped:
            first_member = zipped.namelist()[0]
            assert payload == zipped.read(first_member)
def test_archive_invalid():
    """Check that ``Archive`` raises ``InvalidArchive`` for non-archive data.

    The same generated data is tried with no suffix, ``.zip`` and ``.tar.bz``.
    """
    data = gen_data()
    for options in ({}, {'suffix': '.zip'}, {'suffix': '.tar.bz'}):
        with TempFile(data, **options) as tf:
            with pytest.raises(InvalidArchive):
                Archive(tf)
def test_empty_archive():
    """Check that empty zip and tar.gz archives report no members."""
    # Each entry: (temp file suffix, factory creating an empty archive there).
    makers = (
        ('.zip', lambda path: zipfile.ZipFile(path, 'w')),
        ('.tar.gz', lambda path: tarfile.open(path, 'w:gz')),
    )
    for suffix, create_empty in makers:
        with TempFile(suffix=suffix) as archive_path:
            with create_empty(archive_path):
                pass
            archive = Archive(archive_path)
            assert [] == archive.get_members()
def test_open_gz():
    """Check that iterating ``open_gz`` gives the newline-split file data."""
    data = gen_data()
    with TempFile() as fname:
        # The ``with`` block closes the gzip file on exit, so no explicit
        # close() call is needed inside it.
        with gzip.open(fname, 'wb') as gz:
            gz.write(data)
        assert data.split('\n') == list(open_gz(fname))
def test_open_bz():
    """Check that iterating ``open_bz`` gives the newline-split file data."""
    data = gen_data()
    with TempFile() as fname:
        # The ``with`` block closes the bz2 file on exit, so no explicit
        # close() call is needed inside it.
        with bz2.BZ2File(fname, 'wb') as bz:
            bz.write(data)
        assert data.split('\n') == list(open_bz(fname))
def test_parse_with_archive():
    """Check the metadata ``parse`` returns for a zipped image directory tree.

    The expected result counts two files under ``class1`` and three under
    ``class2``; ``class3/test.txt`` and the top-level ``f1.jpeg`` do not appear
    in the expected classes.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('f1.jpeg'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.write(path.replace(td, './'))
            meta = parse(fname)
            assert meta == {
                'data_type': 'IMAGES',
                'classes': {
                    'class1': 2,
                    'class2': 3
                }
            }
def test_parse_with_invalid_csv():
    """Check the notifications sent for a CSV holding one non-numeric cell."""
    # Text columns are accepted, so strings are presumed to be errors only
    # when they make up less than 10% of a column. The fixture therefore has
    # 11 rows (of 4 fields each) with a single string among them. The earlier
    # wording of this note said "columns" where the fixture shows rows.
    data = '1 2 3 4\n1 2 3 4\n5 6 a 8\n7 6 5 2\n8 8 8 8\n2 2 3 9\n5 6 7 8\n12 13 45 56\n12 43 6 7\n9 9 9 0\n1 2 5 0\n'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with whitespace (tab) as delimiter. Found 4 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... No header found, first row contains data. Found 1 row with invalid values: - row 3, column 3 Found 10 samples. """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            parse(csv_path)

    # First positional argument of every notify.send call, in call order.
    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_notify_data_invalid():
    """Check notifications when an archive holds no supported data.

    A tar.bz2 with one ``.txt`` file in a class directory and one top-level
    ``.jpg`` is expected to make ``parse_archive`` raise ``InvalidDataFile``
    after sending the messages in ``log``, without calling ``admin_send``.
    """
    data = gen_data()
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Image dataset unpacked. Parsing... This file doesn't contain a supported data format."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class3', 'test.txt'),
            td.child('1.jpg'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                with pytest.raises(InvalidDataFile):
                    parse_archive(Archive(fname))
            rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
            assert rval == '\n'.join(
                x.strip() for x in log.strip().split('\n'))
            assert not notify.admin_send.called
def fail_template(data, log):
    """Parse ``data`` as a ``.ts`` file and expect ``InvalidTimeseries``.

    Args:
        data: Content handed to ``TempFile`` for the temporary ``.ts`` file.
        log: Expected notifications separated by newlines; whitespace around
            the whole text and around each line is stripped before comparing.

    Asserts that the first positional argument of each ``notify.send`` call,
    joined by newlines, equals the normalised ``log`` and that
    ``notify.admin_send`` was never called.
    """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.ts') as ts_path:
        with global_notify(notify):
            with pytest.raises(InvalidTimeseries):
                parse(ts_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_archive_tar_bz():
    """Check that ``Archive`` lists, reads and sizes members of a tar.bz file.

    Both source files use carriage returns as line separators. Member names
    are expected to be the file paths without their leading ``/``; the items
    read from the second member, joined with ``'\\r'``, are expected to equal
    the written data; ``get_member_size`` is expected to report 59 for the
    first file.
    """
    data1 = gen_data(5, 10).replace('\n', '\r')
    data2 = gen_data().replace('\n', '\r')
    with TempFile(data1) as f1:
        with TempFile(data2) as f2:
            with TempFile(suffix='.tar.bz') as fname:
                with tarfile.open(fname, 'w:bz2') as z:
                    z.add(f1)
                    z.add(f2)

                with Archive(fname) as archive:
                    members = archive.get_members()
                # The tuple is parenthesised on purpose: the bare
                # ``for f in f1, f2`` form is a SyntaxError on Python 3.
                assert sorted(members) == sorted(
                    [f.lstrip('/') for f in (f1, f2)])

                with Archive(fname) as archive:
                    member = archive.open_member(sorted(members)[1])
                    assert '\r'.join(x for x in member) == sorted(
                        zip([f1, f2], [data1, data2]))[1][1]

                with Archive(fname) as archive:
                    size = archive.get_member_size(f1.lstrip('/'))
                    assert size == 59
def test_zip_get_members():
    """Check that ``get_members`` returns zip entry names unchanged.

    The entry names include spaces, an apostrophe, leading dots and a ``./``
    prefix.
    """
    entry_names = [
        'whitespace in name.jpg',
        '1/2/test 1.jpg',
        '33/a\'a.bin',
        '.test.jpg',
        '1/.2 3/3/test.jpg',
        './.1/test/test.jpg',
    ]
    with TempFile(suffix='.zip') as zip_path:
        with zipfile.ZipFile(zip_path, 'w') as zipped:
            for entry in entry_names:
                zipped.writestr(entry, 'null')
        archive = Archive(zip_path)
        listed = archive.get_members()
        assert sorted(entry_names) == sorted(listed)
def pass_template(data, log, exp_meta):
    """Parse ``data`` as a ``.ts`` file and check metadata and notifications.

    Args:
        data: Content handed to ``TempFile`` for the temporary ``.ts`` file.
        log: Expected notifications separated by newlines; whitespace around
            the whole text and around each line is stripped before comparing.
        exp_meta: Expected metadata entries; every key in it must map to the
            same value in the result of ``parse``.

    Also asserts that ``notify.admin_send`` was never called.
    """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.ts') as ts_path:
        with global_notify(notify):
            meta = parse(ts_path)

    # Only the keys listed in exp_meta are checked; extra keys are ignored.
    for key in exp_meta:
        wanted = exp_meta[key]
        assert meta[key] == wanted

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_notify_archive_invalid():
    """Check that a ``.foo`` file makes ``parse`` raise ``InvalidArchive``.

    The only expected notification is ``'Unknown file format.'`` and
    ``admin_send`` must not be called.
    """
    data = 'thequickbrownfoxjumpsoverthelazydog'
    log = 'Unknown file format.'
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.foo') as foo_path:
        with global_notify(notify):
            with pytest.raises(InvalidArchive):
                parse(foo_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_parse_no_data():
    """Check that an empty ``.csv`` file makes ``parse`` raise ``InvalidCSV``."""
    data = ''
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ First row is empty, it must contain headers or data. This means your file isn't properly formatted (or you submitted another type of file). """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            with pytest.raises(InvalidCSV):
                parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_parse_with_invalid_csv_other_file_type():
    """Check that a PDF-like header in a ``.csv`` file raises ``InvalidCSV``."""
    data = '%PDF-1.4\n'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ CSV doesn't contain a valid delimiter. This means your file isn't properly formatted (or you submitted another type of file). """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            with pytest.raises(InvalidCSV):
                parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_parse_with_ts():
    """Check the metadata ``parse`` returns for a one-line ``.ts`` file."""
    expected = {
        'data_type': 'TIMESERIES',
        'data_rows': 1,
        'empty_rows': 0,
        'min_timesteps': 2,
        'max_timesteps': 2,
        'classes': {
            '0': 2,
        },
        'binary_input': False,
        'binary_output': True,
        'input_size': 3,
        'output_size': 2,
    }
    with TempFile('1,2,3|1, 0;2,3,4|1,0', suffix='.ts') as ts_path:
        meta = parse(ts_path)
        assert meta == expected
def test_parse_ts_archive():
    """Check the metadata ``parse_archive`` returns for a tar.gz with a ``.ts``.

    The archive mixes image-named files with ``class3/test.ts``; the expected
    result is timeseries metadata whose ``archive_path`` points at that file.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.ts'),
            td.child('f1.jpeg'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated content of the .ts file with real
        # timeseries text.
        with open(td.child('class3', 'test.ts'), 'w') as f:
            f.write('1,2,3|0,1; 2.3,4,1|0,1; 1.1, 0., 0.0|1,0\n\n2,2,2|0,1;')
        with TempFile(suffix='.tar.gz') as fname:
            with tarfile.open(fname, 'w:gz') as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.add(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
            assert meta == {
                'data_type': 'TIMESERIES',
                'data_rows': 2,
                'empty_rows': 1,
                'min_timesteps': 1,
                'max_timesteps': 3,
                'input_size': 3,
                'output_size': 2,
                'classes': {
                    '0': 1,
                    '1': 3
                },
                'binary_input': False,
                'binary_output': True,
                'archive_path': './/class3/test.ts'
            }
def test_notify_archive_csv_valid():
    """Check notifications when a tar.bz2 archive contains a valid CSV file.

    The archive mixes image-named files with ``1.csv``, whose rows are
    separated by carriage returns. ``parse_archive`` is expected to send the
    messages in ``log`` and never call ``admin_send``.
    """
    data = gen_data()
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Image dataset unpacked. Parsing... CSV file .//1.csv unpacked. Parsing CSV with whitespace (tab) as delimiter. Found 3 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... The dataset appears to have a header. Found 2 samples."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('1.csv'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated content of the CSV with a header row and
        # two data rows.
        with open(td.child('1.csv'), 'w') as f:
            f.write('one two free\r1 2 3\r4 5 6')
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
            rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
            assert rval == '\n'.join(
                x.strip() for x in log.strip().split('\n'))
            assert not notify.admin_send.called
def test_parse_with_invalid_csv_other_file_type_valid_delimiter():
    """Check that PDF-like text containing a comma still raises ``InvalidCSV``."""
    data = '%PDF,-1.4\nadsadsadsad'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with comma as delimiter. Found 2 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... The dataset is empty or isn't properly formatted. """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            with pytest.raises(InvalidCSV):
                parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    # NOTE(review): unlike the sibling tests, admin_send is not checked here;
    # confirm whether that omission is intentional.
def test_parse_not_enough_columns():
    """Check that a CSV whose first row is ``'3,'`` raises ``InvalidCSV``."""
    data = '3,\n 4,,5\n6,8,9\n'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with comma as delimiter. With selected delimiter found only 1 columns in first row, must be at least 2. This means your file isn't properly formatted (or you submitted another type of file). """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            with pytest.raises(InvalidCSV):
                parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_parse_different_delimiters_per_Row():
    """Check that rows using different delimiters raise ``InvalidCSV``.

    The first row is space-separated and the second comma-separated.
    """
    data = '4 5\n4,6\n'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with whitespace (tab) as delimiter. Found 2 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... The dataset is empty or isn't properly formatted. """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            with pytest.raises(InvalidCSV):
                parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_notify_archive_image_skipped():
    """Check notifications about skipped images in a zipped image tree.

    The tree includes a dot-prefixed file, a file inside a dot-prefixed
    directory and a top-level image. ``parse_archive`` is expected to send
    the messages in ``log`` and never call ``admin_send``.
    """
    data = gen_data()
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Image dataset unpacked. Parsing... 8 images found. Skipped 3 images with leading dot or without class. """
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class2').child('.class22').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', '.f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', '.class22', 'ff2.jpg'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('f1.jpeg'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.write(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
            rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
            assert rval == '\n'.join(
                x.strip() for x in log.strip().split('\n'))
            assert not notify.admin_send.called
def test_parse_bad_column_data_first_row():
    """Check that a CSV with ragged rows after a 4-field first row is invalid."""
    # NOTE(review): the second field is a backslash followed by "x00" (four
    # characters), not a NUL byte; confirm that is what the test intends.
    data = '3,\\x00,4,3\n4,\n6,8,9\n'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with comma as delimiter. Found 4 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... The dataset is empty or isn't properly formatted. """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            with pytest.raises(InvalidCSV):
                parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_parse_with_csv_header_notifications():
    """Check notifications and data type for a CSV with a header row.

    This test used to be called ``test_parse_with_csv``, the same name as
    another test in this module. Two module-level functions with one name
    leave only the later definition bound, so pytest would never collect this
    one; the unique name lets it run.
    """
    data = 'a,b,c,d\n\n1,2,3,4\n5,6,7,8\n'
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with comma as delimiter. Found 4 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... The dataset appears to have a header. Found 2 samples. """
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with TempFile(data, suffix='.csv') as csv:
        with global_notify(notify):
            meta = parse(csv)
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert meta['data_type'] == 'GENERAL'
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
def test_parse_nested_img_archive():
    """Check the metadata ``parse_archive`` returns for nested class folders.

    The expected classes are keyed by the directory path of each image
    relative to the archive root, such as ``class2/class2B/class2Bi``.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        td.child('class1').child('class1B').mkdir()
        td.child('class2').child('class2B').mkdir()
        td.child('class2').child('class2B').child('class2Bi').mkdir()
        td.child('class3').child('class3B').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f1a.jpg'),
            td.child('class1').child('class1B', 'f2.jpg'),
            td.child('class2', 'f5.jpg'),
            td.child('class2').child('class2B').child('class2Bi', 'f3.jpg'),
            td.child('class3', 'f4.jpg'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.write(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
            assert meta == {
                'data_type': 'IMAGES',
                'classes': {
                    'class1': 2,
                    'class1/class1B': 1,
                    'class2': 1,
                    'class2/class2B/class2Bi': 1,
                    'class3': 1
                }
            }
def test_parse_with_csv_zip():
    """Check the full metadata ``parse`` returns for a CSV inside a zip file.

    The zip holds a single ``test.csv`` member with two whitespace-separated
    rows of three integers.
    """
    with TempFile(suffix='.zip') as t:
        with zipfile.ZipFile(t, 'w') as z:
            z.writestr('test.csv', '1 2 3\n2 3 4')
        meta = parse(t)
        assert meta == {
            'data_type': 'GENERAL',
            'version': 3,
            'size': 12,
            'data_rows': 2,
            'empty_rows': 0,
            'invalid_rows': 0,
            'num_columns': 3,
            # Raw string: ``\s`` is not a valid escape in a normal string
            # literal and triggers a warning on newer Python versions. The
            # value is unchanged.
            'delimeter': r'\s+',
            'with_header': False,
            'archive_path': 'test.csv',
            'last_column_info': {
                'classes': {
                    '3': 1,
                    '4': 1
                },
                'distrib': {
                    '3': 0.5,
                    '4': 0.5
                },
                'max': 4.,
                'min': 3.,
                'unique': 2,
            },
            'histogram': [[1, 1], [1, 1], [1, 1]],
            'bins': [[1.0, 1.5, 2.0], [2.0, 2.5, 3.0], [3.0, 3.5, 4.0]],
            'uniques_per_col': [2, 2, 2],
            'classes': [[], [], []],
            'dtypes': ['i', 'i', 'i'],
            'locked': [False, False, False],
            'names': ['1', '2', '3'],
            'mean': [1.5, 2.5, 3.5],
            'stdev': [0.707107, 0.707107, 0.707107],
            'max': [2, 3, 4],
            'min': [1, 2, 3]
        }
def test_parse_with_invalid_csv_with_null_bytes():
    """Check the notifications sent for a CSV with one ``\\\\x00`` text cell."""
    # NOTE(review): the third row holds a backslash followed by "x00" (four
    # characters), not a NUL byte; confirm that is what the test intends.
    data = "1 2 3 4\n1 2 3 4\n5 6 \\x00 8\n7 6 5 2\n8 8 8 8\n2 2 3 9\n5 6 7 8\n12 13 45 56\n12 43 6 7\n9 9 9 0\n1 2 5 0\n"
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Parsing CSV with whitespace (tab) as delimiter. Found 4 fields in first row, assume all the rows have this number of fields. Parsing... Analyzing data... No header found, first row contains data. Found 1 row with invalid values: - row 3, column 3 Found 10 samples. """
    notify = mock.MagicMock(
        send=mock.MagicMock(),
        admin_send=mock.MagicMock(),
    )
    with TempFile(data, suffix='.csv') as csv_path:
        with global_notify(notify):
            parse(csv_path)

    sent = [call[0][0] for call in notify.send.call_args_list]
    expected = [line.strip() for line in log.strip().split('\n')]
    assert '\n'.join(sent) == '\n'.join(expected)
    assert not notify.admin_send.called
def test_notify_archive_ts_valid():
    """Check notifications when a tar.gz archive contains a valid ``.ts`` file.

    The archive mixes image-named files with ``class3/test.ts``.
    ``parse_archive`` is expected to send the messages in ``log`` and never
    call ``admin_send``.
    """
    data = gen_data()
    # NOTE(review): log literal reproduced as found; confirm its line breaks.
    log = """ Image dataset unpacked. Parsing... Timeseries data .//class3/test.ts unpacked. Parsing... First timestep has 3 inputs and 2 outputs. Applying this requirement to the entire file."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.ts'),
            td.child('f1.jpeg'),
        ]
        # A context manager closes each file even if the write fails, and
        # keeps the loop variable from being rebound to the file object.
        for path in files:
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated content of the .ts file with real
        # timeseries text.
        with open(td.child('class3', 'test.ts'), 'w') as f:
            f.write('1,2,3|0,1; 2.3,4,1|0,1; 1.1, 0., 0.0|1,0\n\n2,2,2|0,1;')
        with TempFile(suffix='.tar.gz') as fname:
            with tarfile.open(fname, 'w:gz') as z:
                with cwd(td):
                    for path in files:
                        # Store members under a path relative to ``td``.
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
            rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
            assert rval == '\n'.join(
                x.strip() for x in log.strip().split('\n'))
            assert not notify.admin_send.called
def test_parse_with_csv():
    """Check the full metadata ``parse`` returns for a small comma CSV.

    The file holds two rows of three integers separated by ``\\r\\n``.
    """
    with TempFile('1,2,3\r\n2,3,4', suffix='.csv') as csv:
        meta = parse(csv)
        assert meta == {
            'data_type': 'GENERAL',
            'version': 3,
            'size': 12,
            'data_rows': 2,
            'empty_rows': 0,
            'invalid_rows': 0,
            'num_columns': 3,
            # Raw string: ``\s`` is not a valid escape in a normal string
            # literal and triggers a warning on newer Python versions. The
            # value is unchanged.
            'delimeter': r'\s*,\s*',
            'with_header': False,
            'last_column_info': {
                'classes': {
                    '3': 1,
                    '4': 1
                },
                'distrib': {
                    '3': 0.5,
                    '4': 0.5
                },
                'max': 4.,
                'min': 3.,
                'unique': 2,
            },
            'histogram': [[1, 1], [1, 1], [1, 1]],
            'bins': [[1.0, 1.5, 2.0], [2.0, 2.5, 3.0], [3.0, 3.5, 4.0]],
            'uniques_per_col': [2, 2, 2],
            'classes': [[], [], []],
            'dtypes': ['i', 'i', 'i'],
            'locked': [False, False, False],
            'names': ['1', '2', '3'],
            'mean': [1.5, 2.5, 3.5],
            'stdev': [0.707107, 0.707107, 0.707107],
            'max': [2, 3, 4],
            'min': [1, 2, 3]
        }