def test_bulk_add_data_sample_sync_ok(self):
    self.add_default_data_manager()

    url = reverse('substrapp:data_sample-list')

    file_mock = MagicMock(spec=InMemoryUploadedFile)
    file_mock2 = MagicMock(spec=InMemoryUploadedFile)
    file_mock.name = 'foo.zip'
    file_mock2.name = 'bar.zip'
    file_mock.read = MagicMock(return_value=self.data_file.read())
    file_mock2.read = MagicMock(return_value=self.data_file_2.read())

    data = {
        file_mock.name: file_mock,
        file_mock2.name: file_mock2,
        'data_manager_keys': [
            get_hash(self.data_data_opener),
            get_hash(self.data_data_opener2)
        ],
        'test_only': True,
    }
    extra = {
        'HTTP_ACCEPT': 'application/json;version=0.0',
    }

    with mock.patch(
            'substrapp.serializers.ledger.datasample.util.create_ledger_assets'
    ) as mcreate_ledger_assets:
        self.data_file.seek(0)
        self.data_file_2.seek(0)
        ledger_data = {
            'pkhash': [get_dir_hash(file_mock), get_dir_hash(file_mock2)],
            'validated': True
        }
        mcreate_ledger_assets.return_value = ledger_data

        response = self.client.post(url, data, format='multipart', **extra)
        r = response.json()
        self.assertEqual(len(r), 2)
        self.assertEqual(r[0]['pkhash'], get_dir_hash(file_mock))
        self.assertTrue(r[0]['path'].endswith(
            f'/datasamples/{get_dir_hash(file_mock)}'))
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
def map_data_sample(paths):
    data_sample = []
    for file_or_path in paths:
        if os.path.exists(file_or_path):
            # file case
            if os.path.isfile(file_or_path):
                with open(file_or_path, 'rb') as f:
                    filename = path_leaf(file_or_path)
                    file = ContentFile(f.read(), filename)
                pkhash = get_dir_hash(file)
                check(file_or_path, pkhash, data_sample)
                data_sample.append({'pkhash': pkhash, 'file': file})
            # directory case
            elif os.path.isdir(file_or_path):
                pkhash = dirhash(file_or_path, 'sha256')
                check(file_or_path, pkhash, data_sample)
                data_sample.append({
                    'pkhash': pkhash,
                    'path': normpath(file_or_path)
                })
            else:
                raise Exception(f'{file_or_path} is not a file or a directory')
        else:
            raise Exception(f'File or Path: {file_or_path} does not exist')
    return data_sample
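# map_data_sample delegates duplicate detection to a check helper that is not
# shown here. A minimal sketch of what it presumably does, assuming its only
# job is to reject a pkhash already present in the accumulated list (the
# exact error message is an assumption):
def check(file_or_path, pkhash, data_sample):
    for entry in data_sample:
        if entry['pkhash'] == pkhash:
            raise Exception(
                f'Data sample with pkhash {pkhash} ({file_or_path}) '
                f'has already been added')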
def prepare_data_sample(directory, tuple_):
    """Prepare data samples for tuple execution."""
    from substrapp.models import DataSample
    for data_sample_key in tuple_['dataset']['data_sample_keys']:
        data_sample = DataSample.objects.get(key=data_sample_key)

        if not os.path.exists(data_sample.path) or not os.path.isdir(data_sample.path):
            raise Exception(
                f'Data Sample ({data_sample.path}) is missing in local storage')

        if not os.listdir(data_sample.path):
            raise Exception(
                f'Data Sample ({data_sample.path}) is empty in local storage')

        data_sample_checksum = get_dir_hash(data_sample.path)
        if data_sample_checksum != data_sample.checksum:
            raise Exception(
                'Data Sample checksum in tuple is not the same as in local db')

        # create a symlink on the folder containing data
        data_directory = os.path.join(directory, 'data', data_sample_key)
        try:
            if not os.path.exists(data_directory):
                os.symlink(data_sample.path, data_directory)
            if os.path.realpath(data_directory) != data_sample.path:
                raise Exception(
                    f'Symlink ({data_directory}) for data sample {data_sample.path} '
                    f'does not match (it currently points to '
                    f'{os.path.realpath(data_directory)})')
        except OSError as e:
            logger.exception(e)
            raise Exception('Failed to create symlink for tuple data sample')
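# For reference, a minimal sketch of the tuple_ payload prepare_data_sample
# expects, with only the keys actually read above (the key value below is a
# made-up placeholder; real ledger tuples carry many more fields):
tuple_ = {
    'dataset': {
        'data_sample_keys': [
            '42303efa663015e729159833a12ffb510ff70da1bd5cb91998cd9f975844bea6',  # placeholder
        ],
    },
}
# creates <directory>/data/<key> symlinks pointing at each sample's local path
prepare_data_sample('/tmp/tuple_workdir', tuple_)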
def test_add_data_sample_ko_408(self):
    url = reverse('substrapp:data_sample-list')

    self.add_default_data_manager()

    file_mock = MagicMock(spec=InMemoryUploadedFile)
    file_mock.name = 'foo.zip'
    file_mock.read = MagicMock(return_value=self.data_file.file.read())
    file_mock.open = MagicMock(return_value=file_mock)

    data = {
        'file': file_mock,
        'data_manager_keys': [get_hash(self.data_data_opener)],
        'test_only': True,
    }
    extra = {
        'HTTP_ACCEPT': 'application/json;version=0.0',
    }

    with mock.patch.object(zipfile, 'is_zipfile') as mis_zipfile, \
            mock.patch.object(LedgerDataSampleSerializer, 'create') as mcreate:
        mcreate.side_effect = LedgerTimeout('Timeout')
        mis_zipfile.return_value = True
        response = self.client.post(url, data, format='multipart', **extra)
        r = response.json()
        self.assertEqual(r['message'], {
            'pkhash': [get_dir_hash(file_mock)],
            'validated': False
        })
        self.assertEqual(response.status_code, status.HTTP_408_REQUEST_TIMEOUT)
def test_bulk_add_data_sample_ko_same_pkhash(self):
    self.add_default_data_manager()

    url = reverse('substrapp:data_sample-list')

    file_mock = MagicMock(spec=InMemoryUploadedFile)
    file_mock2 = MagicMock(spec=InMemoryUploadedFile)
    file_mock.name = 'foo.zip'
    file_mock2.name = 'bar.tar.gz'
    file_mock.read = MagicMock(return_value=self.data_file.read())
    file_mock2.read = MagicMock(return_value=self.data_tar_file.read())

    data = {
        file_mock.name: file_mock,
        file_mock2.name: file_mock2,
        'data_manager_keys': [get_hash(self.data_data_opener)],
        'test_only': True,
    }
    extra = {
        'HTTP_ACCEPT': 'application/json;version=0.0',
    }

    with mock.patch('substrapp.serializers.datasample.DataSampleSerializer.get_validators') as mget_validators, \
            mock.patch.object(LedgerDataSampleSerializer, 'create') as mcreate:
        mget_validators.return_value = []
        self.data_file.seek(0)
        self.data_tar_file.seek(0)
        ledger_data = {
            'pkhash': [get_dir_hash(file_mock), get_dir_hash(file_mock2)],
            'validated': False
        }
        mcreate.return_value = ledger_data, status.HTTP_408_REQUEST_TIMEOUT

        response = self.client.post(url, data, format='multipart', **extra)
        r = response.json()
        self.assertEqual(DataSample.objects.count(), 0)
        self.assertEqual(
            r['message'],
            f'Your data sample archives contain the same files, leading to the same pkhash. '
            f'Please review the content of your archives. '
            f'Archives {file_mock2.name} and {file_mock.name} are the same')
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_data(self):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    path = os.path.join(
        dir_path, '../../../fixtures/chunantes/datasamples/train/0024308')

    data_sample = DataSample.objects.create(path=path)
    self.assertEqual(data_sample.checksum, get_dir_hash(path))
    self.assertFalse(data_sample.validated)
    self.assertIn(f'key {data_sample.key}', str(data_sample))
    self.assertIn(f'validated {data_sample.validated}', str(data_sample))
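# A hedged sketch of the DataSample model behaviour this test relies on. The
# field names come from the assertions above; the checksum-on-save logic and
# the __str__ format are assumptions, not the project's actual model code.
import uuid

from django.db import models


class DataSample(models.Model):
    key = models.UUIDField(primary_key=True, default=uuid.uuid4)
    path = models.CharField(max_length=500, blank=True)
    checksum = models.CharField(max_length=64, blank=True)
    validated = models.BooleanField(default=False)

    def save(self, *args, **kwargs):
        # assumed behaviour: derive the checksum from the directory content
        # the first time the sample is saved
        if not self.checksum and self.path:
            self.checksum = get_dir_hash(self.path)
        super().save(*args, **kwargs)

    def __str__(self):
        return f'DataSample with key {self.key}, validated {self.validated}'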
def get_default_datasample_data(self):
    expected_hash = get_dir_hash(self.data_file.file)
    self.data_file.file.seek(0)
    data = {
        'file': self.data_file,
        'data_manager_keys': [get_hash(self.data_data_opener)],
        'test_only': True,
    }
    return expected_hash, data
def compute_data(self, request):
    data = {}

    # files, should be archives
    for k, file in request.FILES.items():
        pkhash = get_dir_hash(file)  # can raise
        # check the pkhash does not already belong to the batch
        try:
            existing = data[pkhash]
        except KeyError:
            pass
        else:
            raise Exception(
                f'Your data sample archives contain the same files, leading to the same pkhash. '
                f'Please review the content of your archives. '
                f'Archives {file} and {existing["file"]} are the same')
        data[pkhash] = {
            'pkhash': pkhash,
            'file': file,
        }

    # path/paths case
    path = request.POST.get('path', None)
    paths = request.POST.getlist('paths', [])

    if path and paths:
        raise Exception('Cannot use path and paths together.')
    if path is not None:
        paths = [path]

    # paths, should be directories
    for path in paths:
        if not os.path.isdir(path):
            raise Exception(
                f'One of your paths does not exist, is not a directory '
                f'or is not an absolute path: {path}')
        pkhash = dirhash(path, 'sha256')
        try:
            existing = data[pkhash]
        except KeyError:
            pass
        else:
            # existing can be a dict with a field path or file
            raise Exception(
                f'Your data sample directories contain the same files, '
                f'leading to the same pkhash. Invalid path: {path}.')
        data[pkhash] = {
            'pkhash': pkhash,
            'path': normpath(path),
        }

    if not data:  # no data sample provided
        raise Exception('No data sample provided.')

    return list(data.values())
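# get_dir_hash is called throughout these snippets on both directory paths
# (prepare_data_sample, test_create_data) and uploaded archive objects
# (compute_data and the tests). A minimal sketch consistent with both call
# sites, assuming the archive case hashes the uncompressed content; the real
# implementation and its supported archive formats may differ.
import io
import os
import tarfile
import tempfile
import zipfile

from checksumdir import dirhash


def get_dir_hash(archive_or_path):
    # directory case: hash the tree content directly
    if isinstance(archive_or_path, str) and os.path.isdir(archive_or_path):
        return dirhash(archive_or_path, 'sha256')

    # archive case: uncompress to a temporary directory, then hash that
    with tempfile.TemporaryDirectory() as temp_dir:
        content = archive_or_path.read()
        archive_or_path.seek(0)  # leave the stream as callers found it
        try:
            with zipfile.ZipFile(io.BytesIO(content)) as archive:
                archive.extractall(temp_dir)
        except zipfile.BadZipFile:
            # the tests also upload .tar.gz archives
            with tarfile.open(fileobj=io.BytesIO(content)) as archive:
                archive.extractall(temp_dir)
        if not os.listdir(temp_dir):
            raise Exception('Archive must contain at least one file')
        return dirhash(temp_dir, 'sha256')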