def test_partial_dicom_download(initial_range_kb):
    """Partial download of DICOM files"""
    test_file_name = str(
        pathlib.Path(__file__).parent.absolute() / "test_data" / "sample.dcm"
    )
    bucket_name = "testbucket-12345"

    # Upload a file to S3
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    conn.meta.client.upload_file(test_file_name, bucket_name, "sample.dcm")

    s3client = S3Client(bucket=bucket_name)
    image_data = PartialDicom(
        s3client, "sample.dcm", initial_range_kb=initial_range_kb
    ).download()

    # Check the local file as if it was fully downloaded
    with open(test_file_name, "rb") as fd:
        tmp = BytesIO(fd.read())
    tmp.seek(0)
    image_data_nonpartial = pydicom.dcmread(tmp, stop_before_pixels=True)

    # Get the list of DICOM tags from both methods
    k1 = set(image_data.keys())
    k2 = set(image_data_nonpartial.keys())
    # Compare that the two methods result in the same set of tags
    assert k1 ^ k2 == set()
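# NOTE: these tests assume the boto3 calls are served by a mocked S3 backend,
# and that parametrized arguments (e.g. initial_range_kb above) come from
# pytest fixtures/decorators not shown in this excerpt. A typical setup
# (an assumption, not confirmed by this file; the parameter values below are
# hypothetical) would be moto:
#
#     from moto import mock_s3
#
#     @mock_s3
#     @pytest.mark.parametrize("initial_range_kb", [1, 20])
#     def test_partial_dicom_download(initial_range_kb):
#         ...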
def test_upload_text_data():
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    task = "upload"
    key = "test.txt"
    content = "1234567890" * 10

    # Inputs with the wrong task, a missing key, or missing content should
    # be passed through without uploading anything
    ignored_inputs = [
        ("something", key, content),
        (task, None, content),
        (task, key, None),
    ]
    for args in ignored_inputs:
        # Rebinds key, so the existence check can be skipped when it is None
        _, key, _ = args
        assert (
            warehouseloader.upload_text_data(*args, s3client=s3client)
            is bonobo.constants.NOT_MODIFIED
        )
        if key is not None:
            assert not s3client.object_exists(key)

    # Actual upload task
    args = task, key, content
    assert (
        warehouseloader.upload_text_data(*args, s3client=s3client)
        is bonobo.constants.NOT_MODIFIED
    )
    assert s3client.object_content(key).decode("utf-8") == content
def test_object_content():
    """Test object_content helper"""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)

    key = "test.json"
    content = "1234567890"
    conn.meta.client.put_object(Bucket=bucket_name, Key=key, Body=content)

    s3client = S3Client(bucket=bucket_name)
    test_content = s3client.object_content(key).decode("utf-8")
    assert test_content == content

    byte_count = 5
    partial_test_content = s3client.object_content(
        key, content_range=f"bytes=0-{byte_count-1}"
    ).decode("utf-8")
    assert partial_test_content == content[0:byte_count]

    byte_count = len(content) + 5
    oversized_test_content = s3client.object_content(
        key, content_range=f"bytes=0-{byte_count}"
    ).decode("utf-8")
    assert oversized_test_content == content
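# A note on the range arithmetic above: HTTP Range headers are inclusive on
# both ends, so "bytes=0-4" selects five bytes (content[0:5]), and a range
# running past the end of the object is clamped by S3, returning the full
# content. Illustration (hypothetical call, values as in the test above):
#
#     s3client.object_content("test.json", content_range="bytes=0-4")
#     # -> b"12345" for an object holding b"1234567890"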
def get_services(**options):
    """This function builds the services dictionary, which is a simple
    dict of names-to-implementation used by bonobo for runtime injection.

    It is applied on top of the defaults provided by bonobo (fs, http,
    ...). You can override those defaults, or define your own services;
    the naming is up to you.

    Returns
    -------
    dict
        Mapping of service names to objects.
    """
    if BUCKET_NAME is None:
        return {
            "config": None,
            "filelist": None,
            "s3client": None,
        }

    config = PipelineConfig()
    inv_downloader = InventoryDownloader(main_bucket=BUCKET_NAME)
    filelist = FileList(inv_downloader)
    s3client = S3Client(bucket=BUCKET_NAME)
    return {
        "config": config,
        "filelist": filelist,
        "s3client": s3client,
    }
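# For context: bonobo injects these services into graph nodes by name at
# runtime; the end-to-end tests below build and pass an equivalent dict
# explicitly, e.g.:
#
#     bonobo.run(warehouseloader.get_graph(), services=get_services())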
def test_put_object(key):
    """Test put_object helper"""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)

    content = "1234567890" * 10
    s3client = S3Client(bucket=bucket_name)
    s3client.put_object(key, content=content)
    assert s3client.object_content(key).decode("utf-8") == content
def test_copy_object(old_key, new_key):
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)

    content = "1234567890" * 10
    s3client = S3Client(bucket=bucket_name)
    s3client.put_object(old_key, content=content)

    assert not s3client.object_exists(new_key)
    s3client.copy_object(old_key, new_key)
    assert (
        s3client.object_content(old_key).decode("utf-8")
        == s3client.object_content(new_key).decode("utf-8")
    )
def test_object_exists(key, create):
    """Testing the object_exists function"""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    if create:
        conn.meta.client.upload_fileobj(BytesIO(), bucket_name, key)

    # Set up client
    s3client = S3Client(bucket=bucket_name)
    assert s3client.bucket == bucket_name
    assert s3client.object_exists(key) == create
def test_submittingcentres_extract_raw_data_files():
    """Test the submittingcentres extract_raw_data_files function."""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    config = PipelineConfig()
    input_config = {
        "raw_prefixes": [
            "raw-nhs-upload/",
        ],
        "training_percentage": 0,
        "sites": {
            "split": [],
            "training": [],
            "validation": [],
        },
    }
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)
    )
    next(warehouseloader.load_config(s3client, config))

    target_files = [
        "raw-nhs-upload/2021-01-31/data/Covid1_data.json",
        "raw-nhs-upload/2021-01-31/data/Covid2_status.json",
        "raw-nhs-upload/2021-02-28/data/Covid3_data.json",
        "raw-nhs-upload/2021-02-28/data/Covid4_status.json",
    ]
    # Files that should not be picked up: wrong prefixes, nested folders,
    # or already-processed copies
    extra_files = [
        "raw-nhs-upload/2021-03-01/data/Covid1_data.json",
        "test/Covid5_data.json",
        "raw-nhs-upload/age-0/2021-03-01/data/Covid6_data.json",
        "raw-elsewhere-upload/2021-03-01/data/Covid7_data.json",
        "training/data/Covid1/data_2021-01-31.json",
    ]
    create_inventory(target_files + extra_files, bucket_name)
    inv_downloader = InventoryDownloader(main_bucket=bucket_name)
    filelist = FileList(inv_downloader)

    result_list = list(
        submittingcentres.extract_raw_data_files(config, filelist)
    )
    keys_list = sorted([key for _, key, _ in result_list])
    assert keys_list == sorted(target_files)
def test_data_copy(task, old_key, new_key):
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    content = "1234567890" * 10
    s3client.put_object(key=old_key, content=content)

    args = task, old_key, new_key
    warehouseloader.data_copy(*args, s3client=s3client)
    if task == "copy":
        assert s3client.object_content(old_key) == s3client.object_content(
            new_key
        )
    else:
        assert not s3client.object_exists(new_key)
def test_get_object():
    """Test get_object helper"""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)

    key = "test.json"
    content = "1234567890"
    conn.meta.client.put_object(Bucket=bucket_name, Key=key, Body=content)

    s3client = S3Client(bucket=bucket_name)
    test_content = s3client.get_object(key)["Body"].read().decode("utf-8")
    assert test_content == content

    # A nonexistent key surfaces as a ClientError
    with pytest.raises(ClientError):
        helpers.get_submitting_centre_from_key(s3client, key + ".bak")
def test_load_config():
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    config = PipelineConfig()
    input_config = {
        "raw_prefixes": [
            "raw-nhs-upload/",
        ],
        "training_percentage": 10,
        "sites": {
            "split": ["Centre1", "Centre2"],
            "training": ["Centre3"],
            "validation": ["Centre4"],
        },
    }
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)
    )
    # This function yields, so have to iterate
    next(warehouseloader.load_config(s3client, config))

    assert config.get_raw_prefixes() == set(input_config["raw_prefixes"])
    assert config.get_training_percentage() == 10
    assert config.get_site_group("Centre1") == "split"
    assert config.get_site_group("Centre2") == "split"
    assert config.get_site_group("Centre3") == "training"
    assert config.get_site_group("Centre4") == "validation"
    assert config.get_site_group("CentreX") is None

    # Invalid configuration: truncated JSON should fail to parse
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)[:-5]
    )
    with pytest.raises(json.decoder.JSONDecodeError):
        # This function yields, so have to iterate
        next(warehouseloader.load_config(s3client, config))
def test_dataprocess_load_clinical_files(clinical_files):
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    # Files per type
    pseudonym = "Covid1234"
    group = "training"
    file_count = 4
    filenames = []
    base_path = f"{group}/data/{pseudonym}/"
    for item in range(1, file_count + 1):
        date = f"2021-03-{item:02d}"
        if clinical_files == "mixed":
            file_type = "data" if item % 2 == 0 else "status"
        else:
            file_type = clinical_files
        filename = f"{file_type}_{date}.json"
        filenames += [filename]
        content = json.dumps({"Pseudonym": pseudonym, "key": f"value{item}"})
        conn.meta.client.put_object(
            Bucket=bucket_name, Key=base_path + filename, Body=content
        )

    target_result = {
        "filename_earliest_date": datetime.date(2021, 3, 1),
        "filename_latest_date": datetime.date(2021, 3, 4),
        "filename_covid_status": clinical_files in {"data", "mixed"},
        "last_modified": datetime.date.today(),
        "group": group,
        "Pseudonym": pseudonym,
        "key": "value4",
    }

    data = {"group": group, "files": filenames}
    args = pseudonym, data
    result = next(dataprocess.load_clinical_files(*args, s3client=s3client))
    assert result[0] == "patient"
    assert result[1] == target_result
def test_upload_file(tmp_path):
    """Test upload_file helper"""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    d = tmp_path / "upload"
    d.mkdir()
    p = d / "hello.txt"
    content = "1234567890" * 10
    p.write_text(content)

    # Uploading a test file
    key = "testfile"
    s3client.upload_file(key, str(p))
    assert s3client.object_content(key).decode("utf-8") == content

    # Trying to upload a nonexistent file
    with pytest.raises(FileNotFoundError):
        s3client.upload_file(key, str(d / "nonexistent.txt"))
def test_get_submitting_centre():
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    key_valid = "valid.json"
    centre = "TestCentre"
    content_valid = json.dumps(
        {"Pseudonym": "Covid123", "SubmittingCentre": centre}
    )
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=key_valid, Body=content_valid
    )
    assert (
        helpers.get_submitting_centre_from_key(s3client, key_valid) == centre
    )

    key_missing = "missing.json"
    content_missing = json.dumps({"Pseudonym": "Covid123"})
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=key_missing, Body=content_missing
    )
    assert (
        helpers.get_submitting_centre_from_key(s3client, key_missing) is None
    )

    key_invalid = "invalid.json"
    content_invalid = json.dumps({"Pseudonym": "Covid123"})[:-5]
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=key_invalid, Body=content_invalid
    )
    with pytest.raises(json.decoder.JSONDecodeError):
        helpers.get_submitting_centre_from_key(s3client, key_invalid)

    with pytest.raises(ClientError):
        helpers.get_submitting_centre_from_key(s3client, key_valid + ".bak")
def test_warehouseloader_e2e(
    clinical_centre, config_centre, config_group, final_location
):
    """Full pipeline run of the warehouse loader.

    Single image file, checking that processing and copying go to the
    right place.
    """
    test_file_name = (
        "1.3.6.1.4.1.11129.5.5.110503645592756492463169821050252582267888.dcm"
    )
    test_file_path = str(
        pathlib.Path(__file__).parent.absolute() / "test_data" / test_file_name
    )
    patient_id = "Covid0000"
    study_id = (
        "1.3.6.1.4.1.11129.5.5.112507010803284478207522016832191866964708"
    )
    series_id = (
        "1.3.6.1.4.1.11129.5.5.112630850362182468372440828755218293352329"
    )

    # Test setup
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    config = PipelineConfig()
    input_config = {
        "raw_prefixes": [
            "raw-nhs-upload/",
        ],
        "training_percentage": 0,
        "sites": {
            "split": [],
            "training": [],
            "validation": [],
        },
    }
    input_config["sites"][config_group] += [config_centre]
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)
    )
    next(warehouseloader.load_config(s3client, config))

    # Upload the test image
    image_file = f"raw-nhs-upload/2021-03-01/images/{test_file_name}"
    conn.meta.client.upload_file(test_file_path, bucket_name, image_file)

    clinical_files = [
        "raw-nhs-upload/2021-02-15/data/Covid0000_status.json",
        "raw-nhs-upload/2021-03-01/data/Covid0000_data.json",
    ]
    file_content = json.dumps(
        {"Pseudonym": patient_id, "SubmittingCentre": clinical_centre}
    )
    for clinical_file in clinical_files:
        conn.meta.client.put_object(
            Bucket=bucket_name, Key=clinical_file, Body=file_content
        )

    target_files = [image_file] + clinical_files
    create_inventory(target_files, bucket_name)
    inv_downloader = InventoryDownloader(main_bucket=bucket_name)
    filelist = FileList(inv_downloader)
    patientcache = PatientCache(inv_downloader)

    services = {
        "config": config,
        "filelist": filelist,
        "patientcache": patientcache,
        "s3client": s3client,
    }
    bonobo.run(warehouseloader.get_graph(), services=services)

    if final_location is not None:
        # Image copied to the right place
        image_key = f"{final_location}/xray/{patient_id}/{study_id}/{series_id}/{test_file_name}"
        assert s3client.object_exists(image_key)

        # DICOM tags are extracted
        json_key = f"{final_location}/xray-metadata/{patient_id}/{study_id}/{series_id}/{test_file_name.replace('dcm', 'json')}"
        assert s3client.object_exists(json_key)
        with open(test_file_path.replace("dcm", "json"), "r") as f:
            test_json = f.read().replace("\n", "")
        assert s3client.object_content(json_key).decode("utf-8") == test_json

        # Clinical files copied to the right place
        clinical_file_status = (
            f"{final_location}/data/{patient_id}/status_2021-02-15.json"
        )
        assert s3client.object_exists(clinical_file_status)
        clinical_file_data = (
            f"{final_location}/data/{patient_id}/data_2021-03-01.json"
        )
        assert s3client.object_exists(clinical_file_data)
    else:
        # Nothing should have been copied to either group
        for group in ["training", "validation"]:
            image_key = f"{group}/xray/{patient_id}/{study_id}/{series_id}/{test_file_name}"
            assert not s3client.object_exists(image_key)
            json_key = f"{group}/xray-metadata/{patient_id}/{study_id}/{series_id}/{test_file_name.replace('dcm', 'json')}"
            assert not s3client.object_exists(json_key)
            clinical_file_status = (
                f"{group}/data/{patient_id}/status_2021-02-15.json"
            )
            assert not s3client.object_exists(clinical_file_status)
            clinical_file_data = (
                f"{group}/data/{patient_id}/data_2021-03-01.json"
            )
            assert not s3client.object_exists(clinical_file_data)
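# Warehouse layout exercised by the e2e test above (as asserted by its keys):
#
#     <group>/xray/<patient>/<study>/<series>/<file>.dcm
#     <group>/xray-metadata/<patient>/<study>/<series>/<file>.json
#     <group>/data/<patient>/<data|status>_<date>.json
#
# where <group> is "training" or "validation".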
def test_warehouseloader_extract_raw_files():
    """Test the warehouseloader extract_raw_files_from_folder function."""
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    config = PipelineConfig()
    input_config = {
        "raw_prefixes": [
            "raw-nhs-upload/",
        ],
        "training_percentage": 0,
        "sites": {
            "split": [],
            "training": [],
            "validation": [],
        },
    }
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)
    )
    next(warehouseloader.load_config(s3client, config))

    uuids = [pydicom.uid.generate_uid() for _ in range(4)]
    target_files = [
        "raw-nhs-upload/2021-01-31/data/Covid1_data.json",
        "raw-nhs-upload/2021-01-31/data/Covid2_status.json",
        "raw-nhs-upload/2021-02-28/data/Covid3_data.json",
        "raw-nhs-upload/2021-02-28/data/Covid4_status.json",
        f"raw-nhs-upload/2021-02-28/images/{uuids[0]}.dcm",
        f"raw-nhs-upload/2021-02-28/images/{uuids[1]}.dcm",
        f"raw-nhs-upload/2021-02-28/images/{uuids[2]}.dcm",
    ]
    extra_files = [
        "raw-nhs-upload/2021-03-01/age-0/data/Covid6_data.json",
        f"raw-nhs-upload/2021-02-28/age-0/images/{pydicom.uid.generate_uid()}.dcm",
        "test/Covid5_data.json",
        "raw-elsewhere-upload/2021-03-01/data/Covid7_data.json",
        f"{TRAINING_PREFIX}data/Covid1/data_2021-01-31.json",
        # Only copied image, so list the relevant image for metadata
        f"{TRAINING_PREFIX}xray/Covid1/123/123/{uuids[1]}.dcm",
        # Only metadata, so list the relevant image
        f"{TRAINING_PREFIX}xray-metadata/Covid2/234/234/{uuids[2]}.json",
        # Skip this as it has both image copy and metadata
        f"raw-nhs-upload/2021-02-28/images/{uuids[3]}.dcm",
        f"{TRAINING_PREFIX}xray/Covid3/345/345/{uuids[3]}.dcm",
        f"{TRAINING_PREFIX}xray-metadata/Covid3/345/345/{uuids[3]}.json",
    ]
    create_inventory(target_files + extra_files, bucket_name)
    inv_downloader = InventoryDownloader(main_bucket=bucket_name)
    filelist = FileList(inv_downloader)

    result_list = list(
        warehouseloader.extract_raw_files_from_folder(config, filelist)
    )
    key_set = set([key for _, key, _ in result_list])
    assert key_set ^ set(target_files) == set()
def test_process_patient_data():
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    config = PipelineConfig()
    input_config = {
        "raw_prefixes": [
            "raw-nhs-upload/",
        ],
        "training_percentage": 0,
        "sites": {
            "split": ["SplitCentre"],
            "training": ["TrainingCentre"],
            "validation": ["ValidationCentre"],
        },
    }
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)
    )
    next(warehouseloader.load_config(s3client, config))

    processed_list = [
        "training/data/Covid1/data_2021-03-01.json",
        "training/data/Covida9a3751d-f614-4d4a-b3ee-c3f5ca1fb858/data_2021-03-01.json",
        "validation/data/Covid2/data_2021-03-01.json",
        "validation/data/Covida23f28da6-c470-4dd2-b432-1e17724715a2/data_2021-03-01.json",
    ]
    for key in processed_list:
        conn.meta.client.upload_fileobj(BytesIO(), bucket_name, key)

    raw_folder = "raw-nhs-upload/2021-03-01/data"
    clinical_records = [
        ("Covid10", "TrainingCentre"),
        ("Covid20", "ValidationCentre"),
        ("Covid30", "SplitCentre"),
        ("Covid40", "ExtraCentre"),
        ("Covid50", None),
    ]
    data_list = []
    for pseudonym, centre in clinical_records:
        for file_type in ["data", "status"]:
            if centre is not None:
                content = json.dumps(
                    {"Pseudonym": pseudonym, "SubmittingCentre": centre}
                )
            else:
                content = json.dumps({"Pseudonym": pseudonym})
            key = f"{raw_folder}/{pseudonym}_{file_type}.json"
            conn.meta.client.put_object(
                Bucket=bucket_name, Key=key, Body=content
            )
            data_list += [key]

    file_list = processed_list + data_list
    create_inventory(file_list, bucket_name)
    inv_downloader = InventoryDownloader(main_bucket=bucket_name)
    patientcache = PatientCache(inv_downloader)

    kwargs = {
        "config": config,
        "patientcache": patientcache,
        "s3client": s3client,
    }

    # Unhandled task
    args = "copy", "raw-nhs-upload/2021-03-01/data/Covid1.json", None
    assert (
        next(warehouseloader.process_patient_data(*args, **kwargs))
        is bonobo.constants.NOT_MODIFIED
    )

    # Unhandled filename
    args = (
        "process",
        f"raw-nhs-upload/2021-03-01/images/{pydicom.uid.generate_uid()}.dcm",
        None,
    )
    assert (
        next(warehouseloader.process_patient_data(*args, **kwargs))
        is bonobo.constants.NOT_MODIFIED
    )

    # Already processed file exists
    args = "process", "raw-nhs-upload/2021-03-01/data/Covid1_data.json", None
    with pytest.raises(StopIteration):
        next(warehouseloader.process_patient_data(*args, **kwargs))
    args = "process", "raw-nhs-upload/2021-03-01/data/Covid2_data.json", None
    with pytest.raises(StopIteration):
        next(warehouseloader.process_patient_data(*args, **kwargs))

    # Training item
    args = "process", "raw-nhs-upload/2021-03-01/data/Covid10_data.json", None
    assert next(warehouseloader.process_patient_data(*args, **kwargs)) == (
        "copy",
        "raw-nhs-upload/2021-03-01/data/Covid10_data.json",
        "training/data/Covid10/data_2021-03-01.json",
    )

    # Validation item
    args = (
        "process",
        "raw-nhs-upload/2021-03-01/data/Covid20_status.json",
        None,
    )
    assert next(warehouseloader.process_patient_data(*args, **kwargs)) == (
        "copy",
        "raw-nhs-upload/2021-03-01/data/Covid20_status.json",
        "validation/data/Covid20/status_2021-03-01.json",
    )

    # Split item, training percentage forcing it to validation
    args = "process", "raw-nhs-upload/2021-03-01/data/Covid30_data.json", None
    assert next(warehouseloader.process_patient_data(*args, **kwargs)) == (
        "copy",
        "raw-nhs-upload/2021-03-01/data/Covid30_data.json",
        "validation/data/Covid30/data_2021-03-01.json",
    )

    # Unknown submitting centre included
    args = "process", "raw-nhs-upload/2021-03-01/data/Covid40_data.json", None
    with pytest.raises(StopIteration):
        next(warehouseloader.process_patient_data(*args, **kwargs))

    # No submitting centre included
    args = "process", "raw-nhs-upload/2021-03-01/data/Covid50_data.json", None
    with pytest.raises(StopIteration):
        next(warehouseloader.process_patient_data(*args, **kwargs))
def test_submittingcentres_e2e(capsys):
    """Full pipeline run of the submitting centres test:

    * create files
    * add content
    * setup & run pipeline
    """
    bucket_name = "testbucket-12345"
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket_name)
    s3client = S3Client(bucket=bucket_name)

    config = PipelineConfig()
    input_config = {
        "raw_prefixes": [
            "raw-nhs-upload/",
        ],
        "training_percentage": 0,
        "sites": {
            "split": [],
            "training": [],
            "validation": [],
        },
    }
    conn.meta.client.put_object(
        Bucket=bucket_name, Key=CONFIG_KEY, Body=json.dumps(input_config)
    )
    next(warehouseloader.load_config(s3client, config))

    target_files = [
        "raw-nhs-upload/2021-01-31/data/Covid1_data.json",
        "raw-nhs-upload/2021-01-31/data/Covid2_status.json",
        "raw-nhs-upload/2021-02-28/data/Covid3_data.json",
        "raw-nhs-upload/2021-02-28/data/Covid4_status.json",
    ]
    extra_files = [
        "raw-nhs-upload/2021-03-01/data/Covid1_data.json",
        "test/Covid5_data.json",
        "raw-nhs-upload/age-0/2021-03-01/data/Covid6_data.json",
        "raw-elsewhere-upload/2021-03-01/data/Covid7_data.json",
        "training/data/Covid1/data_2021-01-31.json",
    ]
    centres = ["CentreA", "CentreB", "CentreA", "CentreC"]
    for target_file, centre in zip(target_files, centres):
        file_content = json.dumps({"SubmittingCentre": centre})
        conn.meta.client.put_object(
            Bucket=bucket_name, Key=target_file, Body=file_content
        )

    create_inventory(target_files + extra_files, bucket_name)
    inv_downloader = InventoryDownloader(main_bucket=bucket_name)
    filelist = FileList(inv_downloader)

    services = {
        "config": config,
        "filelist": filelist,
        "s3client": s3client,
    }
    bonobo.run(submittingcentres.get_graph(), services=services)

    # The pipeline writes the sorted, deduplicated list of centres to file
    with open("/tmp/message.txt", "r") as f:
        output = f.read().splitlines()
    assert output == ["CentreA", "CentreB", "CentreC"]