def default_arguments_explore(tmpdir_factory, utils: Utils) -> argparse.Namespace:
    temp_dir = tmpdir_factory.mktemp("data")
    utils.build_hd5s(temp_dir, pytest.MOCK_TMAPS.values(), n=pytest.N_TENSORS)
    hd5_dir = str(temp_dir)
    sys.argv = [
        ".",
        "explore",
        "--tensors",
        hd5_dir,
        "--output_folder",
        hd5_dir,
    ]
    args = parse_args()
    return args
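
# Minimal usage sketch (hypothetical, not part of the original suite), assuming
# default_arguments_explore is registered as a pytest fixture (e.g. decorated
# with @pytest.fixture in conftest.py). The attribute names `tensors` and
# `output_folder` match how the parsed namespace is used elsewhere in this module.
def test_default_explore_arguments(default_arguments_explore):
    args = default_arguments_explore
    # Both CLI flags were pointed at the same temporary HD5 directory.
    assert args.tensors == args.output_folder
    assert os.path.isdir(args.tensors)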
def test_pre_tensorize_explorations(temp_dir):
    # logging.disable(logging.CRITICAL)
    # Stage mock Bedmaster files and a patient CSV in the temp directory
    for file in os.listdir(pytest.bedmaster_dir):
        if file.startswith("bedmaster_file-"):
            copy2(os.path.join(pytest.bedmaster_dir, file), temp_dir)

    patient_csv = pd.DataFrame.from_dict({"MRN": [123, 456]})
    patient_csv.to_csv(os.path.join(temp_dir, "patient_csv.csv"))

    # Run the pre-tensorization exploration CLI with detailed Bedmaster stats
    sys.argv = [
        ".",
        "pre_tensorize_explore",
        "--bedmaster",
        temp_dir,
        "--edw",
        pytest.edw_dir,
        "--xref",
        pytest.cross_ref_file,
        "--output_folder",
        temp_dir,
        "--detailed_bedmaster",
        "--patient_csv",
        os.path.join(temp_dir, "patient_csv.csv"),
    ]
    args = parse_args()
    run(args)

    # Every expected summary CSV should be produced alongside the patient CSV
    expected_files = [
        f"{args.summary_stats_base_name}_edw_demographics.csv",
        f"{args.summary_stats_base_name}_coverage.csv",
        f"{args.summary_stats_base_name}_signals_summary.csv",
        f"{args.summary_stats_base_name}_bedmaster_signal_stats.csv",
        f"{args.summary_stats_base_name}_bedmaster_files_stats.csv",
        "patient_csv.csv",
    ]
    output_files = [
        file_name
        for file_name in os.listdir(args.output_folder)
        if file_name.endswith(".csv")
    ]
    assert sorted(expected_files) == sorted(output_files)

    edw_df = pd.read_csv(os.path.join(args.output_folder, expected_files[0]))
    cross_ref_df = pd.read_csv(os.path.join(args.output_folder, expected_files[1]))
    signals_df = pd.read_csv(os.path.join(args.output_folder, expected_files[2]))
    bedmaster_signals_df = pd.read_csv(
        os.path.join(args.output_folder, expected_files[3]),
        index_col=0,
    )
    bedmaster_files_df = pd.read_csv(
        os.path.join(args.output_folder, expected_files[4]),
    )

    # Spot-check the shape and columns of each report
    assert len(edw_df.index) == 9
    assert sorted(edw_df.columns) == sorted(
        ["field", "count", "min", "max", "mean", "total", "%"],
    )
    assert len(cross_ref_df.index) == 10
    assert sorted(cross_ref_df.columns) == sorted(["field", "count"])
    assert len(signals_df.index) == len(signals_df["signal"].unique())
    assert sorted(signals_df.columns) == sorted(
        ["signal", "count", "source", "total", "%"],
    )
    assert len(bedmaster_signals_df.index) == 11
    assert sorted(bedmaster_signals_df.columns) == sorted(
        [
            "channel",
            "total_overlap_bundles",
            "total_overlap_bundles_%",
            "files",
            "files_%",
            "source",
            "points",
            "min",
            "mean",
            "max",
            "dataevents",
            "sample_freq",
            "multiple_freq",
            "units",
            "scale_factor",
            "nan_on_time",
            "nan_on_time_%",
            "nan_on_values",
            "nan_on_values_%",
            "overlapped_points",
            "overlapped_points_%",
            "string_value_bundles",
            "defective_signal",
        ],
    )
    assert len(bedmaster_files_df.index) == 5
    assert sorted(bedmaster_files_df.columns) == sorted(["issue", "count", "count_%"])
def test_tensorizer(
    temp_dir,
    monkeypatch,
    test_scale_units: Dict[str, Dict[str, Union[int, float, str]]],
):
    monkeypatch.setattr("definitions.icu.ICU_SCALE_UNITS", test_scale_units)
    monkeypatch.setattr(
        "ingest.icu.tensorizer.ICU_SCALE_UNITS",
        test_scale_units,
    )
    monkeypatch.setattr(
        "tensorize.bedmaster.readers.ICU_SCALE_UNITS",
        test_scale_units,
    )

    test_dir = os.path.dirname(__file__)
    sys.argv = f"""
        .
        tensorize_icu_no_edw_pull
        --xref {test_dir}/data/xref_file_tensorize.csv
        --adt {test_dir}/data/edw/adt.csv
        --bedmaster {test_dir}/data/bedmaster
        --edw {test_dir}/data/edw
        --alarms {test_dir}/data/bedmaster_alarms
        --output_folder {temp_dir}/{pytest.run_id}
        --tensors {os.path.join(temp_dir, pytest.run_id)}
    """.split()
    args = parse_args()
    output_file = os.path.join(args.tensors, f"{pytest.example_mrn}.hd5")

    # Make sure the file doesn't exist
    with pytest.raises(OSError):
        with h5py.File(output_file, "r") as tens_file:
            pass

    # Tensorize and check hd5 structure
    tensorizer = Tensorizer(
        args.bedmaster,
        args.alarms,
        args.edw,
        args.xref,
        args.adt,
    )
    tensorizer.tensorize(
        tensors=args.tensors,
        overwrite_hd5=args.overwrite,
        num_workers=args.num_workers,
        allow_one_source=args.allow_one_source,
    )

    with h5py.File(output_file, "r") as tens_file:
        assert sorted(list(tens_file.keys())) == ["bedmaster", "edw"]

        bedmaster_signals = {
            "waveform": ["i", "ii", "iii", "resp", "spo2", "v"],
            "vitals": ["co", "cuff", "hr", "spo2r", "spo2%"],
            "alarms": [
                "arrhy_suspend",
                "cpp_low",
                "hr_hi_156",
                "hr_hi_160",
                "hr_hi_165",
                "ppeak_high",
                "spo2_probe",
                "tachy",
                "tvexp_low",
            ],
        }
        edw_signals = {
            "med": [
                "aspirin_325_mg_tablet",
                (
                    "cefazolin_2_gram|50_ml_in_dextrose_iso-osmotic_"
                    "intravenous_piggyback"
                ),
                "lactated_ringers_iv_bolus",
                "norepinephrine_infusion_syringe_in_swfi_80_mcg|ml_cmpd_central_mgh",
                "sodium_chloride_0.9_%_intravenous_solution",
            ],
            "flowsheet": [
                "blood_pressure",
                "pulse",
                "r_phs_ob_bp_diastolic_outgoing",
                "r_phs_ob_bp_systolic_outgoing",
                "r_phs_ob_pulse_oximetry_outgoing",
            ],
            "labs": ["creatinine", "lactate_blood", "magnesium", "ph_arterial"],
            "surgery": ["colonoscopy", "coronary_artery_bypass_graft"],
            "procedures": ["hemodialysis", "hemodialysis_|_ultrafiltration"],
            "transfusions": [
                "transfuse_cryoprecipitate",
                "transfuse_platelets",
                "transfuse_red_blood_cells",
            ],
            "events": ["rapid_response_start", "code_start"],
        }

        assert tens_file["bedmaster"].attrs["completed"]
        assert tens_file["edw"].attrs["completed"]
        assert sorted(tens_file["bedmaster/345"].keys()) == sorted(
            list(bedmaster_signals.keys()),
        )
        assert sorted(tens_file["edw/345"].keys()) == sorted(list(edw_signals.keys()))

        bedmaster_attrs = [
            "channel",
            "name",
            "scale_factor",
            "source",
            "units",
        ]
        bedmaster_sig_keys = [
            "time",
            "time_corr_arr",
            "value",
            "samples_per_ts",
            "sample_freq",
        ]
        bedmaster_alarms_keys = ["duration", "start_date"]

        # Test units, scaling factor and sample_freq
        hr_dir = tens_file["bedmaster/345/vitals/hr"]
        assert hr_dir.attrs["units"] == "Bpm"
        assert hr_dir.attrs["scale_factor"] == 0.5
        expected_sf = np.array([(0.5, 0)], dtype="float, int")
        assert np.array_equal(hr_dir["sample_freq"], expected_sf)

        ecg_ii = tens_file["bedmaster/345/waveform/ii"]
        assert ecg_ii.attrs["units"] == "mV"
        assert ecg_ii.attrs["scale_factor"] == 0.0243
        expected_sf = np.array(
            [(240.0, 0), (120.0, 80), (240.0, 5760), (120.0, 5808)],
            dtype="float, int",
        )
        assert np.array_equal(ecg_ii["sample_freq"], expected_sf)

        for sig_type, signals in bedmaster_signals.items():
            sig_type_dir = tens_file["bedmaster/345"][sig_type]
            assert sorted(sig_type_dir.keys()) == sorted(bedmaster_signals[sig_type])
            for signal in signals:
                # Test interbundle correction
                if sig_type == "vitals":
                    assert sorted(sig_type_dir[signal].keys()) == sorted(
                        bedmaster_sig_keys,
                    )
                    assert sorted(sig_type_dir[signal].attrs.keys()) == sorted(
                        bedmaster_attrs,
                    )
                    diff = np.diff(sig_type_dir[signal]["time"][()])
                    if signal == "hr":
                        # Signal has a data event 1
                        assert np.array_equal(np.where(diff != 2)[0], np.array([18]))
                    else:
                        # Test signals concatenated
                        assert len(sig_type_dir[signal]["time"]) == 20
                        assert all(diff == 2)
                elif sig_type == "waveform":
                    assert sorted(sig_type_dir[signal].keys()) == sorted(
                        bedmaster_sig_keys,
                    )
                    assert sorted(sig_type_dir[signal].attrs.keys()) == sorted(
                        bedmaster_attrs,
                    )
                    diff = np.diff(sig_type_dir[signal]["time"][:3349])
                    if signal == "v":
                        assert np.array_equal(
                            np.where(diff != 0.25)[0],
                            np.array([113]),
                        )
                    else:
                        assert all(diff == 0.25)
                    length = 32 if signal == "resp" else 160
                    assert len(sig_type_dir[signal]["time"]) == length
                else:
                    assert sorted(sig_type_dir[signal].keys()) == sorted(
                        bedmaster_alarms_keys,
                    )

        for key in edw_signals:
            assert sorted(tens_file[f"edw/345/{key}"].keys()) == sorted(
                edw_signals[key],
            )
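
# Illustrative helper (an assumption, not part of the tested code base): reads a
# Bedmaster vital's timestamps and values back out of a tensorized HD5 file,
# using only the layout asserted above
# (bedmaster/<visit_id>/vitals/<signal> with "time" and "value" datasets).
def _read_bedmaster_vital(hd5_path: str, visit_id: str, signal: str):
    with h5py.File(hd5_path, "r") as hd5:
        signal_group = hd5[f"bedmaster/{visit_id}/vitals/{signal}"]
        return signal_group["time"][()], signal_group["value"][()]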
def test_independent_args(temp_dir):
    for file in os.listdir(pytest.bedmaster_dir):
        if file.startswith("bedmaster_file-"):
            copy2(os.path.join(pytest.bedmaster_dir, file), temp_dir)

    def _get_output_files():
        return [
            file_name
            for file_name in os.listdir(temp_dir)
            if file_name.endswith(".csv")
        ]

    def _reset_dir():
        for file in os.listdir(temp_dir):
            if file.endswith(".csv"):
                os.remove(os.path.join(temp_dir, file))

    patient_csv = pd.DataFrame.from_dict({"MRN": [123, 456]})
    patient_csv.to_csv(os.path.join(temp_dir, "patient_csv.csv"))

    base_name = "pre_tensorize"
    expected_files = [
        f"{base_name}_edw_demographics.csv",
        f"{base_name}_coverage.csv",
        f"{base_name}_signals_summary.csv",
        "patient_csv.csv",
        f"{base_name}_bedmaster_signal_stats.csv",
        f"{base_name}_bedmaster_files_stats.csv",
    ]

    # Test standard
    sys.argv = [
        ".",
        "pre_tensorize_explore",
        "--edw",
        pytest.edw_dir,
        "--xref",
        pytest.cross_ref_file,
        "--bedmaster",
        temp_dir,
        "--output_folder",
        temp_dir,
        "--patient_csv",
        os.path.join(temp_dir, "patient_csv.csv"),
    ]
    _reset_dir()
    parsed_args = parse_args()
    run(parsed_args)
    assert sorted(expected_files[:4]) == sorted(_get_output_files())

    # Test Bedmaster detailed
    sys.argv.append("--detailed_bedmaster")
    parsed_args = parse_args()
    _reset_dir()
    run(parsed_args)
    assert sorted(expected_files) == sorted(_get_output_files())

    # Test just Bedmaster
    sys.argv.append("--no_xref")
    parsed_args = parse_args()
    _reset_dir()
    run(parsed_args)
    assert sorted(expected_files[3:]) == sorted(_get_output_files())

    # Test no_xref and no detailed
    sys.argv.remove("--detailed_bedmaster")
    _reset_dir()
    parsed_args = parse_args()
    run(parsed_args)
    assert sorted(expected_files[:4]) == sorted(_get_output_files())

    logging.disable(logging.NOTSET)