예제 #1
0
def default_arguments_explore(tmpdir_factory,
                              utils: Utils) -> argparse.Namespace:
    """Return parsed ``explore`` arguments pointing at freshly built HD5 files.

    A temporary ``data`` directory is created and populated through
    ``utils.build_hd5s``; the same directory is then passed as both
    ``--tensors`` and ``--output_folder``.

    :param tmpdir_factory: object whose ``mktemp`` creates the temporary directory
    :param utils: helper object exposing ``build_hd5s``
    :return: the namespace returned by ``parse_args``
    """
    data_dir = tmpdir_factory.mktemp("data")
    utils.build_hd5s(data_dir, pytest.MOCK_TMAPS.values(), n=pytest.N_TENSORS)
    data_path = str(data_dir)

    # parse_args() is called without arguments, so the command line is
    # installed in sys.argv beforehand (presumably where it is read from).
    sys.argv = [
        ".", "explore",
        "--tensors", data_path,
        "--output_folder", data_path,
    ]
    return parse_args()
예제 #2
0
def test_pre_tensorize_explorations(temp_dir):
    """Run ``pre_tensorize_explore`` with detailed Bedmaster stats and check its CSVs.

    The test copies every ``bedmaster_file-*`` file from ``pytest.bedmaster_dir``
    into ``temp_dir``, writes a two-row patient CSV, runs the pipeline with
    ``--detailed_bedmaster`` and then verifies:

    * exactly the expected set of ``.csv`` files exists in the output folder;
    * each summary CSV has the expected number of rows and set of columns.

    :param temp_dir: directory used as Bedmaster input and as output folder
    """
    for file in os.listdir(pytest.bedmaster_dir):
        if file.startswith("bedmaster_file-"):
            copy2(os.path.join(pytest.bedmaster_dir, file), temp_dir)

    patient_csv_path = os.path.join(temp_dir, "patient_csv.csv")
    pd.DataFrame.from_dict({"MRN": [123, 456]}).to_csv(patient_csv_path)

    sys.argv = [
        ".",
        "pre_tensorize_explore",
        "--bedmaster",
        temp_dir,
        "--edw",
        pytest.edw_dir,
        "--xref",
        pytest.cross_ref_file,
        "--output_folder",
        temp_dir,
        "--detailed_bedmaster",
        "--patient_csv",
        patient_csv_path,
    ]
    args = parse_args()
    run(args)

    # Name each output file explicitly so that reading them below does not
    # depend on the position of an entry inside a list.
    base_name = args.summary_stats_base_name
    edw_file = f"{base_name}_edw_demographics.csv"
    coverage_file = f"{base_name}_coverage.csv"
    signals_file = f"{base_name}_signals_summary.csv"
    bedmaster_signals_file = f"{base_name}_bedmaster_signal_stats.csv"
    bedmaster_files_file = f"{base_name}_bedmaster_files_stats.csv"
    expected_files = [
        edw_file,
        coverage_file,
        signals_file,
        bedmaster_signals_file,
        bedmaster_files_file,
        "patient_csv.csv",  # input file written above; it lives in the output folder too
    ]

    output_files = [
        file_name
        for file_name in os.listdir(args.output_folder)
        if file_name.endswith(".csv")
    ]
    assert sorted(expected_files) == sorted(output_files)

    def _read_output(file_name, **read_csv_kwargs):
        # Load one of the generated CSVs from the output folder.
        return pd.read_csv(
            os.path.join(args.output_folder, file_name),
            **read_csv_kwargs,
        )

    edw_df = _read_output(edw_file)
    coverage_df = _read_output(coverage_file)
    signals_df = _read_output(signals_file)
    bedmaster_signals_df = _read_output(bedmaster_signals_file, index_col=0)
    bedmaster_files_df = _read_output(bedmaster_files_file)

    assert len(edw_df.index) == 9
    assert sorted(edw_df.columns) == sorted(
        ["field", "count", "min", "max", "mean", "total", "%"],
    )

    assert len(coverage_df.index) == 10
    assert sorted(coverage_df.columns) == sorted(["field", "count"])

    # Every row of the signals summary must describe a distinct signal.
    assert len(signals_df.index) == len(signals_df["signal"].unique())
    assert sorted(signals_df.columns) == sorted(
        ["signal", "count", "source", "total", "%"],
    )

    assert len(bedmaster_signals_df.index) == 11
    assert sorted(bedmaster_signals_df.columns) == sorted(
        [
            "channel",
            "total_overlap_bundles",
            "total_overlap_bundles_%",
            "files",
            "files_%",
            "source",
            "points",
            "min",
            "mean",
            "max",
            "dataevents",
            "sample_freq",
            "multiple_freq",
            "units",
            "scale_factor",
            "nan_on_time",
            "nan_on_time_%",
            "nan_on_values",
            "nan_on_values_%",
            "overlapped_points",
            "overlapped_points_%",
            "string_value_bundles",
            "defective_signal",
        ],
    )

    assert len(bedmaster_files_df.index) == 5
    assert sorted(bedmaster_files_df.columns) == sorted(["issue", "count", "count_%"])
예제 #3
0
def test_tensorizer(
    temp_dir,
    monkeypatch,
    test_scale_units: Dict[str, Dict[str, Union[int, float, str]]],
):
    """Tensorize the test data set and verify the structure of the resulting HD5.

    The test patches ``ICU_SCALE_UNITS`` in three modules with
    ``test_scale_units``, checks that the output file does not exist yet, runs
    ``Tensorizer.tensorize`` and then inspects the generated file: top-level
    groups, completion flags, signal names per signal type, per-signal
    datasets/attributes, units, scale factors, sample frequencies and the
    spacing of the time arrays.

    :param temp_dir: base directory under which the run folder is created
    :param monkeypatch: pytest ``monkeypatch`` fixture
    :param test_scale_units: replacement value for ``ICU_SCALE_UNITS``
    """
    for target in (
        "definitions.icu.ICU_SCALE_UNITS",
        "ingest.icu.tensorizer.ICU_SCALE_UNITS",
        "tensorize.bedmaster.readers.ICU_SCALE_UNITS",
    ):
        monkeypatch.setattr(target, test_scale_units)

    test_dir = os.path.dirname(__file__)
    # Build argv as an explicit list: splitting a formatted string on
    # whitespace would break any path that contains a space.
    sys.argv = [
        ".",
        "tensorize_icu_no_edw_pull",
        "--xref",
        f"{test_dir}/data/xref_file_tensorize.csv",
        "--adt",
        f"{test_dir}/data/edw/adt.csv",
        "--bedmaster",
        f"{test_dir}/data/bedmaster",
        "--edw",
        f"{test_dir}/data/edw",
        "--alarms",
        f"{test_dir}/data/bedmaster_alarms",
        "--output_folder",
        f"{temp_dir}/{pytest.run_id}",
        "--tensors",
        os.path.join(temp_dir, pytest.run_id),
    ]

    args = parse_args()
    output_file = os.path.join(args.tensors, f"{pytest.example_mrn}.hd5")

    # Make sure the file doesn't exist
    with pytest.raises(OSError):
        with h5py.File(output_file, "r"):
            pass

    # Tensorize and check hd5 structure
    tensorizer = Tensorizer(
        args.bedmaster,
        args.alarms,
        args.edw,
        args.xref,
        args.adt,
    )
    tensorizer.tensorize(
        tensors=args.tensors,
        overwrite_hd5=args.overwrite,
        num_workers=args.num_workers,
        allow_one_source=args.allow_one_source,
    )

    with h5py.File(output_file, "r") as tens_file:
        assert sorted(tens_file.keys()) == ["bedmaster", "edw"]

        # Expected signal names, keyed by signal type.
        bedmaster_signals = {
            "waveform": ["i", "ii", "iii", "resp", "spo2", "v"],
            "vitals": ["co", "cuff", "hr", "spo2r", "spo2%"],
            "alarms": [
                "arrhy_suspend",
                "cpp_low",
                "hr_hi_156",
                "hr_hi_160",
                "hr_hi_165",
                "ppeak_high",
                "spo2_probe",
                "tachy",
                "tvexp_low",
            ],
        }
        edw_signals = {
            "med": [
                "aspirin_325_mg_tablet",
                ("cefazolin_2_gram|50_ml_in_dextrose_iso-osmotic_"
                 "intravenous_piggyback"),
                "lactated_ringers_iv_bolus",
                "norepinephrine_infusion_syringe_in_swfi_80_mcg|ml_cmpd_central_mgh",
                "sodium_chloride_0.9_%_intravenous_solution",
            ],
            "flowsheet": [
                "blood_pressure",
                "pulse",
                "r_phs_ob_bp_diastolic_outgoing",
                "r_phs_ob_bp_systolic_outgoing",
                "r_phs_ob_pulse_oximetry_outgoing",
            ],
            "labs":
            ["creatinine", "lactate_blood", "magnesium", "ph_arterial"],
            "surgery": ["colonoscopy", "coronary_artery_bypass_graft"],
            "procedures": ["hemodialysis", "hemodialysis_|_ultrafiltration"],
            "transfusions": [
                "transfuse_cryoprecipitate",
                "transfuse_platelets",
                "transfuse_red_blood_cells",
            ],
            "events": ["rapid_response_start", "code_start"],
        }

        assert tens_file["bedmaster"].attrs["completed"]
        assert tens_file["edw"].attrs["completed"]

        # NOTE(review): "345" is the only sub-group inspected under each
        # source; presumably an encounter/visit identifier - confirm.
        assert sorted(tens_file["bedmaster/345"].keys()) == sorted(
            bedmaster_signals)
        assert sorted(tens_file["edw/345"].keys()) == sorted(edw_signals)

        bedmaster_attrs = [
            "channel",
            "name",
            "scale_factor",
            "source",
            "units",
        ]
        bedmaster_sig_keys = [
            "time",
            "time_corr_arr",
            "value",
            "samples_per_ts",
            "sample_freq",
        ]
        bedmaster_alarms_keys = ["duration", "start_date"]

        # Test units, scaling factor and sample_freq
        hr_dir = tens_file["bedmaster/345/vitals/hr"]
        assert hr_dir.attrs["units"] == "Bpm"
        assert hr_dir.attrs["scale_factor"] == 0.5
        expected_sf = np.array([(0.5, 0)], dtype="float, int")
        assert np.array_equal(hr_dir["sample_freq"], expected_sf)

        ecg_ii = tens_file["bedmaster/345/waveform/ii"]
        assert ecg_ii.attrs["units"] == "mV"
        assert ecg_ii.attrs["scale_factor"] == 0.0243
        expected_sf = np.array(
            [(240.0, 0), (120.0, 80), (240.0, 5760), (120.0, 5808)],
            dtype="float, int",
        )
        assert np.array_equal(ecg_ii["sample_freq"], expected_sf)

        for sig_type, signals in bedmaster_signals.items():
            sig_type_dir = tens_file["bedmaster/345"][sig_type]
            assert sorted(sig_type_dir.keys()) == sorted(signals)
            for signal in signals:
                signal_dir = sig_type_dir[signal]

                if sig_type not in ("vitals", "waveform"):
                    # Alarms only carry their own pair of datasets.
                    assert sorted(signal_dir.keys()) == sorted(
                        bedmaster_alarms_keys)
                    continue

                # Vitals and waveforms share the same datasets and attributes.
                assert sorted(signal_dir.keys()) == sorted(bedmaster_sig_keys)
                assert sorted(signal_dir.attrs.keys()) == sorted(
                    bedmaster_attrs)

                # Test interbundle correction
                if sig_type == "vitals":
                    diff = np.diff(signal_dir["time"][()])
                    if signal == "hr":  # Signal has a data event 1
                        assert np.array_equal(
                            np.where(diff != 2)[0], np.array([18]))
                    else:
                        # Test signals concatenated
                        assert len(signal_dir["time"]) == 20
                        assert all(diff == 2)
                else:
                    diff = np.diff(signal_dir["time"][:3349])
                    if signal == "v":
                        assert np.array_equal(
                            np.where(diff != 0.25)[0],
                            np.array([113]),
                        )
                    else:
                        assert all(diff == 0.25)
                        length = 32 if signal == "resp" else 160
                        assert len(signal_dir["time"]) == length

        for key, expected_names in edw_signals.items():
            assert sorted(tens_file[f"edw/345/{key}"].keys()) == sorted(
                expected_names)
예제 #4
0
def test_independent_args(temp_dir):
    """Check which CSVs ``pre_tensorize_explore`` writes for several flag sets.

    Four scenarios are run in sequence against the same ``temp_dir``:
    the default flags, ``--detailed_bedmaster`` added, ``--no_xref`` added on
    top of that, and finally ``--no_xref`` without ``--detailed_bedmaster``.
    Before each run every ``.csv`` in ``temp_dir`` is deleted and the patient
    CSV is written again, so each assertion sees only that run's output.

    :param temp_dir: directory used as Bedmaster input and as output folder
    """
    bedmaster_names = [
        name
        for name in os.listdir(pytest.bedmaster_dir)
        if name.startswith("bedmaster_file-")
    ]
    for name in bedmaster_names:
        copy2(os.path.join(pytest.bedmaster_dir, name), temp_dir)

    patient_csv_path = os.path.join(temp_dir, "patient_csv.csv")

    def _csv_outputs():
        # Names of all CSV files currently present in temp_dir.
        return [
            entry for entry in os.listdir(temp_dir) if entry.endswith(".csv")
        ]

    def _restore_inputs():
        # Drop every CSV left by a previous run, then recreate the patient CSV.
        for stale in _csv_outputs():
            os.remove(os.path.join(temp_dir, stale))
        pd.DataFrame.from_dict({"MRN": [123, 456]}).to_csv(patient_csv_path)

    base_name = "pre_tensorize"
    summary_files = [
        f"{base_name}_edw_demographics.csv",
        f"{base_name}_coverage.csv",
        f"{base_name}_signals_summary.csv",
    ]
    patient_files = ["patient_csv.csv"]
    detailed_files = [
        f"{base_name}_bedmaster_signal_stats.csv",
        f"{base_name}_bedmaster_files_stats.csv",
    ]

    # Test standard
    sys.argv = [
        ".", "pre_tensorize_explore",
        "--edw", pytest.edw_dir,
        "--xref", pytest.cross_ref_file,
        "--bedmaster", temp_dir,
        "--output_folder", temp_dir,
        "--patient_csv", patient_csv_path,
    ]
    _restore_inputs()
    parsed_args = parse_args()
    run(parsed_args)
    assert sorted(summary_files + patient_files) == sorted(_csv_outputs())

    # Test Bedmaster detailed
    sys.argv.append("--detailed_bedmaster")
    parsed_args = parse_args()
    _restore_inputs()
    run(parsed_args)
    assert (
        sorted(summary_files + patient_files + detailed_files)
        == sorted(_csv_outputs())
    )

    # Test just Bedmaster
    sys.argv.append("--no_xref")
    parsed_args = parse_args()
    _restore_inputs()
    run(parsed_args)
    assert sorted(patient_files + detailed_files) == sorted(_csv_outputs())

    # Test no_xref and no detailed
    sys.argv.remove("--detailed_bedmaster")
    _restore_inputs()
    parsed_args = parse_args()
    run(parsed_args)
    assert sorted(summary_files + patient_files) == sorted(_csv_outputs())

    # Clear any logging.disable() threshold before leaving the test.
    logging.disable(logging.NOTSET)