def apply_feature_filters_length_test():
    """Check ``apply_feature_filters`` when the only filter is a ``LengthFilter``.

    A six-sample capture is run against several pairs of length bounds; each
    case records whether the capture is expected to pass.
    """
    # capture -- mean: 1; stdv: 0; median: 1; min: 1; max: 1; len: 6
    capture = [1] * 6

    # (low bound, high bound, expected to pass?)
    cases = [
        (0, 6, True),  # pass (edge case, inclusive high)
        (6, 10, True),  # pass (edge case, inclusive low)
        (8, 10, False),  # fail (too short)
        (0, 5, False),  # fail (too long)
        (None, None, True),  # pass (no filter actually given)
    ]

    for low, high, should_pass in cases:
        length_only = [filtering.LengthFilter(low, high)]
        outcome = filtering.apply_feature_filters(capture, length_only)
        if should_pass:
            assert outcome
        else:
            assert not outcome
def find_captures_4_unfolded_terminal_test():
    """Example capture window contains 1 long terminal capture that has unfolded.

    Tests: find_captures returns 1 capture
    """
    signal = picoampere_signal_from_data_file(
        "src/tests/data/capture_windows/test_data_capture_window_4.txt.gz"
    )
    capture_window = Window(5_511_887, 5_604_585)

    # Keyword settings handed to find_captures.
    options = {
        "terminal_capture_only": True,
        "filters": [filtering.LengthFilter(100, None)],
        "delay": 0,
        "end_tol": 0,
    }

    found = segment.find_captures(
        signal,
        2,  # channel_number
        capture_window,
        0.7,  # signal_threshold_frac
        230,  # alt_open_channel_pA
        **options,
    )

    assert len(found) == 1
def find_captures_3_multicapture_nonterminal_test():
    """Example capture window contains 1 long terminal capture & 1 medium capture.

    Tests: find_captures returns exactly the 2 expected captures (one not
    ejected, one ejected) when terminal_capture_only = False, with exact
    (start, end, ejected) values for delay = 3.
    """
    data_file = "src/tests/data/capture_windows/test_data_capture_window_3.txt.gz"
    data = picoampere_signal_from_data_file(data_file)
    window = Window(1_187_841, 1_280_674)

    # Expected captures as (start, end, ejected).
    actual_captures = [(1_200_088, 1_201_033, False), (1_252_611, 1_280_674, True)]

    signal_threshold_frac = 0.7
    alt_open_channel_pA = 230
    terminal_capture_only = False
    filters = [filtering.LengthFilter(100, None)]
    delay = 3
    end_tol = 0
    channel_number = 2

    captures = segment.find_captures(
        data,
        channel_number,
        window,
        signal_threshold_frac,
        alt_open_channel_pA,
        terminal_capture_only=terminal_capture_only,
        filters=filters,
        delay=delay,
        end_tol=end_tol,
    )

    observed = [
        (capture.window.start, capture.window.end, capture.ejected) for capture in captures
    ]
    assert len(observed) == len(actual_captures)
    # Compare as sorted lists rather than by membership: a membership check
    # would also accept the same expected capture being reported twice.
    assert sorted(observed) == sorted(actual_captures)
def find_captures_3_multicapture_terminal_test():
    """Example capture window contains 1 long terminal capture & 2 medium/short captures.

    Tests: find_captures returns...
        1 capture when terminal_capture_only = True
    """
    source_path = "src/tests/data/capture_windows/test_data_capture_window_3.txt.gz"
    signal = picoampere_signal_from_data_file(source_path)
    capture_window = Window(1_187_841, 1_280_674)
    length_filters = [filtering.LengthFilter(100, None)]

    # Positional arguments, in the order find_captures is called with them.
    channel_number, threshold_frac, fallback_open_channel_pA = 2, 0.7, 230

    found = segment.find_captures(
        signal,
        channel_number,
        capture_window,
        threshold_frac,
        fallback_open_channel_pA,
        terminal_capture_only=True,
        filters=length_filters,
        delay=0,
        end_tol=0,
    )

    assert len(found) == 1
def find_captures_2_nocaptures_test():
    """Example capture window contains no captures.

    Test: find_captures returns no captures
    """
    signal = picoampere_signal_from_data_file(
        "src/tests/data/capture_windows/test_data_capture_window_2.txt.gz"
    )
    capture_window = Window(3_423_474, 3_516_439)

    # Keyword settings handed to find_captures.
    options = {
        "terminal_capture_only": False,
        "filters": [filtering.LengthFilter(100, None)],
        "delay": 10,
        "end_tol": 0,
    }

    found = segment.find_captures(
        signal,
        1,  # channel_number
        capture_window,
        0.7,  # signal_threshold_frac
        230,  # alt_open_channel_pA
        **options,
    )

    assert len(found) == 0
def find_captures_1_double_capture_noterminal_2_test():
    """Example capture window contains 2 long captures, neither terminal.

    Also contains a few short blips.

    Test: terminal_capture_only = False returns 2 captures
    """
    source_path = "src/tests/data/capture_windows/test_data_capture_window_1.txt.gz"
    signal = picoampere_signal_from_data_file(source_path)
    capture_window = Window(4_765_695, 4_858_482)
    length_filters = [filtering.LengthFilter(100, None)]

    # Positional arguments, in the order find_captures is called with them.
    channel_number, threshold_frac, fallback_open_channel_pA = 1, 0.7, 230

    found = segment.find_captures(
        signal,
        channel_number,
        capture_window,
        threshold_frac,
        fallback_open_channel_pA,
        terminal_capture_only=False,
        filters=length_filters,
        delay=0,
        end_tol=0,
    )

    assert len(found) == 2
def find_captures_0_single_capture_terminal_test():
    """Capture window 0, searched with terminal_capture_only = True.

    Tests: find_captures returns exactly one capture, and its
    (start, end, ejected) triple matches the expected ejected capture.
    """
    capture_window = Window(3_572_989, 3_665_680)
    signal = picoampere_signal_from_data_file(
        "src/tests/data/capture_windows/test_data_capture_window_0.txt.gz"
    )

    # Expected boundaries are written as offsets from the window start.
    expected = [(33822 + capture_window.start, 92691 + capture_window.start, True)]

    found = segment.find_captures(
        signal,
        1,  # channel_number
        capture_window,
        0.7,  # signal_threshold_frac
        230,  # alt_open_channel_pA
        terminal_capture_only=True,
        filters=[filtering.LengthFilter(100, None)],
        delay=0,
        end_tol=0,
    )

    assert len(found) == len(expected)
    for item in found:
        observed = (item.window.start, item.window.end, item.ejected)
        assert observed in expected
def segment_test(self):
    """Run ``segment.segment`` on the dummy bulk fast5 and inspect its output file.

    Expects a capture file named ``src/tests/{run_id}_1.fast5`` holding 5 read
    groups. For each read group, checks that the local and bulk start times
    agree, that the signal length equals the recorded duration, and that the
    recorded voltage equals the configured voltage threshold.

    The output file is removed in a ``finally`` clause so a failing assertion
    does not leave it behind in ``src/tests`` for other tests to find.
    """
    bulk_f5_fname = "src/tests/data/bulk_fast5_dummy.fast5"
    filters = [filtering.LengthFilter(100, None)]
    config = GeneralConfiguration(config={"n_workers": 2, "capture_directory": "src/tests"})
    segment_config = {
        "voltage_threshold": -180,
        "signal_threshold_frac": 0.7,
        "translocation_delay": 20,
        "open_channel_prior_mean": 220,
        "open_channel_prior_stdv": 50,
        "good_channels": [
            1,
            2,
            3,
        ],  # this will be internally overwritten by the good channels calculation, which should not include channel 2
        "end_tolerance": 50,
        "terminal_capture_only": False,
        "n_captures_per_file": 1000,
        "bulkfast5": bulk_f5_fname,
    }
    segment_config = SegmentConfiguration(segment_config)

    run_id = "d0befb838f5a9a966e3c559dc3a75a6612745849"
    actual_n_captures = 5
    n_captures = 0
    capture_f5_fname = f"src/tests/{run_id}_1.fast5"

    try:
        segment.segment(bulk_f5_fname, config, segment_config, overwrite=True, filters=filters)

        with h5py.File(capture_f5_fname, "r") as f5:
            for grp in f5.get("/"):
                # Only groups whose name contains "read" are counted as captures.
                if "read" not in grp:
                    continue
                n_captures += 1
                d = f5[grp]
                a = d["Signal"].attrs

                start_time_local = a.get("start_time_local")
                start_time_bulk = a.get("start_time_bulk")
                assert start_time_local == start_time_bulk  # No offset here

                duration = a.get("duration")
                len_signal = len(d["Signal"][()])
                assert len_signal == duration

                voltage = a.get("voltage")
                assert voltage == segment_config.voltage_threshold

                print(duration, a.get("channel_number"))

        assert n_captures == actual_n_captures
    finally:
        # Guarded so a missing output file does not mask the original failure.
        if os.path.exists(capture_f5_fname):
            os.remove(capture_f5_fname)
def parallel_find_captures_overflow_file_test(self):
    """Run ``segment.parallel_find_captures`` with ``n_captures_per_file`` = 2.

    Expects 3 files in ``src/tests/`` whose names contain the run id, holding
    5 read groups in total. For each read group, checks that the local and
    bulk start times agree, that the signal length equals the recorded
    duration, and that the recorded voltage equals the configured voltage
    threshold.

    Every file containing the run id is removed in a ``finally`` clause, so a
    failing assertion (including the file-count check) leaves nothing behind.
    """
    bulk_f5_fname = "src/tests/data/bulk_fast5_dummy.fast5"
    filters = [filtering.LengthFilter(100, None)]
    config = GeneralConfiguration(config={"n_workers": 2, "capture_directory": "src/tests"})
    segment_config = {
        "voltage_threshold": -180,
        "signal_threshold_frac": 0.7,
        "translocation_delay": 20,
        "open_channel_prior_mean": 220,
        "open_channel_prior_stdv": 50,
        "good_channels": [1, 3],
        "end_tolerance": 50,
        "terminal_capture_only": False,
        "n_captures_per_file": 2,
        "bulkfast5": bulk_f5_fname,
    }
    segment_config = SegmentConfiguration(segment_config)

    run_id = "d0befb838f5a9a966e3c559dc3a75a6612745849"
    actual_n_captures = 5
    n_captures = 0
    capture_dir = "src/tests/"

    try:
        segment.parallel_find_captures(config, segment_config, overwrite=True, filters=filters)

        capture_f5_fnames = [
            os.path.join(capture_dir, x) for x in os.listdir(capture_dir) if run_id in x
        ]
        assert len(capture_f5_fnames) == 3

        for capture_f5_fname in capture_f5_fnames:
            with h5py.File(capture_f5_fname, "r") as f5:
                for grp in f5.get("/"):
                    # Only groups whose name contains "read" are counted as captures.
                    if "read" not in grp:
                        continue
                    n_captures += 1
                    d = f5[grp]
                    a = d["Signal"].attrs

                    start_time_local = a.get("start_time_local")
                    start_time_bulk = a.get("start_time_bulk")
                    assert start_time_local == start_time_bulk  # No offset here

                    duration = a.get("duration")
                    len_signal = len(d["Signal"][()])
                    assert len_signal == duration

                    voltage = a.get("voltage")
                    assert voltage == segment_config.voltage_threshold

                    print(duration, a.get("channel_number"))

        assert n_captures == actual_n_captures
    finally:
        # Re-scan the directory instead of reusing the earlier listing, which
        # may never have been built if the segmentation call itself failed.
        for leftover in os.listdir(capture_dir):
            if run_id in leftover:
                os.remove(os.path.join(capture_dir, leftover))
def find_captures_7_capture_no_open_channel_test():
    """Example capture window contains 1 long terminal capture.

    Open pore region is extremely, extremely short.
    Test by cutting off the open pore region.

    Tests: find_captures returns 1 capture; open pore returns alt value.
    """
    source_path = "src/tests/data/capture_windows/test_data_capture_window_7.txt.gz"
    # Drop the first 100 samples to cut off the open pore region.
    signal = picoampere_signal_from_data_file(source_path)[100:]
    capture_window = Window(2_919_913, 3_013_723)
    length_filters = [filtering.LengthFilter(100, None)]

    found = segment.find_captures(
        signal,
        2,  # channel_number
        capture_window,
        0.7,  # signal_threshold_frac
        230,  # alt_open_channel_pA
        terminal_capture_only=False,
        filters=length_filters,
        delay=100,
        end_tol=0,
    )
    assert len(found) == 1

    # Per the docstring, the calculated open channel current should fall back
    # to the alt value passed above; a small absolute tolerance is allowed.
    expected_open_channel_pA = 230
    calculated = np.array([item.open_channel_pA_calculated for item in found])
    close_enough = np.isclose(calculated, expected_open_channel_pA, atol=0.5)
    all_currents_within_bounds = all(close_enough)
    assert (
        all_currents_within_bounds
    ), f"All captures should have calculated an open channel current close to {expected_open_channel_pA}."
def find_captures_5_unfolded_terminal_test():
    """Example capture window contains 1 long terminal capture.

    It was captured almost immediately, causing a very short open pore region.

    Tests: find_captures returns 1 capture
    """
    source_path = "src/tests/data/capture_windows/test_data_capture_window_5.txt.gz"
    signal = picoampere_signal_from_data_file(source_path)
    capture_window = Window(965_676, 1_059_216)
    length_filters = [filtering.LengthFilter(100, None)]

    found = segment.find_captures(
        signal,
        1,  # channel_number
        capture_window,
        0.7,  # signal_threshold_frac
        230,  # alt_open_channel_pA
        terminal_capture_only=True,
        filters=length_filters,
        delay=0,
        end_tol=0,
    )
    assert len(found) == 1

    calculated = np.array([item.open_channel_pA_calculated for item in found])

    # Rough check; the value should be ~229.05 & anything close is okay.
    # The bounds are exclusive and leave some tolerance for future changes.
    low_expected_open_channel_pA = 228.5
    high_expected_open_channel_pA = 230
    above_low = calculated > low_expected_open_channel_pA
    below_high = calculated < high_expected_open_channel_pA
    all_currents_within_bounds = all(np.logical_and(above_low, below_high))
    assert (
        all_currents_within_bounds
    ), f"Expect all capture open channel currents to be between '{low_expected_open_channel_pA}' and '{high_expected_open_channel_pA}'."
def find_captures_8_capture_no_open_channel_test():
    """Example capture window contains 2 captures: both long, 1 terminal.

    Test non-terminal long capture.

    Tests: find_captures returns 2 captures.
    Checks exact capture boundaries with delay = 3

    NOTE(review): the name mentions "no_open_channel", but nothing here trims
    the signal; the name is kept so test discovery is unchanged.
    """
    data_file = "src/tests/data/capture_windows/test_data_capture_window_8.txt.gz"
    data = picoampere_signal_from_data_file(data_file)
    window = Window(4_875_289, 4_969_337)
    signal_threshold_frac = 0.7
    alt_open_channel_pA = 230
    terminal_capture_only = False
    filters = [filtering.LengthFilter(100, None)]
    delay = 3
    end_tol = 0
    channel_number = 2

    captures = segment.find_captures(
        data,
        channel_number,
        window,
        signal_threshold_frac,
        alt_open_channel_pA,
        terminal_capture_only=terminal_capture_only,
        filters=filters,
        delay=delay,
        end_tol=end_tol,
    )
    assert len(captures) == 2

    # Expected captures as (start, end, ejected), written as offsets from the
    # window start.
    actual_captures = [
        (11310 + window.start, 22098 + window.start, False),
        (26617 + window.start, 94048 + window.start, True),
    ]
    observed = [
        (capture.window.start, capture.window.end, capture.ejected) for capture in captures
    ]
    # Compare as sorted lists rather than by membership: a membership check
    # would also accept the same expected capture being reported twice.
    assert sorted(observed) == sorted(actual_captures)
def find_captures_6_clog_no_open_channel_test():
    """Example capture window contains 1 long terminal capture.

    Open pore region is extremely, extremely short.
    Test by cutting off the open pore region.

    Tests: find_captures returns 1 capture; open pore returns alt value.
    """
    source_path = "src/tests/data/capture_windows/test_data_capture_window_6.txt.gz"
    # Drop the first 100 samples to cut off the open pore region.
    signal = picoampere_signal_from_data_file(source_path)[100:]
    capture_window = Window(2_769_436, 2_863_265)

    # Keyword settings handed to find_captures.
    options = {
        "terminal_capture_only": False,
        "filters": [filtering.LengthFilter(100, None)],
        "delay": 100,
        "end_tol": 0,
    }

    found = segment.find_captures(
        signal,
        1,  # channel_number
        capture_window,
        0.7,  # signal_threshold_frac
        230,  # alt_open_channel_pA
        **options,
    )
    assert len(found) == 1

    calculated = np.array([item.open_channel_pA_calculated for item in found])
    expected_open_channel_pA = 230
    close_enough = np.isclose(calculated, expected_open_channel_pA, atol=0.5)
    all_currents_within_bounds = all(close_enough)
    assert (
        all_currents_within_bounds
    ), f"All calculated open channel currents should be close to {expected_open_channel_pA}"