def test_not_constatnt_splitter():
    """Exercise ``NotZeroSplitter`` and ``NotConstantSplitter`` over ``SquareSplitter``.

    The input holds a run of three 1s followed by a run of four 2s, so the
    only change point is at index 3.  The expectations pinned below are:

    * ``NotZeroSplitter`` with the plain scorer factory -> ``[0, 3]``;
    * ``NotZeroSplitter`` with the greedy scorer factory -> every position;
    * ``NotConstantSplitter`` with the greedy scorer factory -> ``[0, 3]``,
      both with and without an explicit candidate array;
    * ``get_non_constant_split_candidates`` -> ``[0, 3]`` whether it is given
      ``None`` or ``[0, 3]`` as the candidate set.
    """
    counts = np.array([1, 1, 1, 2, 2, 2, 2])
    change_points = [0, 3]

    # NotZeroSplitter, plain scorer factory.
    not_zero = pasio.NotZeroSplitter(base_splitter=pasio.SquareSplitter())
    result = not_zero.split(counts, simple_scorer_factory)
    assert result[1] == change_points

    # NotZeroSplitter, greedy scorer factory: every position is a boundary.
    not_zero = pasio.NotZeroSplitter(base_splitter=pasio.SquareSplitter())
    result = not_zero.split(counts, simple_greedy_scorer_factory)
    assert result[1] == list(range(len(counts)))

    # NotConstantSplitter, greedy scorer factory, no explicit candidates.
    not_constant = pasio.NotConstantSplitter(
        base_splitter=pasio.SquareSplitter())
    result = not_constant.split(counts, simple_greedy_scorer_factory)
    assert result[1] == change_points

    # Same again, with the candidates passed as the third positional argument.
    not_constant = pasio.NotConstantSplitter(
        base_splitter=pasio.SquareSplitter())
    explicit_candidates = np.array(range(len(counts) - 1))
    result = not_constant.split(
        counts, simple_greedy_scorer_factory, explicit_candidates)
    assert result[1] == change_points

    # Direct check of the candidate filter.
    not_constant = pasio.NotConstantSplitter(
        base_splitter=pasio.SquareSplitter())
    expected_candidates = np.array([0, 3])
    for given_candidates in (None, np.array([0, 3])):
        found = not_constant.get_non_constant_split_candidates(
            counts, given_candidates)
        assert np.allclose(expected_candidates, found)
def test_split_with_length_regularization():
    """Check ``SquareSplitter`` with a per-segment length penalty.

    The penalty is ``1.5 * sum(1 / log(1 + segment_length))`` over the
    segments of a split.  Expected scores are floating-point expressions, so
    they are compared with ``np.isclose`` rather than exact equality.
    """
    # score of split 'AAA|B|AA' = 9+1+4 = 14
    # with regularization
    #   = 9+1+4 - 1.5*(1/log(3+1)+1/log(1+1)+1/log(2+1)) ~= 9.39
    # alternative split: 'AAA|BAA' gives score
    #   = 9+3 - 1.5*(1/log(3+1)+1/log(3+1)) ~= 9.84
    sequence = 'AAABAA'
    splitter = pasio.SquareSplitter(
        length_regularization_multiplier=1.5,
        length_regularization_function=lambda x: 1 / np.log(1 + x))
    optimal_split = splitter.split(sequence, SimpleScorer)
    assert optimal_split[1] == [0, 3]
    assert np.isclose(
        optimal_split[0],
        9 + 3 - 1.5 * (1 / np.log(3 + 1) + 1 / np.log(3 + 1)))

    # limiting possible splits: the expected split is 'AAAB|AA',
    # scoring 4+4 before the length penalty
    splitter = pasio.SquareSplitter(
        length_regularization_multiplier=1.5,
        length_regularization_function=lambda x: 1 / np.log(1 + x))
    optimal_split = splitter.split(
        sequence, SimpleScorer, split_candidates=np.array([0, 4, 5]))
    assert optimal_split[1] == [0, 4]
    assert np.isclose(
        optimal_split[0],
        4 + 4 - 1.5 * (1 / np.log(4 + 1) + 1 / np.log(2 + 1)))
def test_split_into_segments_candidates():
    """Check ``SquareSplitter`` when the allowed split points are restricted.

    Each case is ``(sequence, split_candidates, expected_splits,
    expected_score)``.  A fresh splitter is built for every case.
    """
    cases = (
        ('AAABBB', [0, 1, 2, 3, 5, 6], [0, 3], 9 + 9),
        ('AAABBB', [0, 3, 5, 6], [0, 3], 9 + 9),
        ('AAABBBC', [0, 3, 7], [0, 3], 9 + 4),
        ('AAABBBC', [0, 3], [0, 3], 9 + 4),
        # A constant sequence is left unsplit even though 3 is allowed.
        ('AAAAAA', [0, 3], [0], 36),
    )
    for sequence, candidates, expected_splits, expected_score in cases:
        result = pasio.SquareSplitter().split(
            sequence, simple_scorer_factory, split_candidates=candidates)
        assert result[1] == expected_splits
        assert result[0] == expected_score
def test_split_into_segments_square():
    """Check ``SquareSplitter`` on short sequences without candidate limits.

    Each case is ``(sequence, expected_splits, expected_score)``.  The
    expected scores are written as sums of squared run lengths.
    """
    cases = (
        ('A', [0], 1),
        ('AAA', [0], 9),
        ('AAABBB', [0, 3], 9 + 9),
        ('AAABBBC', [0, 3, 6], 9 + 9 + 1),
        ('ABBBC', [0, 1, 4], 1 + 9 + 1),
    )
    for sequence, expected_splits, expected_score in cases:
        result = pasio.SquareSplitter().split(sequence, simple_scorer_factory)
        assert result[1] == expected_splits
        assert result[0] == expected_score
def test_stat_split_into_segments_square():
    """Compare ``SquareSplitter`` with a brute-force single-split search.

    Five random inputs are drawn (seeded), each 100 Poisson(15) counts
    followed by 100 Poisson(20) counts.  For every input the test checks that:

    * the splitter's score is at least the best zero-or-one-split score;
    * the brute-force split point is among the splitter's boundaries;
    * the reported score matches ``pasio.compute_score_from_splits``;
    * the brute-force split point lies within 10 positions of index 100.
    """

    def split_on_two_segments_or_not(counts, scorer_factory):
        """Return ``(best_score, split_point)`` over splits into <= 2 segments.

        ``split_point`` stays 0 when no single split beats the score of the
        whole sequence; it is never ``None``.
        """
        scorer = scorer_factory(counts)
        best_score = scorer.score(0, len(counts))
        split_point = 0
        for i in range(len(counts)):
            current_score = scorer.score(stop=i) + scorer.score(start=i)
            if current_score > best_score:
                split_point = i
                best_score = current_score
        return best_score, split_point

    def scorer_factory(counts, split_candidates=None):
        """Build the scorer with both numeric parameters fixed to 1."""
        return pasio.LogMarginalLikelyhoodComputer(
            counts, 1, 1, split_candidates)

    np.random.seed(4)
    for _ in range(5):
        counts = np.concatenate(
            [np.random.poisson(15, 100), np.random.poisson(20, 100)])

        optimal_split = pasio.SquareSplitter().split(counts, scorer_factory)
        two_split = split_on_two_segments_or_not(counts, scorer_factory)

        assert optimal_split[0] >= two_split[0]
        assert two_split[1] in optimal_split[1]
        assert np.allclose(
            optimal_split[0],
            pasio.compute_score_from_splits(
                counts, optimal_split[1], scorer_factory))
        # The helper always returns an integer split point, so the former
        # ``is None`` branch could never run; assert the location directly.
        assert abs(two_split[1] - 100) < 10
def test_split_into_segments_slidingwindow():
    """Check ``SlidingWindowSplitter`` wrapped around ``SquareSplitter``.

    The input is a run of ``A`` followed by a run of ``B``.  The single
    expected boundary is at ``len(A)``, and the expected score is the sum of
    the squared run lengths.  When the base splitter is built with
    ``split_number_regularization_multiplier=2`` the expected score is lower
    by 2.
    """
    A = 'AAAAAAAAAAAAAAAA'
    B = 'BBBBBBBBBBBBBBBBB'
    sequence = A + B
    expected_boundaries = [0, len(A)]
    plain_score = len(A)**2 + len(B)**2

    # Unregularized base splitter.
    windowed = pasio.SlidingWindowSplitter(
        window_size=10,
        window_shift=5,
        base_splitter=pasio.SquareSplitter())
    result = windowed.split(sequence, simple_scorer_factory)
    assert result[1] == expected_boundaries
    assert result[0] == plain_score

    # Base splitter with a split-number regularization multiplier of 2.
    regularized_base = pasio.SquareSplitter(
        split_number_regularization_multiplier=2)
    windowed = pasio.SlidingWindowSplitter(
        window_size=10,
        window_shift=5,
        base_splitter=regularized_base)
    result = windowed.split(sequence, simple_scorer_factory)
    assert result[1] == expected_boundaries
    assert result[0] == plain_score - 2
def test_split_with_split_num_regularization():
    """Check ``SquareSplitter`` with a penalty on the number of splits.

    The splitter is built with multiplier 3 and the identity function as the
    split-number regularization function.
    """
    # Without the penalty 'AAA|B|AA' scores 9+1+4 = 14;
    # with it the score becomes 9+1+4 - 3*2 = 8.
    # The alternative 'AAA|BAA' scores 9+3 - 3*1 = 9, which is expected to win.
    penalized = pasio.SquareSplitter(
        split_number_regularization_multiplier=3,
        split_number_regularization_function=lambda x: x)
    result = penalized.split('AAABAA', SimpleScorer)
    best_score, boundaries = result[0], result[1]
    assert boundaries == [0, 3]
    assert best_score == 9
def segmentation(counts, scorer, candidates=None): optimal_split = pasio.SquareSplitter().split(counts, scorer, split_candidates=candidates)