def test_lcp_child_intervals_hermans_case(self):
    """Expect no child LCP intervals for the three-witness Hermans input.

    Adds witnesses W1, W2 and W3 to a collation and asserts that the second
    value returned by ``get_lcp_intervals()`` is falsy.
    """
    witness_texts = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    )
    corpus = Collation()
    for sigil, text in witness_texts:
        corpus.add_plain_witness(sigil, text)
    _parents, children = corpus.get_lcp_intervals()
    self.assertFalse(children)
def test_filter_potential_blocks(self):
    """Compute and filter potential blocks for two short witnesses.

    NOTE(review): the method ends with an unconditional
    ``self.fail("TESTING!")``, so it always fails once the calls above it
    return. This looks like a work-in-progress placeholder -- confirm.
    """
    witness_texts = (
        ("W1", "the fox jumps over the fox"),
        ("w2", "the fox jumps over the dog"),
    )
    corpus = Collation()
    for sigil, text in witness_texts:
        corpus.add_plain_witness(sigil, text)
    candidates = corpus.calculate_potential_blocks()
    corpus.filter_potential_blocks(candidates)
    self.fail("TESTING!")
def testDoubleTransposition1(self):
    """Collate "the cat is black" against "black is the cat".

    Asserts the cell contents of both rows of the resulting alignment table.
    """
    collation = Collation()
    collation.add_plain_witness("A", "the cat is black")
    collation.add_plain_witness("B", "black is the cat")
    alignment_table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(["the cat", "is", "black"],
                     alignment_table.rows[0].to_list())
    self.assertEqual(["black", "is", "the cat"],
                     alignment_table.rows[1].to_list())
def test_near_matching_accidentally_incorrect_long(self):
    """Check the scheduled near-match tasks and the rendered table.

    Collates two witnesses with ``near_match=True`` and
    ``segmentation=False``, passing a ``Scheduler`` to record tasks.
    Asserts the five recorded tasks in order, then compares the string form
    of the alignment table with the expected ASCII table.
    """
    self.maxDiff = None  # show the full diff if the table comparison fails
    scheduler = Scheduler()
    collation = Collation()
    collation.add_plain_witness("A", "The brown fox jumps over this dog.")
    collation.add_plain_witness(
        "B", "The brown fox jumps over there that dog.")
    alignment_table = str(
        collate(collation, near_match=True, segmentation=False,
                scheduler=scheduler))
    self.assertTask("build column for rank", ["this", "6"], scheduler[0])
    self.assertTask("build column for rank", ["this", "7"], scheduler[1])
    self.assertTask("move node from prior rank to rank with best match",
                    ["this", "6", "7"], scheduler[2])
    self.assertTask("build column for rank", ["over", "5"], scheduler[3])
    self.assertTask("build column for rank", ["over", "6"], scheduler[4])
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(5, len(scheduler))
    expected = """\
+---+-----+-------+-----+-------+------+-------+------+-----+---+
| A | The | brown | fox | jumps | over | -     | this | dog | . |
| B | The | brown | fox | jumps | over | there | that | dog | . |
+---+-----+-------+-----+-------+------+-------+------+-----+---+"""
    self.assertEqual(expected, alignment_table)
def test_non_overlapping_blocks_overlap_case(self):
    """Expect the block covering ranges 0-4 and 6-10 among the results.

    Builds a ``Scorer`` on a token index created from two witnesses and
    checks the output of ``_get_non_overlapping_repeating_blocks()``.
    """
    corpus = Collation()
    corpus.add_plain_witness("W1", "in the in the bleach")
    corpus.add_plain_witness("W2", "in the in the bleach in the")
    scorer = Scorer(TokenIndex.create_token_index(corpus))
    found = scorer._get_non_overlapping_repeating_blocks()
    # in the in the bleach
    wanted = Block(RangeSet("0-4, 6-10"))
    self.assertIn(wanted, found)
def testPlainWitness(self):
    """Add a dict-described witness and expect its first entry to have 10 tokens."""
    witness_spec = {
        'id': 'A',
        'content': 'The quick brown fox jumped over the lazy dogs.',
    }
    corpus = Collation()
    corpus.add_witness(witness_spec)
    token_count = len(corpus.witnesses[0].tokens())
    self.assertEqual(token_count, 10)
def test_witness_ranges_hermans_case(self):
    """Check the ranges the collation reports for witnesses W1 and W2.

    Expects ``RangeSet("0-14")`` for W1 and ``RangeSet("17-29")`` for W2
    from ``Collation.get_range_for_witness``.
    """
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(RangeSet("0-14"),
                     collation.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("17-29"),
                     collation.get_range_for_witness("W2"))
def test_combined_string_hermans_case(self):
    """Check the space-joined ``combined_tokens`` of a two-witness collation."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    # $ is meant to separate witnesses here
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        "a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t",
        " ".join(collation.combined_tokens))
def test_superbase_generation_multiple_short_witnesses(self):
    """Run ``EditGraphAligner.collate`` on three one-token witnesses.

    The test makes no assertions; it only checks that the calls complete
    without raising.
    """
    corpus = Collation()
    for sigil, text in (("A", "a"), ("B", "b"), ("C", "c")):
        corpus.add_plain_witness(sigil, text)
    edit_graph_aligner = EditGraphAligner(corpus)
    variant_graph = VariantGraph()
    edit_graph_aligner.collate(variant_graph)
def test_exact_matching(self):
    """Collate a long and a short witness and check both rows as strings.

    The second row is expected to contain ``None`` in the second cell.
    """
    corpus = Collation()
    corpus.add_plain_witness(
        "A", "I bought this glass , because it matches those dinner plates")
    corpus.add_plain_witness("B", "I bought those glasses")
    table = collate(corpus)
    first_row = table.rows[0].to_list_of_strings()
    self.assertEqual(
        ["I bought ", "this glass , because it matches ", "those ",
         "dinner plates"],
        first_row)
    second_row = table.rows[1].to_list_of_strings()
    self.assertEqual(["I bought ", None, "those ", "glasses"], second_row)
def test_blocks_splitting_token_case(self):
    """Expect the block covering ranges 0-2 and 5-7 among the scorer's blocks."""
    corpus = Collation()
    for sigil, text in (("W1", "a c b c"), ("W2", "a c b")):
        corpus.add_plain_witness(sigil, text)
    scorer = Scorer(corpus)
    found = scorer._get_non_overlapping_repeating_blocks()
    # a c b
    wanted = Block(RangeSet("0-2, 5-7"))
    self.assertIn(wanted, found)
def test_non_overlapping_blocks_black_cat(self):
    """Expect exactly one block (ranges 0-2 and 4-6) for two identical witnesses."""
    corpus = Collation()
    for sigil in ("W1", "W2"):
        corpus.add_plain_witness(sigil, "the black cat")
    scorer = Scorer(corpus)
    found = scorer._get_non_overlapping_repeating_blocks()
    only_block = Block(RangeSet("0-2, 4-6"))
    self.assertEqual([only_block], found)
def test_superbase(self):
    """Collate two witnesses and check the aligner's ``new_superbase``."""
    corpus = Collation()
    corpus.add_plain_witness("A", "X a b c d e f X g h i Y Z j k")
    corpus.add_plain_witness("B", "a b c Y d e f Y Z g h i X j k")
    edit_graph_aligner = EditGraphAligner(corpus)
    variant_graph = VariantGraph()
    edit_graph_aligner.collate(variant_graph, corpus)
    self.assertSuperbaseEquals(
        "X a b c Y d e f X Y Z g h i Y Z X j k",
        edit_graph_aligner.new_superbase)
def test_filter_potential_blocks(self):
    """Expect the interval list to be falsy after ``filter_potential_blocks``.

    The intervals come from the collation's extended suffix array and are
    passed to ``Scorer.filter_potential_blocks``; the same list object is
    then asserted to be falsy.
    """
    corpus = Collation()
    for sigil, text in (("W1", "a a"), ("w2", "a")):
        corpus.add_plain_witness(sigil, text)
    suffix_array = corpus.to_extended_suffix_array()
    candidates = suffix_array.split_lcp_array_into_intervals()
    scorer = Scorer(corpus)
    scorer.filter_potential_blocks(candidates)
    self.assertFalse(candidates)
def test_non_overlapping_blocks_Hermans(self):
    """Expect two specific blocks among the scorer's non-overlapping blocks."""
    corpus = Collation()
    corpus.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    corpus.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    scorer = Scorer(corpus)
    found = scorer._get_non_overlapping_repeating_blocks()
    # a b c d F g h i !
    leading_block = Block(RangeSet("0-8, 17-25"))
    self.assertIn(leading_block, found)
    # q r s t
    trailing_block = Block(RangeSet("11-14, 26-29"))
    self.assertIn(trailing_block, found)
def testThisMorningExample(self):
    """Collate the "this morning" pair with transposition detection enabled.

    The test makes no assertions; it only checks that ``collate`` completes
    without raising. The result was previously bound to an unused local,
    which has been dropped.
    """
    collation = Collation()
    collation.add_plain_witness(
        "A", "This morning the cat observed little birds in the trees.")
    collation.add_plain_witness(
        "B",
        "The cat was observing birds in the little trees this morning, it observed birds for two hours."
    )
    collate(collation, detect_transpositions=True)
def test_block_witnesses_Hermans_case_two_witnesses(self):
    """Check the ``debug()`` output of the block witness for each witness.

    Both witnesses are expected to yield the same two entries.
    """
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    algorithm = Scorer(collation)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    block_witness = algorithm._get_block_witness(collation.witnesses[0])
    self.assertEqual(["a b c d F g h i !", "q r s t"],
                     block_witness.debug())
    block_witness = algorithm._get_block_witness(collation.witnesses[1])
    self.assertEqual(["a b c d F g h i !", "q r s t"],
                     block_witness.debug())
def test_non_overlapping_blocks_Hermans(self):
    """Expect two specific blocks when the scorer is built on a token index."""
    corpus = Collation()
    corpus.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    corpus.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    scorer = Scorer(TokenIndex.create_token_index(corpus))
    found = scorer._get_non_overlapping_repeating_blocks()
    # a b c d F g h i !
    leading_block = Block(RangeSet("0-8, 16-24"))
    self.assertIn(leading_block, found)
    # q r s t
    trailing_block = Block(RangeSet("11-14, 25-28"))
    self.assertIn(trailing_block, found)
def test_hermans_witness_order_independence_case_two_witnesses(self):
    """Collate the two Hermans witnesses and check both rows as strings.

    Row B is expected to have ``None`` where row A has the extra "K ! ".
    """
    collation = Collation()
    collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
    alignment_table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(["a b c d F g h i ! ", "K ! ", "q r s t"],
                     alignment_table.rows[0].to_list_of_strings())
    self.assertEqual(["a b c d F g h i ! ", None, "q r s t"],
                     alignment_table.rows[1].to_list_of_strings())
def test_split_lcp_intervals_into_smaller_intervals(self):
    """Expect exactly two intervals for three identical "the cat" witnesses."""
    corpus = Collation()
    for sigil in ("W1", "W2", "W3"):
        corpus.add_plain_witness(sigil, "the cat")
    suffix_array = corpus.to_extended_suffix_array()
    intervals = suffix_array.split_lcp_array_into_intervals()
    # the cat
    self.assertIntervalIn(0, 2, 3, intervals)
    # cat
    self.assertIntervalIn(1, 1, 3, intervals)
    failure_note = "More items: " + str(intervals)
    self.assertEqual(2, len(intervals), failure_note)
def test_superbase_generation_multiple_short_witnesses(self):
    """Collate three one-token witnesses and expect the superbase "a b c"."""
    corpus = Collation()
    for sigil, text in (("A", "a"), ("B", "b"), ("C", "c")):
        corpus.add_plain_witness(sigil, text)
    edit_graph_aligner = EditGraphAligner(corpus)
    variant_graph = VariantGraph()
    edit_graph_aligner.collate(variant_graph, corpus)
    self.assertSuperbaseEquals("a b c", edit_graph_aligner.new_superbase)
def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
    """Expect the interval at index 1 to report three witnesses.

    The intervals come from a prepared ``TokenIndex`` built on the three
    Hermans witnesses.
    """
    witness_texts = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    )
    corpus = Collation()
    for sigil, text in witness_texts:
        corpus.add_plain_witness(sigil, text)
    index = TokenIndex(corpus.witnesses)
    index.prepare()
    # ! q r s t
    candidate = index.split_lcp_array_into_intervals()[1]
    self.assertEqual(3, candidate.number_of_witnesses)
def test_token_array_hermans_case(self):
    """Check the token array of a prepared index over two Hermans witnesses."""
    corpus = Collation()
    corpus.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    corpus.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    index = TokenIndex(corpus.witnesses)
    index.prepare()
    # $ is meant to separate witnesses here
    expected_tokens = (
        "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t")
    self.assertTokenArray(expected_tokens, index)
def test_witness_ranges_hermans_case(self):
    """Check the ranges a prepared ``TokenIndex`` reports for W1 and W2.

    Expects ``RangeSet("0-14")`` for W1 and ``RangeSet("16-28")`` for W2.
    """
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    token_index = TokenIndex(collation.witnesses)
    token_index.prepare()
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(RangeSet("0-14"),
                     token_index.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("16-28"),
                     token_index.get_range_for_witness("W2"))
def testTokenArrayMarkersWithThreeWitnesses(self):
    """Check the token array, including $0 and $1 markers, for three witnesses."""
    witness_texts = (
        ("W1", "interesting nice huh"),
        ("W2", "very nice right"),
        ("W3", "especially interesting"),
    )
    corpus = Collation()
    for sigil, text in witness_texts:
        corpus.add_plain_witness(sigil, text)
    index = TokenIndex(corpus.witnesses)
    index.prepare()
    expected_tokens = (
        "interesting nice huh $0 very nice right $1 especially interesting")
    self.assertTokenArray(expected_tokens, index)
def testCachingOfSuffixArrayAndLCPArray(self):
    """Compare ``get_sa()`` results before and after adding a witness.

    Two consecutive calls must return equal values; after a third witness
    is added, the new result must differ from the earlier one.

    NOTE(review): the assertions compare by equality only, so they do not
    prove that the same cached object is returned -- confirm whether an
    identity check is intended.
    """
    collation = Collation()
    collation.add_plain_witness("A", "content")
    collation.add_plain_witness("B", "content")
    sa1 = collation.get_sa()
    sa2 = collation.get_sa()
    # assertEqual/assertNotEqual replace the deprecated assertEquals and
    # assertNotEquals aliases, which were removed from unittest in 3.12.
    self.assertEqual(sa1, sa2)
    collation.add_plain_witness("C", "content")
    sa3 = collation.get_sa()
    self.assertNotEqual(sa2, sa3)
def test_blocks_failing_transposition_use_case_old_algorithm(self):
    """Expect exactly three blocks, in order, for the cat/dog transposition."""
    corpus = Collation()
    corpus.add_plain_witness("W1", "the cat and the dog")
    corpus.add_plain_witness("W2", "the dog and the cat")
    scorer = Scorer(corpus)
    found = scorer._get_non_overlapping_repeating_blocks()
    range_specs = ("0-1, 9-10", "3-4, 6-7", "2, 8")
    wanted = [Block(RangeSet(spec)) for spec in range_specs]
    self.assertEqual(wanted, found)
def test_near_matching_middle(self):
    """Check scheduled tasks and table when the best near match is in the middle.

    Three candidates, closest is middle, match rank 2 0 1 (0 is closest).
    Should go to the middle; incorrectly goes right.
    """
    self.maxDiff = None
    tasks = Scheduler()
    corpus = Collation()
    corpus.add_plain_witness("A", "abcd 0123 efgh")
    corpus.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
    rendered = str(collate(corpus, near_match=True, segmentation=False,
                           scheduler=tasks))
    # Find the rightmost rank with a gap (rank 4); this is activeRank.
    # Find the first witness with a gap at that rank (A).
    # Find the first token to the left of the gap for the first gappy
    # witness ("0123" in A at rank 2) and check whether to move it.
    expected_tasks = (
        # Calculate strength of match for all columns from the token's
        # current rank (2) through activeRank (4); parameters are the token
        # string and the rank to check.
        ("build column for rank", ["0123", "2"]),
        ("build column for rank", ["0123", "3"]),
        ("build column for rank", ["0123", "4"]),
        # The best (max()) fit of "0123" among all ranks between current
        # rank 2 and activeRank 4 is at rank 3, so move "0123" from current
        # rank 2 to rank 3.
        ("move node from prior rank to rank with best match",
         ["0123", "2", "3"]),
        # No more gaps at activeRank 4, no gaps at rank 3, so move to the
        # next rank with a gap (rank 2, gap in A), with "abcd" at rank 1.
        ("build column for rank", ["abcd", "1"]),
        ("build column for rank", ["abcd", "2"]),
    )
    for position, (label, arguments) in enumerate(expected_tasks):
        self.assertTask(label, arguments, tasks[position])
    # Don't move it because it's closer to its current location.
    # No more gaps at rank 2, no gaps at rank 1, no more ranks.
    self.assertEqual(6, len(tasks))
    expected_table = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
    self.assertEqual(expected_table, rendered)

def test_near_matching_clash(self):
    """Check that a shared vertex is not moved when a witness blocks it.

    If the previous rank has a vertex with more than one witness, where at
    least one witness is a candidate for being moved, don't move it if any
    of the witnesses has a node at the new rank.

    If there were only A and B, we'd move cce away from bbb to align with
    cce. Witness C should prevent this.
    """
    self.maxDiff = None
    witness_texts = (
        ("A", "aaa bbb ccc ddd"),
        ("B", "aaa cce ddd"),
        ("C", "aaa cce ccc ddd"),
    )
    corpus = Collation()
    for sigil, text in witness_texts:
        corpus.add_plain_witness(sigil, text)
    rendered = str(collate(corpus, near_match=True, segmentation=False))
    expected_table = """\
+---+-----+-----+-----+-----+
| A | aaa | bbb | ccc | ddd |
| B | aaa | cce | -   | ddd |
| C | aaa | cce | ccc | ddd |
+---+-----+-----+-----+-----+"""
    self.assertEqual(expected_table, rendered)
def test_heuristic_function_everything_equals(self):
    """Check the 3x3 heuristic table built for two identical witnesses.

    Calls ``_create_heuristic_table`` with the first witness's tokens and
    the second witness object, then asserts each row of
    ``aligner.heuristic_table``. The redundant trailing ``pass`` has been
    removed.
    """
    collation = Collation()
    collation.add_plain_witness("A", "everything equal")
    collation.add_plain_witness("B", "everything equal")
    aligner = ExperimentalAstarAligner(collation)
    aligner._create_heuristic_table(collation.witnesses[0].tokens(),
                                    collation.witnesses[1])
    self.assertEqual([0, 1, 2], aligner.heuristic_table[0])
    self.assertEqual([1, 0, 1], aligner.heuristic_table[1])
    self.assertEqual([2, 1, 0], aligner.heuristic_table[2])
def test_lcp_intervals_failing_use_case_old_algorithm(self):
    """Check parent and child LCP intervals for the cat/dog transposition.

    Four parent intervals are expected, and two child intervals are
    expected under key 7 of the child mapping.
    """
    corpus = Collation()
    corpus.add_plain_witness("W1", "the cat and the dog")
    corpus.add_plain_witness("W2", "the dog and the cat")
    parents, children = corpus.get_lcp_intervals()
    for parent_interval in ((1, 2), (3, 4), (5, 6), (7, 10)):
        self.assertIn(parent_interval, parents)
    for child_interval in ((7, 8), (9, 10)):
        self.assertIn(child_interval, children[7])