def test_lcp_child_intervals_hermans_case(self):
     """Three-witness "Hermans" case: the child LCP intervals must be falsy."""
     witnesses = (
         ("W1", "a b c d F g h i ! K ! q r s t"),
         ("W2", "a b c d F g h i ! q r s t"),
         ("W3", "a b c d E g h i ! q r s t"),
     )
     collation = Collation()
     for sigil, content in witnesses:
         collation.add_plain_witness(sigil, content)
     # get_lcp_intervals() is unpacked as a pair; only the second element is checked.
     _parents, children = collation.get_lcp_intervals()
     self.assertFalse(children)
示例#2
0
 def test_filter_potential_blocks(self):
     """Calculate and filter potential blocks for two witnesses, then fail.

     NOTE(review): the final ``self.fail`` is unconditional and nothing is
     asserted about the filtered blocks, so this looks like a work-in-progress
     placeholder -- confirm before relying on it.
     """
     collation = Collation()
     for sigil, content in (
         ("W1", "the fox jumps over the fox"),
         ("w2", "the fox jumps over the dog"),
     ):
         collation.add_plain_witness(sigil, content)
     candidates = collation.calculate_potential_blocks()
     collation.filter_potential_blocks(candidates)
     self.fail("TESTING!")
示例#3
0
 def testDoubleTransposition1(self):
     """Collate two witnesses whose phrases are swapped and check both rows.

     Row 0 is expected to read "the cat" / "is" / "black" and row 1
     "black" / "is" / "the cat".
     """
     collation = Collation()
     collation.add_plain_witness("A", "the cat is black")
     collation.add_plain_witness("B", "black is the cat")
     alignment_table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["the cat", "is", "black"], alignment_table.rows[0].to_list())
     self.assertEqual(["black", "is", "the cat"], alignment_table.rows[1].to_list())
示例#4
0
    def test_near_matching_accidentally_incorrect_long(self):
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness(
            "B", "The brown fox jumps over there that dog.")
        alignment_table = str(
            collate(collation,
                    near_match=True,
                    segmentation=False,
                    scheduler=scheduler))
        self.assertTask("build column for rank", ["this", "6"], scheduler[0])
        self.assertTask("build column for rank", ["this", "7"], scheduler[1])
        self.assertTask("move node from prior rank to rank with best match",
                        ["this", "6", "7"], scheduler[2])
        self.assertTask("build column for rank", ["over", "5"], scheduler[3])
        self.assertTask("build column for rank", ["over", "6"], scheduler[4])
        self.assertEquals(5, len(scheduler))
        expected = """\
+---+-----+-------+-----+-------+------+-------+------+-----+---+
| A | The | brown | fox | jumps | over | -     | this | dog | . |
| B | The | brown | fox | jumps | over | there | that | dog | . |
+---+-----+-------+-----+-------+------+-------+------+-----+---+"""
        self.assertEquals(expected, alignment_table)
 def test_non_overlapping_blocks_overlap_case(self):
     """The block covering "in the in the bleach" must be among the scorer's blocks."""
     collation = Collation()
     for sigil, content in (
         ("W1", "in the in the bleach"),
         ("W2", "in the in the bleach in the"),
     ):
         collation.add_plain_witness(sigil, content)
     token_index = TokenIndex.create_token_index(collation)
     found = Scorer(token_index)._get_non_overlapping_repeating_blocks()
     # in the in the bleach
     wanted = Block(RangeSet("0-4, 6-10"))
     self.assertIn(wanted, found)
示例#6
0
 def testPlainWitness(self):
     """Add a witness given as an id/content dict and expect 10 tokens."""
     witness_data = {
         'id': 'A',
         'content': 'The quick brown fox jumped over the lazy dogs.',
     }
     collation = Collation()
     collation.add_witness(witness_data)
     token_count = len(collation.witnesses[0].tokens())
     self.assertEqual(token_count, 10)
 def test_witness_ranges_hermans_case(self):
     """Check the ranges the collation reports for each of two witnesses.

     W1 is expected at RangeSet "0-14" and W2 at RangeSet "17-29".
     """
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(RangeSet("0-14"),
                      collation.get_range_for_witness("W1"))
     self.assertEqual(RangeSet("17-29"),
                      collation.get_range_for_witness("W2"))
 def test_combined_string_hermans_case(self):
     """Check the space-joined ``combined_tokens`` for two witnesses.

     The expected string has the tokens of W1, then the marker tokens
     "$" and "1", then the tokens of W2.
     """
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     # $ is meant to separate witnesses here
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(
         "a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t",
         " ".join(collation.combined_tokens))
 def test_superbase_generation_multiple_short_witnesses(self):
     """Collate three one-token witnesses into a fresh graph; nothing is asserted."""
     collation = Collation()
     for sigil, content in (("A", "a"), ("B", "b"), ("C", "c")):
         collation.add_plain_witness(sigil, content)
     aligner = EditGraphAligner(collation)
     # The test only exercises collate(); it passes as long as no exception is raised.
     aligner.collate(VariantGraph())
示例#10
0
 def test_exact_matching(self):
     """Collate a long and a short sentence and compare each row's string cells."""
     collation = Collation()
     collation.add_plain_witness(
         "A", "I bought this glass , because it matches those dinner plates")
     collation.add_plain_witness("B", "I bought those glasses")
     table = collate(collation)
     expected_rows = (
         ["I bought ", "this glass , because it matches ", "those ", "dinner plates"],
         ["I bought ", None, "those ", "glasses"],
     )
     # Rows are checked one at a time, in witness order.
     for row_index, expected_cells in enumerate(expected_rows):
         self.assertEqual(expected_cells,
                          table.rows[row_index].to_list_of_strings())
示例#11
0
 def test_blocks_splitting_token_case(self):
     """The "a c b" block must be among the scorer's non-overlapping blocks."""
     collation = Collation()
     for sigil, content in (("W1", "a c b c"), ("W2", "a c b")):
         collation.add_plain_witness(sigil, content)
     found = Scorer(collation)._get_non_overlapping_repeating_blocks()
     # a c b
     wanted = Block(RangeSet("0-2, 5-7"))
     self.assertIn(wanted, found)
示例#12
0
 def test_non_overlapping_blocks_black_cat(self):
     """Two identical witnesses must yield exactly one block spanning both."""
     collation = Collation()
     for sigil in ("W1", "W2"):
         collation.add_plain_witness(sigil, "the black cat")
     found = Scorer(collation)._get_non_overlapping_repeating_blocks()
     only_block = Block(RangeSet("0-2, 4-6"))
     self.assertEqual([only_block], found)
示例#13
0
 def test_superbase(self):
     """Collate two witnesses and compare the aligner's ``new_superbase``."""
     collation = Collation()
     for sigil, content in (
         ("A", "X a b c d e f X g h i Y Z j k"),
         ("B", "a b c Y d e f Y Z g h i X j k"),
     ):
         collation.add_plain_witness(sigil, content)
     aligner = EditGraphAligner(collation)
     aligner.collate(VariantGraph(), collation)
     self.assertSuperbaseEquals(
         "X a b c Y d e f X Y Z g h i Y Z X j k", aligner.new_superbase)
 def test_filter_potential_blocks(self):
     """After filtering, no potential blocks may remain for "a a" versus "a"."""
     collation = Collation()
     for sigil, content in (("W1", "a a"), ("w2", "a")):
         collation.add_plain_witness(sigil, content)
     suffix_array = collation.to_extended_suffix_array()
     candidates = suffix_array.split_lcp_array_into_intervals()
     scorer = Scorer(collation)
     # The same object is checked afterwards, so the filter is expected to act on it in place.
     scorer.filter_potential_blocks(candidates)
     self.assertFalse(candidates)
 def test_non_overlapping_blocks_Hermans(self):
     """Both shared runs of the two-witness "Hermans" case must be found as blocks."""
     collation = Collation()
     for sigil, content in (
         ("W1", "a b c d F g h i ! K ! q r s t"),
         ("W2", "a b c d F g h i ! q r s t"),
     ):
         collation.add_plain_witness(sigil, content)
     found = Scorer(collation)._get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-8, 17-25",    # a b c d F g h i !
         "11-14, 26-29",  # q r s t
     )
     for spec in expected_ranges:
         self.assertIn(Block(RangeSet(spec)), found)
 def testThisMorningExample(self):
     """Smoke test: collate two sentences with transposition detection enabled.

     Nothing is asserted about the result; the test passes as long as
     ``collate`` does not raise.
     """
     collation = Collation()
     collation.add_plain_witness(
         "A", "This morning the cat observed little birds in the trees.")
     collation.add_plain_witness(
         "B",
         "The cat was observing birds in the little trees this morning, it observed birds for two hours."
     )
     # The returned table was never inspected, so it is no longer bound to a local.
     collate(collation, detect_transpositions=True)
示例#17
0
 def test_block_witnesses_Hermans_case_two_witnesses(self):
     """Check the block witness built for each of the two witnesses.

     Both witnesses are expected to give the same ``debug()`` output:
     "a b c d F g h i !" followed by "q r s t".
     """
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     algorithm = Scorer(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     block_witness = algorithm._get_block_witness(collation.witnesses[0])
     self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
     block_witness = algorithm._get_block_witness(collation.witnesses[1])
     self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
 def test_non_overlapping_blocks_Hermans(self):
     """Token-index variant: both shared runs of the "Hermans" case must be found."""
     collation = Collation()
     for sigil, content in (
         ("W1", "a b c d F g h i ! K ! q r s t"),
         ("W2", "a b c d F g h i ! q r s t"),
     ):
         collation.add_plain_witness(sigil, content)
     token_index = TokenIndex.create_token_index(collation)
     found = Scorer(token_index)._get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-8, 16-24",    # a b c d F g h i !
         "11-14, 25-28",  # q r s t
     )
     for spec in expected_ranges:
         self.assertIn(Block(RangeSet(spec)), found)
示例#19
0
 def test_hermans_witness_order_independence_case_two_witnesses(self):
     """Collate the two-witness "Hermans" case and check both rows as strings.

     Witness A is expected to have "K ! " in the middle cell; witness B is
     expected to have ``None`` there.
     """
     collation = Collation()
     collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
     alignment_table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["a b c d F g h i ! ", "K ! ", "q r s t"],
                      alignment_table.rows[0].to_list_of_strings())
     self.assertEqual(["a b c d F g h i ! ", None, "q r s t"],
                      alignment_table.rows[1].to_list_of_strings())
示例#20
0
 def test_split_lcp_intervals_into_smaller_intervals(self):
     """Three identical witnesses must split into exactly two LCP intervals."""
     collation = Collation()
     for sigil in ("W1", "W2", "W3"):
         collation.add_plain_witness(sigil, "the cat")
     suffix_array = collation.to_extended_suffix_array()
     split_intervals = suffix_array.split_lcp_array_into_intervals()
     expected_intervals = (
         (0, 2, 3),  # the cat
         (1, 1, 3),  # cat
     )
     for first, second, third in expected_intervals:
         self.assertIntervalIn(first, second, third, split_intervals)
     # The failure message lists what was actually produced.
     self.assertEqual(2, len(split_intervals),
                      "More items: " + str(split_intervals))
示例#21
0
 def test_superbase_generation_multiple_short_witnesses(self):
     """Three one-token witnesses must give the superbase "a b c"."""
     collation = Collation()
     for sigil, content in (("A", "a"), ("B", "b"), ("C", "c")):
         collation.add_plain_witness(sigil, content)
     aligner = EditGraphAligner(collation)
     aligner.collate(VariantGraph(), collation)
     self.assertSuperbaseEquals("a b c", aligner.new_superbase)
示例#22
0
 def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
     """The second LCP interval must report that it occurs in three witnesses."""
     collation = Collation()
     for sigil, content in (
         ("W1", "a b c d F g h i ! K ! q r s t"),
         ("W2", "a b c d F g h i ! q r s t"),
         ("W3", "a b c d E g h i ! q r s t"),
     ):
         collation.add_plain_witness(sigil, content)
     index = TokenIndex(collation.witnesses)
     index.prepare()
     lcp_intervals = index.split_lcp_array_into_intervals()
     # Index 1 is the interval for: ! q r s t
     self.assertEqual(3, lcp_intervals[1].number_of_witnesses)
示例#23
0
 def test_token_array_hermans_case(self):
     """Check the prepared token array of the two-witness "Hermans" case."""
     collation = Collation()
     for sigil, content in (
         ("W1", "a b c d F g h i ! K ! q r s t"),
         ("W2", "a b c d F g h i ! q r s t"),
     ):
         collation.add_plain_witness(sigil, content)
     index = TokenIndex(collation.witnesses)
     index.prepare()
     # $ is meant to separate witnesses here
     expected_tokens = "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t"
     self.assertTokenArray(expected_tokens, index)
示例#24
0
 def test_witness_ranges_hermans_case(self):
     """Check the ranges a prepared token index reports for two witnesses.

     W1 is expected at RangeSet "0-14" and W2 at RangeSet "16-28".
     """
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(RangeSet("0-14"),
                      token_index.get_range_for_witness("W1"))
     self.assertEqual(RangeSet("16-28"),
                      token_index.get_range_for_witness("W2"))
示例#25
0
 def testTokenArrayMarkersWithThreeWitnesses(self):
     """Check the "$0" and "$1" markers in the token array of three witnesses."""
     collation = Collation()
     for sigil, content in (
         ("W1", "interesting nice huh"),
         ("W2", "very nice right"),
         ("W3", "especially interesting"),
     ):
         collation.add_plain_witness(sigil, content)
     index = TokenIndex(collation.witnesses)
     index.prepare()
     expected_tokens = "interesting nice huh $0 very nice right $1 especially interesting"
     self.assertTokenArray(expected_tokens, index)
示例#26
0
 def testCachingOfSuffixArrayAndLCPArray(self):
     """Check ``get_sa()`` before and after a witness is added.

     Two consecutive calls must return equal values; a call made after a
     third witness is added must return a different value.
     """
     collation = Collation()
     collation.add_plain_witness("A", "content")
     collation.add_plain_witness("B", "content")
     sa1 = collation.get_sa()
     sa2 = collation.get_sa()
     # assertEqual/assertNotEqual replace the deprecated assertEquals/assertNotEquals
     # aliases (removed in Python 3.12).
     self.assertEqual(sa1, sa2)
     collation.add_plain_witness("C", "content")
     sa3 = collation.get_sa()
     self.assertNotEqual(sa2, sa3)
示例#27
0
 def test_blocks_failing_transposition_use_case_old_algorithm(self):
     """The transposed cat/dog witnesses must yield exactly three blocks, in order."""
     collation = Collation()
     for sigil, content in (
         ("W1", "the cat and the dog"),
         ("W2", "the dog and the cat"),
     ):
         collation.add_plain_witness(sigil, content)
     found = Scorer(collation)._get_non_overlapping_repeating_blocks()
     # The comparison is against a list, so the order of the blocks matters.
     expected = [
         Block(RangeSet(spec))
         for spec in ("0-1, 9-10", "3-4, 6-7", "2, 8")
     ]
     self.assertEqual(expected, found)
示例#28
0
    def test_near_matching_middle(self):
        # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        # Should go to the middle; incorrectly goes right
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 0123 efgh")
        collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 4); this is activeRank
        # Find the first witness with a gap at that rank (A)
        # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #   parameters are token string and rank to check
        self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #   is at rank 3, so move "0123" from current rank 2 to rank 3
        self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
        # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #   (rank 2, gap in A), with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        # Don't move it because it's closer to current location
        # No more gaps at rank 2, non gaps at rank 1, no more ranks
        self.assertEqual(6, len(scheduler))
        expected = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
        self.assertEqual(expected, alignment_table)

        def test_near_matching_clash(self):
            # If the previous rank has a vertex with more than one witness, where at least
            # one witness is a candidate for being moved, don't move it if any of the
            # witnesses has a node at the new rank.
            #
            # If there were only A and B, we'd move cce away from bbb to align with cce.
            # Witness C should prevent this.
            self.maxDiff = None
            collation = Collation()
            collation.add_plain_witness("A", "aaa bbb ccc ddd")
            collation.add_plain_witness("B", "aaa cce ddd")
            collation.add_plain_witness("C", "aaa cce ccc ddd")
            alignment_table = str(collate(collation, near_match=True, segmentation=False))
            expected = """\
    +---+-----+-----+-----+-----+
    | A | aaa | bbb | ccc | ddd |
    | B | aaa | cce | -   | ddd |
    | C | aaa | cce | ccc | ddd |
    +---+-----+-----+-----+-----+"""
            self.assertEqual(expected, alignment_table)
 def test_heuristic_function_everything_equals(self):
     """Build the heuristic table for two identical witnesses and check its rows."""
     collation = Collation()
     for sigil in ("A", "B"):
         collation.add_plain_witness(sigil, "everything equal")
     aligner = ExperimentalAstarAligner(collation)
     # NOTE(review): the first argument is a token list but the second is the
     # witness object itself -- confirm this asymmetry is what the aligner expects.
     first_tokens = collation.witnesses[0].tokens()
     second_witness = collation.witnesses[1]
     aligner._create_heuristic_table(first_tokens, second_witness)
     expected_rows = ([0, 1, 2], [1, 0, 1], [2, 1, 0])
     for row_index, expected_row in enumerate(expected_rows):
         self.assertEqual(expected_row, aligner.heuristic_table[row_index])
示例#30
0
 def test_lcp_intervals_failing_use_case_old_algorithm(self):
     """Check parent LCP intervals and the children listed under key 7."""
     collation = Collation()
     for sigil, content in (
         ("W1", "the cat and the dog"),
         ("W2", "the dog and the cat"),
     ):
         collation.add_plain_witness(sigil, content)
     parents, children = collation.get_lcp_intervals()
     for interval in ((1, 2), (3, 4), (5, 6), (7, 10)):
         self.assertIn(interval, parents)
     # The children of the (7, 10) parent are looked up under key 7.
     for interval in ((7, 8), (9, 10)):
         self.assertIn(interval, children[7])