示例#1
0
    def test_boundary_edit_distance(self):
        '''
        Test boundary_edit_distance.

        Builds boundary strings from the ``masses_an1`` and ``masses_an2``
        fixtures and checks that the edits reported between them match the
        expected three-element tuple.
        '''

        # NOTE(review): the three components presumably correspond to
        # additions, substitutions and transpositions -- confirm against
        # the boundary_edit_distance documentation.
        expected_edits = ([(1, 'b'), (1, 'b'), (1, 'b')], [], [(9, 10, 1)])
        string_an1 = boundary_string_from_masses(self.masses_an1)
        string_an2 = boundary_string_from_masses(self.masses_an2)
        # assertEqual replaces the assertEquals alias, which was deprecated
        # and is no longer available in Python 3.12+.
        self.assertEqual(
            expected_edits, boundary_edit_distance(string_an1, string_an2))
示例#2
0
def boundary_string_from_boundary_indices(segmentation, doc_length):
    """Convert boundary indices to a segeval-compatible boundary string.

    The boundary indices are first turned into segment masses (the number
    of tokens in each segment), which are then handed to
    ``segeval.boundary_string_from_masses``.

    Args:
        segmentation (list of int): list of segmentation boundary indices.
            May be empty, in which case the whole document is treated as a
            single segment. The mass computation assumes the indices are
            in ascending order -- TODO confirm with callers.
        doc_length (int): length of the segmented document

    Returns:
        tuple: Boundary string
    """
    # Without any boundary there is exactly one segment spanning the whole
    # document; looking up segmentation[-1] below would raise IndexError.
    if len(segmentation) == 0:
        return segeval.boundary_string_from_masses((doc_length,))

    masses = []
    tokens_in_segment = 0
    current_seg_index = 0
    for position in range(1, doc_length):
        tokens_in_segment += 1
        # Close the current segment once the position reaches the next
        # pending boundary index.
        if (current_seg_index < len(segmentation)
                and position >= segmentation[current_seg_index]):
            masses.append(tokens_in_segment)
            tokens_in_segment = 0
            current_seg_index += 1

    # The final segment runs from the last boundary to the document end.
    masses.append(doc_length - segmentation[-1])

    return segeval.boundary_string_from_masses(tuple(masses))
示例#3
0
    def test_boundary_string_from_masses(self):
        '''
        Test boundary_string_from_masses.

        The ``masses_an1`` fixture is expected to yield a 12-position
        boundary string in which only the position at index 10 contains
        the value 1; every other position is an empty frozenset.
        '''

        empty = frozenset([])
        expected = (empty,) * 10 + (frozenset([1]), empty)
        # assertEqual replaces the assertEquals alias, which was deprecated
        # and is no longer available in Python 3.12+.
        self.assertEqual(
            expected, boundary_string_from_masses(self.masses_an1))