def test_edit_distance_compliment(self):
    '''
    Compare a segmentation with no boundaries against one that has a
    type-1 boundary at every position; ten (1, 'b') additions and no
    substitutions or transpositions are expected.
    '''
    length = 10
    # Build a distinct set object for every position.
    a = [set() for _ in range(length)]
    b = [{1} for _ in range(length)]
    expected = ([(1, 'b')] * length, [], [])
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(expected, (adds, subs, trans))
def test_edit_distance(self):
    '''
    Test a mixed example whose expected result contains additions, a
    substitution and a transposition at the same time.
    '''
    a = [set(), {1}, set(), set(), {1},
         set(), set(), set(), set(), set()]
    b = [set(), {2, 3}, set(), set(), set(),
         {1}, set(), set(), {3}, set()]
    expected_additions = [(3, 'b'), (3, 'b')]
    expected_substitutions = [(1, 2)]
    expected_transpositions = [(4, 5, 1)]
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(
        (expected_additions, expected_substitutions,
         expected_transpositions),
        (adds, subs, trans))
def test_edit_distance_two_substitutions_one_ad_into_no_transpositions(
        self):
    '''
    With n_t=3, expect two substitutions plus one addition and no
    transpositions.
    '''
    a = [set(), set(), set(), {2}, set(),
         {3}, set(), set(), set(), set()]
    b = [set(), set(), set(), {1, 4}, set(),
         {2}, set(), set(), set(), set()]
    expected = ([(4, 'b')], [(2, 1), (3, 2)], [])
    adds, subs, trans = boundary_edit_distance(a, b, n_t=3)
    self.assertEqual(expected, (adds, subs, trans))
def test_edit_distance_three_transpositions_overlapping(self):
    '''
    With n_t=3, three boundaries placed at one position in ``a`` and
    spread over two later positions in ``b`` are expected to come back
    as three transpositions and nothing else.
    '''
    a = [set(), set(), set(), {1, 2, 3}, set(),
         set(), set(), set(), set(), set()]
    b = [set(), set(), set(), set(), {3},
         {1, 2}, set(), set(), set(), set()]
    expected = ([], [], [(3, 4, 3), (3, 5, 1), (3, 5, 2)])
    adds, subs, trans = boundary_edit_distance(a, b, n_t=3)
    self.assertEqual(expected, (adds, subs, trans))
def test_edit_distance_two_transpositions_equal(self):
    '''
    Two type-2 boundaries in ``a`` sit on either side of a single type-2
    boundary in ``b``; one transposition and one (2, 'a') addition are
    expected.
    '''
    a = [set(), set(), set(), set(), {2},
         set(), {2}, set(), set(), set()]
    b = [set(), set(), set(), set(), set(),
         {2}, set(), set(), set(), set()]
    expected = ([(2, 'a')], [], [(4, 5, 2)])
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(expected, (adds, subs, trans))
def test_edit_distance_two_substitutions_into_no_transpositions(self):
    '''
    With n_t=3, expect exactly two substitutions and neither additions
    nor transpositions.
    '''
    a = [set(), set(), set(), {2}, set(),
         {3}, set(), set(), set(), set()]
    b = [set(), set(), set(), {1}, set(),
         {2}, set(), set(), set(), set()]
    expected = ([], [(2, 1), (3, 2)], [])
    adds, subs, trans = boundary_edit_distance(a, b, n_t=3)
    self.assertEqual(expected, (adds, subs, trans))
def test_edit_distance_two_transpositions_equal(self):
    '''
    Expect one transposition and one (2, 'a') addition when ``a`` holds
    type-2 boundaries at positions 4 and 6 and ``b`` holds one at
    position 5.
    '''
    size = 10
    # Start from all-empty positions, then place the boundaries.
    a = [set() for _ in range(size)]
    b = [set() for _ in range(size)]
    a[4].add(2)
    a[6].add(2)
    b[5].add(2)
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(([(2, 'a')], [], [(4, 5, 2)]),
                     (adds, subs, trans))
def test_edit_distance_three_transpositions_overlapping(self):
    '''
    Expect three transpositions (and no additions or substitutions) with
    n_t=3 when ``a`` has boundary types 1, 2 and 3 at position 3 while
    ``b`` has type 3 at position 4 and types 1 and 2 at position 5.
    '''
    size = 10
    # Start from all-empty positions, then place the boundaries.
    a = [set() for _ in range(size)]
    b = [set() for _ in range(size)]
    a[3].update([1, 2, 3])
    b[4].update([3])
    b[5].update([1, 2])
    adds, subs, trans = boundary_edit_distance(a, b, n_t=3)
    self.assertEqual(([], [], [(3, 4, 3), (3, 5, 1), (3, 5, 2)]),
                     (adds, subs, trans))
def __boundary_statistics__(
        segs_a, segs_b, boundary_types, boundary_format, n_t, weight):
    '''
    Compare two segmentations and collect weighted boundary-edit statistics.

    Both inputs are brought into the ``BoundaryFormat.sets`` representation,
    the edits between them are obtained from ``boundary_edit_distance`` and
    the three weighting functions in ``weight`` are applied to those edits.

    segs_a, segs_b  -- segmentations in the format named by
                       ``boundary_format``.
    boundary_types  -- overwritten with the result of ``identify_types``
                       before it is ever read.
    boundary_format -- ``BoundaryFormat.nltk``, ``.sets``, ``.mass`` or
                       ``.position``; anything else raises.
    n_t             -- forwarded to ``boundary_edit_distance`` and to the
                       transposition weighting function.
    weight          -- three callables, unpacked in the order additions,
                       substitutions, transpositions.

    Returns a dict with the keys ``count_edits``, ``additions``,
    ``substitutions``, ``transpositions``, ``full_misses``,
    ``boundaries_all``, ``matches``, ``pbs`` and ``boundary_types``.

    Raises SegmentationMetricError for an unsupported format or when the
    converted segmentations differ in length.
    '''
    # NLTK input is reduced to masses and handled as masses from here on.
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Inputs already given as boundary sets need no conversion.
    if boundary_format != BoundaryFormat.sets:
        if boundary_format == BoundaryFormat.position:
            # Positions take the extra step through masses first.
            segs_a = convert_positions_to_masses(segs_a)
            segs_b = convert_positions_to_masses(segs_b)
        elif boundary_format != BoundaryFormat.mass:
            raise SegmentationMetricError('Unsupported boundary format')
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    # Both boundary strings must cover the same number of positions.
    len_a, len_b = len(segs_a), len(segs_b)
    if len_a != len_b:
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len_a, len_b))
    # The caller-supplied boundary_types is replaced here.
    boundary_types = identify_types(segs_a, segs_b)
    pbs = len_b * len(boundary_types)
    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)
    # Weight each kind of edit separately, then total them.
    weigh_additions, weigh_substitutions, weigh_transpositions = weight
    count_additions = weigh_additions(additions)
    count_substitutions = weigh_substitutions(
        substitutions, max(boundary_types), min(boundary_types))
    count_transpositions = weigh_transpositions(transpositions, n_t)
    count_edits = count_additions + count_substitutions + count_transpositions
    # Position-by-position tallies of shared and unshared boundaries.
    matches = []
    full_misses = []
    boundaries_all = 0
    for bounds_a, bounds_b in zip(segs_a, segs_b):
        matches.extend(bounds_a.intersection(bounds_b))
        full_misses.extend(bounds_a.symmetric_difference(bounds_b))
        boundaries_all += len(bounds_a) + len(bounds_b)
    statistics = {
        'count_edits': count_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': full_misses,
        'boundaries_all': boundaries_all,
        'matches': matches,
        'pbs': pbs,
        'boundary_types': boundary_types,
    }
    return statistics
def test_edit_distance(self):
    '''
    Mixed case: the expected result holds two additions, one substitution
    and one transposition.
    '''
    size = 10
    # Start from all-empty positions, then place the boundaries.
    a = [set() for _ in range(size)]
    b = [set() for _ in range(size)]
    a[1].update([1])
    a[4].update([1])
    b[1].update([2, 3])
    b[5].update([1])
    b[8].update([3])
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(([(3, 'b'), (3, 'b')], [(1, 2)], [(4, 5, 1)]),
                     (adds, subs, trans))
def test_edit_distance_compliment(self):
    '''
    ``a`` has no boundaries and ``b`` has a type-1 boundary at each of its
    ten positions; the expected result is ten (1, 'b') additions with no
    substitutions or transpositions.
    '''
    a = []
    b = []
    for _ in range(10):
        a.append(set())
        b.append({1})
    expected_additions = [(1, 'b') for _ in range(10)]
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual((expected_additions, [], []), (adds, subs, trans))
def test_edit_distance_identity(self):
    '''
    Compare a segmentation against itself (the very same list object);
    no additions, substitutions or transpositions are expected.
    '''
    a = [set(), {1}, set(), set(), {1},
         set(), set(), set(), set(), set()]
    # Deliberately the same object, not a copy.
    b = a
    expected = ([], [], [])
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(expected, (adds, subs, trans))
def test_edit_distance_identity(self):
    '''
    Expect three empty edit lists when both arguments are the same
    segmentation object.
    '''
    size = 10
    a = [set() for _ in range(size)]
    a[1].add(1)
    a[4].add(1)
    # Alias rather than copy, so both arguments are one list.
    b = a
    adds, subs, trans = boundary_edit_distance(a, b)
    self.assertEqual(([], [], []), (adds, subs, trans))
def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format,
                            n_t, weight):
    '''
    Gather edit counts and boundary tallies for a pair of segmentations.

    The inputs are normalised to the ``BoundaryFormat.sets`` form, edits are
    taken from ``boundary_edit_distance`` and each kind of edit is passed
    through its weighting function from ``weight`` (unpacked in the order
    additions, substitutions, transpositions).

    The ``boundary_types`` argument is not read: it is reassigned from
    ``identify_types(segs_a, segs_b)`` after format conversion.

    Returns a dict keyed by ``count_edits``, ``additions``,
    ``substitutions``, ``transpositions``, ``full_misses``,
    ``boundaries_all``, ``matches``, ``pbs`` and ``boundary_types``.

    Raises SegmentationMetricError if ``boundary_format`` is not one of the
    nltk, sets, mass or position formats, or if the two converted
    segmentations have different lengths.
    '''
    # Treat NLTK-style input as masses once converted.
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass

    # Only non-set formats need turning into boundary strings.
    already_sets = boundary_format == BoundaryFormat.sets
    if not already_sets:
        if boundary_format == BoundaryFormat.mass:
            pass  # Masses convert directly below
        elif boundary_format == BoundaryFormat.position:
            segs_a = convert_positions_to_masses(segs_a)
            segs_b = convert_positions_to_masses(segs_b)
        else:
            raise SegmentationMetricError('Unsupported boundary format')
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)

    # Refuse segmentations of unequal length.
    if len(segs_a) != len(segs_b):
        message = 'Segmentations differ in length ({0} != {1})'.format(
            len(segs_a), len(segs_b))
        raise SegmentationMetricError(message)

    # Boundary types come from the data, not from the caller.
    boundary_types = identify_types(segs_a, segs_b)
    pbs = len(segs_b) * len(boundary_types)

    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)

    # Apply the per-edit-kind weighting functions.
    weight_a, weight_s, weight_t = weight
    weighted_additions = weight_a(additions)
    weighted_substitutions = weight_s(
        substitutions, max(boundary_types), min(boundary_types))
    weighted_transpositions = weight_t(transpositions, n_t)
    count_edits = (weighted_additions + weighted_substitutions
                   + weighted_transpositions)

    # Walk both boundary strings in lockstep to tally matches and misses.
    matches = []
    full_misses = []
    boundaries_all = 0
    for pair in zip(segs_a, segs_b):
        set_a, set_b = pair
        matches.extend(set_a.intersection(set_b))
        full_misses.extend(set_a.symmetric_difference(set_b))
        boundaries_all += len(set_a) + len(set_b)

    return dict(
        count_edits=count_edits,
        additions=additions,
        substitutions=substitutions,
        transpositions=transpositions,
        full_misses=full_misses,
        boundaries_all=boundaries_all,
        matches=matches,
        pbs=pbs,
        boundary_types=boundary_types,
    )