def __potential_boundaries__(segmentation_a, segmentation_b, **kwargs):
    '''
    Count the potential boundaries between a pair of segmentations: the
    number of boundary positions multiplied by the number of boundary types
    observed across both segmentations.

    Expects ``kwargs['boundary_format']`` to name the input format; inputs
    are normalised to boundary strings (sequences of boundary-type sets)
    before counting.

    :raises SegmentationMetricError: if the boundary format is unsupported.
    '''
    fmt = kwargs['boundary_format']
    string_a, string_b = segmentation_a, segmentation_b
    # NLTK-style strings are first normalised to masses
    if fmt == BoundaryFormat.nltk:
        string_a = convert_nltk_to_masses(segmentation_a)
        string_b = convert_nltk_to_masses(segmentation_b)
        fmt = BoundaryFormat.mass
    # Normalise every remaining format to boundary strings
    if fmt == BoundaryFormat.sets:
        pass
    elif fmt == BoundaryFormat.mass:
        string_a = boundary_string_from_masses(string_a)
        string_b = boundary_string_from_masses(string_b)
    elif fmt == BoundaryFormat.position:
        string_a = boundary_string_from_masses(
            convert_positions_to_masses(string_a))
        string_b = boundary_string_from_masses(
            convert_positions_to_masses(string_b))
    else:
        raise SegmentationMetricError('Unsupported boundary format')
    # Potential boundaries = positions * boundary types present
    boundary_types = identify_types(string_a, string_b)
    return len(string_a) * len(boundary_types)
def __boundary_statistics__(
        segs_a, segs_b, boundary_types, boundary_format, n_t, weight):
    '''
    Compute boundary similarity applying the weighting functions specified.

    :param segs_a: First segmentation (format given by ``boundary_format``).
    :param segs_b: Second segmentation (must normalise to the same length).
    :param boundary_types: NOTE(review): this argument is unconditionally
        overwritten below by ``identify_types(segs_a, segs_b)`` and is
        therefore ignored — confirm whether callers rely on passing it.
    :param boundary_format: One of the ``BoundaryFormat`` constants.
    :param n_t: Maximum transposition distance for the edit distance.
    :param weight: 3-tuple of weighting functions for (additions,
        substitutions, transpositions).
    :returns: dict of edit counts, matches, misses, and totals.
    :raises SegmentationMetricError: on an unsupported format or a length
        mismatch between the two segmentations.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Check format; normalise everything to boundary strings (set sequences)
    if boundary_format == BoundaryFormat.sets:
        pass  # Correct boundary format
    elif boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format == BoundaryFormat.position:
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    else:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check length
    if len(segs_a) != len(segs_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(segs_a), len(segs_b)))
    # Determine the boundary types (clobbers the parameter of the same name)
    boundary_types = identify_types(segs_a, segs_b)
    # Calculate the total pbs (potential boundaries)
    pbs = len(segs_b) * len(boundary_types)
    # Compute edits
    additions, substitutions, transpositions = \
        boundary_edit_distance(segs_a, segs_b, n_t=n_t)
    # Apply weighting functions
    fnc_weight_a, fnc_weight_s, fnc_weight_t = weight
    count_additions = fnc_weight_a(additions)
    count_substitutions = fnc_weight_s(substitutions,
                                       max(boundary_types),
                                       min(boundary_types))
    count_transpositions = fnc_weight_t(transpositions, n_t)
    count_edits = count_additions + count_substitutions + count_transpositions
    # Compute per-position matches, full misses, and the boundary total
    matches = list()
    full_misses = list()
    boundaries_all = 0
    for set_a, set_b in zip(segs_a, segs_b):
        matches.extend(set_a.intersection(set_b))
        full_misses.extend(set_a.symmetric_difference(set_b))
        boundaries_all += len(set_a) + len(set_b)
    return {'count_edits': count_edits, 'additions': additions,
            'substitutions': substitutions, 'transpositions': transpositions,
            'full_misses': full_misses, 'boundaries_all': boundaries_all,
            'matches': matches, 'pbs': pbs, 'boundary_types': boundary_types}
def __pk__(hypothesis, reference, window_size, one_minus, boundary_format,
           return_parts, fnc_round):
    '''
    Compute the Pk window-based segmentation penalty metric for a hypothesis
    segmentation against a reference segmentation.

    :param hypothesis: Hypothesis segmentation section labels sequence.
    :param reference: Reference segmentation section labels sequence.
    :param window_size: Window size slid over both segmentations; ``None``
        computes the standard window size from the reference.
    :param one_minus: Return ``1 - Pk`` so the value is no longer a penalty.
    :param boundary_format: One of the ``BoundaryFormat`` constants.
    :param return_parts: Return ``(sum_differences, measurements)`` instead
        of the final ratio.
    :param fnc_round: Rounding function used when computing the window size.
    :raises SegmentationMetricError: on unsupported format or length mismatch.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors
    if len(reference) != len(hypothesis):
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position length ({0} is not {1}).'.format(len(reference), len(hypothesis)))
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Slide a window over both position sequences and count disagreements
    sum_differences = 0
    measurements = 0
    for i in range(0, len(reference) - window_size):
        # Probe windows with k boundaries inside
        window_ref = reference[i:i + window_size + 1]
        window_hyp = hypothesis[i:i + window_size + 1]
        # BUGFIX: compare section labels by value (==), not identity (is);
        # ``is`` only happens to work for CPython's cached small ints and
        # silently breaks for labels outside the interning range.
        agree_ref = window_ref[0] == window_ref[-1]
        agree_hyp = window_hyp[0] == window_hyp[-1]
        # Count windows where the two segmentations disagree
        if agree_ref != agree_hyp:
            sum_differences += 1
        measurements += 1
    # Perform final division (0 when no window fits)
    value = Decimal(sum_differences) / measurements if measurements > 0 else 0
    if return_parts:
        return sum_differences, measurements
    elif one_minus:
        return Decimal('1.0') - value
    else:
        return value
def __pk__(hypothesis, reference, window_size, one_minus, boundary_format,
           return_parts, fnc_round):
    '''
    Compute the Pk window-based segmentation penalty metric for a hypothesis
    segmentation against a reference segmentation.

    :param hypothesis: Hypothesis segmentation section labels sequence.
    :param reference: Reference segmentation section labels sequence.
    :param window_size: Window size slid over both segmentations; ``None``
        computes the standard window size from the reference.
    :param one_minus: Return ``1 - Pk`` so the value is no longer a penalty.
    :param boundary_format: One of the ``BoundaryFormat`` constants.
    :param return_parts: Return ``(sum_differences, measurements)`` instead
        of the final ratio.
    :param fnc_round: Rounding function used when computing the window size.
    :raises SegmentationMetricError: on unsupported format or length mismatch.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors
    if len(reference) != len(hypothesis):
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position length ({0} is not {1}).'.format(len(reference), len(hypothesis)))
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Slide a window over both position sequences and count disagreements
    sum_differences = 0
    measurements = 0
    for i in range(0, len(reference) - window_size):
        # Probe windows with k boundaries inside
        window_ref = reference[i:i + window_size + 1]
        window_hyp = hypothesis[i:i + window_size + 1]
        # BUGFIX: compare section labels by value (==), not identity (is);
        # ``is`` only happens to work for CPython's cached small ints and
        # silently breaks for labels outside the interning range.
        agree_ref = window_ref[0] == window_ref[-1]
        agree_hyp = window_hyp[0] == window_hyp[-1]
        # Count windows where the two segmentations disagree
        if agree_ref != agree_hyp:
            sum_differences += 1
        measurements += 1
    # Perform final division (0 when no window fits)
    value = Decimal(sum_differences) / measurements if measurements > 0 else 0
    if return_parts:
        return sum_differences, measurements
    elif one_minus:
        return Decimal('1.0') - value
    else:
        return value
def __boundaries__(segmentation, **kwargs):
    '''
    Total number of boundaries in a segmentation, counted after normalising
    the input (format named by ``kwargs['boundary_format']``) to a boundary
    string — a sequence of per-position boundary-type sets.

    :raises SegmentationMetricError: if the boundary format is unsupported.
    '''
    fmt = kwargs['boundary_format']
    string = segmentation
    # NLTK-style strings are first normalised to masses
    if fmt == BoundaryFormat.nltk:
        string = convert_nltk_to_masses(segmentation)
        fmt = BoundaryFormat.mass
    # Normalise the remaining formats to a boundary string
    if fmt == BoundaryFormat.sets:
        pass
    elif fmt == BoundaryFormat.mass:
        string = boundary_string_from_masses(string)
    elif fmt == BoundaryFormat.position:
        string = boundary_string_from_masses(
            convert_positions_to_masses(string))
    else:
        raise SegmentationMetricError('Unsupported boundary format')
    # One set of boundary types per potential boundary position
    return sum(len(position) for position in string)
def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format,
                            n_t, weight):
    '''
    Compute boundary similarity applying the weighting functions specified.

    Normalises both segmentations to boundary strings, computes the boundary
    edit distance, weights each edit class, and gathers match/miss totals.

    :raises SegmentationMetricError: on an unsupported boundary format or a
        length mismatch between the two segmentations.
    '''
    # NLTK-format input is first normalised to masses
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Positions are normalised to masses as well
    if boundary_format == BoundaryFormat.position:
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Masses become boundary strings; sets are already correct
    if boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Both boundary strings must cover the same number of positions
    if len(segs_a) != len(segs_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(segs_a), len(segs_b)))
    # The boundary_types argument is recomputed from the actual data
    boundary_types = identify_types(segs_a, segs_b)
    # Total potential boundaries
    pbs = len(segs_b) * len(boundary_types)
    # Boundary edit distance between the two strings
    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)
    # Weight each edit class and sum
    fnc_weight_a, fnc_weight_s, fnc_weight_t = weight
    count_edits = (fnc_weight_a(additions) +
                   fnc_weight_s(substitutions,
                                max(boundary_types), min(boundary_types)) +
                   fnc_weight_t(transpositions, n_t))
    # Gather per-position matches, full misses, and the boundary total
    matches = []
    full_misses = []
    boundaries_all = 0
    for set_a, set_b in zip(segs_a, segs_b):
        matches.extend(set_a & set_b)
        full_misses.extend(set_a ^ set_b)
        boundaries_all += len(set_a) + len(set_b)
    return {
        'count_edits': count_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': full_misses,
        'boundaries_all': boundaries_all,
        'matches': matches,
        'pbs': pbs,
        'boundary_types': boundary_types,
    }
def test_convert_nltk_to_masses_pk_long(self):
    '''
    Longer NLTK-style boundary strings with multiple internal boundaries.

    BUGFIX: the previous docstring claimed these segmentations start with a
    boundary, but both inputs begin with '0' (no leading boundary).
    '''
    self.assertEqual(convert_nltk_to_masses('0100100000'), (2, 3, 6))
    self.assertEqual(convert_nltk_to_masses('0101000000'), (2, 2, 7))
def test_convert_nltk_to_masses_pk_ab(self):
    '''
    Short NLTK-style boundary strings: one with a leading boundary ('100')
    and one with a medial boundary ('010').
    '''
    cases = (('100', (1, 3)),
             ('010', (2, 2)))
    for nltk_string, expected_masses in cases:
        self.assertEqual(convert_nltk_to_masses(nltk_string), expected_masses)
def __window_diff__(hypothesis, reference, window_size, one_minus,
                    boundary_format, return_parts, fnc_round,
                    lamprier_et_al_2007_fix):
    '''
    Calculates the WindowDiff segmentation evaluation metric score for a
    hypothetical segmentation against a reference segmentation for a given
    window size.  The standard method of calculating the window size is
    performed when a window size is not specified.

    :param hypothesis: Hypothesis segmentation section labels sequence.
    :param reference: Reference segmentation section labels sequence.
    :param window_size: The size of the window that is slid over the two
                        segmentations used to count mismatches (default is
                        None and will use the average window size)
    :param one_minus: Return 1-WindowDiff to make it no longer a
                      penalty-metric.
    :param boundary_format: Format of the input segmentations (BUGFIX: the
                            previous docstring documented a nonexistent
                            ``convert_from_masses`` parameter instead).
    :param return_parts: Return ``(sum_differences, denominator)`` instead
                         of the final ratio.
    :param fnc_round: Rounding function used when computing the window size.
    :param lamprier_et_al_2007_fix: Apply a fix for improperly counted errors
                                    at the beginning and end of
                                    segmentations, provided by
                                    _[LamprierEtAl2007].
    :type hypothesis: list
    :type reference: list
    :type window_size: int
    :type one_minus: bool
    :type lamprier_et_al_2007_fix: bool

    .. note:: See :func:`segeval.convert_masses_to_positions` for an example
              of the input format.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors
    if len(reference) != len(hypothesis):
        # BUGFIX: the original message embedded a line continuation inside
        # the string literal, yielding a run of spaces mid-sentence.
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position '
            'length (%(ref)i is not %(hyp)i).'
            % {'ref': len(reference), 'hyp': len(hypothesis)})
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Pair up units from each segmentation to slide a window over
    units_ref_hyp = __create_paired_window__(hypothesis, reference,
                                             window_size,
                                             lamprier_et_al_2007_fix)[0]
    # Slide window over and sum the number of varying windows
    sum_differences = 0
    measurements = len(units_ref_hyp) - window_size
    for i in range(0, measurements):
        window = units_ref_hyp[i:i + window_size + 1]
        ref_boundaries = 0
        hyp_boundaries = 0
        # Check that the number of loops is correct
        assert len(window) == window_size + 1
        # For pair in window
        for j in range(0, len(window) - 1):
            ref_part, hyp_part = zip(*window[j:j + 2])
            # BUGFIX: compare section labels by value (!=), not identity
            # (``is not``) — identity only works for CPython's cached small
            # ints and silently misbehaves for labels outside that range.
            if ref_part[0] != ref_part[1]:
                ref_boundaries += 1
            if hyp_part[0] != hyp_part[1]:
                hyp_boundaries += 1
        # BUGFIX: integer counts compared by value, not identity
        if ref_boundaries != hyp_boundaries:
            sum_differences += 1
    # Perform final division
    n = sum(convert_positions_to_masses(reference))
    denominator = n - window_size
    if lamprier_et_al_2007_fix:
        denominator = measurements + 1
    win_diff = Decimal(sum_differences) / denominator
    # Check normalization
    assert denominator == measurements or lamprier_et_al_2007_fix
    # Check value
    assert win_diff <= 1
    if not one_minus:
        if return_parts:
            return sum_differences, denominator
        else:
            return win_diff
    else:
        return Decimal('1.0') - win_diff
def __window_diff__(hypothesis, reference, window_size, one_minus,
                    boundary_format, return_parts, fnc_round,
                    lamprier_et_al_2007_fix):
    '''
    Calculates the WindowDiff segmentation evaluation metric score for a
    hypothetical segmentation against a reference segmentation for a given
    window size.  The standard method of calculating the window size is
    performed when a window size is not specified.

    :param hypothesis: Hypothesis segmentation section labels sequence.
    :param reference: Reference segmentation section labels sequence.
    :param window_size: The size of the window that is slid over the two
                        segmentations used to count mismatches (default is
                        None and will use the average window size)
    :param one_minus: Return 1-WindowDiff to make it no longer a
                      penalty-metric.
    :param boundary_format: Format of the input segmentations (BUGFIX: the
                            previous docstring documented a nonexistent
                            ``convert_from_masses`` parameter instead).
    :param return_parts: Return ``(sum_differences, denominator)`` instead
                         of the final ratio.
    :param fnc_round: Rounding function used when computing the window size.
    :param lamprier_et_al_2007_fix: Apply a fix for improperly counted errors
                                    at the beginning and end of
                                    segmentations, provided by
                                    _[LamprierEtAl2007].
    :type hypothesis: list
    :type reference: list
    :type window_size: int
    :type one_minus: bool
    :type lamprier_et_al_2007_fix: bool

    .. note:: See :func:`segeval.convert_masses_to_positions` for an example
              of the input format.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors
    if len(reference) != len(hypothesis):
        # BUGFIX: the original message embedded a line continuation inside
        # the string literal, yielding a run of spaces mid-sentence.
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position '
            'length (%(ref)i is not %(hyp)i).'
            % {'ref': len(reference), 'hyp': len(hypothesis)})
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Pair up units from each segmentation to slide a window over
    units_ref_hyp = __create_paired_window__(hypothesis, reference,
                                             window_size,
                                             lamprier_et_al_2007_fix)[0]
    # Slide window over and sum the number of varying windows
    sum_differences = 0
    measurements = len(units_ref_hyp) - window_size
    for i in range(0, measurements):
        window = units_ref_hyp[i:i + window_size + 1]
        ref_boundaries = 0
        hyp_boundaries = 0
        # BUGFIX: the loop-count sanity check used ``is`` on a computed int
        # (``len(window) is window_size + 1``); use value equality.
        assert len(window) == window_size + 1
        # For pair in window
        for j in range(0, len(window) - 1):
            ref_part, hyp_part = zip(*window[j:j + 2])
            # BUGFIX: compare section labels by value (!=), not identity
            # (``is not``) — identity only works for CPython's cached small
            # ints and silently misbehaves for labels outside that range.
            if ref_part[0] != ref_part[1]:
                ref_boundaries += 1
            if hyp_part[0] != hyp_part[1]:
                hyp_boundaries += 1
        # BUGFIX: integer counts compared by value, not identity
        if ref_boundaries != hyp_boundaries:
            sum_differences += 1
    # Perform final division
    n = sum(convert_positions_to_masses(reference))
    denominator = n - window_size
    if lamprier_et_al_2007_fix:
        denominator = measurements + 1
    win_diff = Decimal(sum_differences) / denominator
    # Check normalization
    assert denominator == measurements or lamprier_et_al_2007_fix
    # Check value
    assert win_diff <= 1
    if not one_minus:
        if return_parts:
            return sum_differences, denominator
        else:
            return win_diff
    else:
        return Decimal('1.0') - win_diff