def __potential_boundaries__(segmentation_a, segmentation_b, **kwargs):
    '''
    Count the potential boundaries between a pair of segmentations: the
    number of potential boundary positions multiplied by the number of
    boundary types observed in either segmentation.

    :param segmentation_a: First segmentation.
    :param segmentation_b: Second segmentation.
    :param kwargs: Must contain ``boundary_format`` identifying the format
                   of the two segmentations.
    :returns: Number of potential boundaries (int).
    :raises SegmentationMetricError: If the boundary format is unsupported.
    '''
    fmt = kwargs['boundary_format']
    string_a, string_b = segmentation_a, segmentation_b
    # NLTK-style segmentations are first normalised to masses
    if fmt == BoundaryFormat.nltk:
        string_a = convert_nltk_to_masses(segmentation_a)
        string_b = convert_nltk_to_masses(segmentation_b)
        fmt = BoundaryFormat.mass
    # Positions are normalised to masses, then fall through to the mass path
    if fmt == BoundaryFormat.position:
        string_a = convert_positions_to_masses(string_a)
        string_b = convert_positions_to_masses(string_b)
        fmt = BoundaryFormat.mass
    if fmt == BoundaryFormat.mass:
        string_a = boundary_string_from_masses(string_a)
        string_b = boundary_string_from_masses(string_b)
    elif fmt != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Each position may hold one boundary of each observed type
    boundary_types = identify_types(string_a, string_b)
    return len(string_a) * len(boundary_types)
def __boundary_statistics__(
        segs_a, segs_b, boundary_types, boundary_format, n_t, weight):
    '''
    Compute boundary-edit-distance statistics between two segmentations,
    applying the weighting functions specified.

    :param segs_a: First segmentation.
    :param segs_b: Second segmentation (same length as ``segs_a``).
    :param boundary_types: Boundary types; NOTE(review): this parameter is
                           ignored and recomputed below from the two
                           boundary strings — confirm intended.
    :param boundary_format: Format of the segmentations provided.
    :param n_t: Maximum transposition spanning distance.
    :param weight: Triple of weighting functions for additions,
                   substitutions, and transpositions.
    :returns: Dict of edit counts, matches, misses, and potential
              boundaries (``pbs``).
    :raises SegmentationMetricError: On unsupported format or length
                                     mismatch.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Check format; all paths normalise to boundary-set strings
    if boundary_format == BoundaryFormat.sets:
        pass  # Correct boundary format
    elif boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format == BoundaryFormat.position:
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    else:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check length; boundary strings must align position by position
    if len(segs_a) != len(segs_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(segs_a), len(segs_b)))
    # Determine the boundary types (shadows the parameter of the same name)
    boundary_types = identify_types(segs_a, segs_b)
    # Calculate the total pbs (potential boundaries)
    pbs = len(segs_b) * len(boundary_types)
    # Compute edits
    additions, substitutions, transpositions = \
        boundary_edit_distance(segs_a, segs_b, n_t=n_t)
    # Apply weighting functions
    fnc_weight_a, fnc_weight_s, fnc_weight_t = weight
    count_additions = fnc_weight_a(additions)
    count_substitutions = fnc_weight_s(substitutions,
                                       max(boundary_types),
                                       min(boundary_types))
    count_transpositions = fnc_weight_t(transpositions, n_t)
    count_edits = count_additions + count_substitutions + count_transpositions
    # Compute per-position matches, full misses, and the total number of
    # boundaries placed across both segmentations
    matches = list()
    full_misses = list()
    boundaries_all = 0
    for set_a, set_b in zip(segs_a, segs_b):
        matches.extend(set_a.intersection(set_b))
        full_misses.extend(set_a.symmetric_difference(set_b))
        boundaries_all += len(set_a) + len(set_b)
    return {'count_edits': count_edits,
            'additions': additions,
            'substitutions': substitutions,
            'transpositions': transpositions,
            'full_misses': full_misses,
            'boundaries_all': boundaries_all,
            'matches': matches,
            'pbs': pbs,
            'boundary_types': boundary_types}
def __boundaries__(segmentation, **kwargs):
    '''
    Count the total number of boundaries placed in a single segmentation.

    :param segmentation: A segmentation in any supported boundary format.
    :param kwargs: Must contain ``boundary_format`` identifying the format
                   of the segmentation.
    :returns: Total boundary count (int).
    :raises SegmentationMetricError: If the boundary format is unsupported.
    '''
    fmt = kwargs['boundary_format']
    string = segmentation
    # NLTK-style segmentations are first normalised to masses
    if fmt == BoundaryFormat.nltk:
        string = convert_nltk_to_masses(segmentation)
        fmt = BoundaryFormat.mass
    # Positions are normalised to masses, then fall through to the mass path
    if fmt == BoundaryFormat.position:
        string = convert_positions_to_masses(string)
        fmt = BoundaryFormat.mass
    if fmt == BoundaryFormat.mass:
        string = boundary_string_from_masses(string)
    elif fmt != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Sum the boundaries present at each potential boundary position
    return sum(len(position) for position in string)
def input_linear_positions_tsv(filepath, delimiter=DEFAULT_DELIMITER):
    '''
    Load segment position codings from a TSV file and return them as
    segmentation masses in a :class:`Dataset`.

    :param filepath: path to the mass file containing segment position
                     codings.
    :param delimiter: the delimiter used when reading a TSV file (by
                      default, a tab, but it can also be a comma,
                      whitespace, etc.
    :type filepath: str
    :type delimiter: str

    .. deprecated:: 1.0

    .. warning:: This I/O function is for legacy files only and will be
        removed in later versions.
    '''
    dataset = input_linear_mass_tsv(filepath, delimiter)
    # Replace each coder's position coding with its mass equivalent in place
    for item in dataset:
        for coder in dataset[item]:
            positions = dataset[item][coder]
            dataset[item][coder] = convert_positions_to_masses(positions)
    return dataset
def __compute_window_size__(reference, fnc_round, boundary_format):
    '''
    Compute a window size from a reference segmentation (or collection of
    segmentations): half of the mean segment mass, rounded, never below 2.

    :param reference: A segmentation as a list of masses/positions, or a
                      (possibly nested) dict of coders' segmentations.
    :param fnc_round: Rounding function applied to the computed average
                      (e.g. :func:`round`).
    :param boundary_format: Format of ``reference``; positions are
                            converted to masses before averaging.
    :type reference: dict or list
    :returns: Window size (int >= 2).
    :raises SegmentationMetricError: If ``reference`` is neither dict-like
                                     nor list-like.
    '''
    all_masses = list()

    # Define fnc
    def __list_coder_masses__(inner_coder_masses):
        '''
        Recursively collect all masses into ``all_masses``.

        :param inner_coder_masses: Either a dict of dicts, or dict of a
                                   list of masses.
        :type inner_coder_masses: dict or list
        '''
        if hasattr(inner_coder_masses, 'items'):
            for cur_inner_coder_masses in inner_coder_masses.values():
                __list_coder_masses__(cur_inner_coder_masses)
        elif hasattr(inner_coder_masses, '__iter__') and not isinstance(inner_coder_masses, str):
            all_masses.extend(inner_coder_masses)
        else:
            # Implicit string concatenation instead of a backslash
            # continuation inside the literal, which embedded raw
            # indentation whitespace into the error message
            raise SegmentationMetricError(
                'Expected either a dict-like collection of segmentations '
                'or a segmentation as a list-like object')

    if boundary_format == BoundaryFormat.position:
        reference = convert_positions_to_masses(reference)
    # Recurse and list all masses
    __list_coder_masses__(reference)
    # Convert to Decimal for exact arithmetic
    all_masses = [Decimal(mass) for mass in all_masses]
    # Window size is half of the mean segment mass, rounded
    avg = mean(all_masses) / Decimal('2')
    window_size = int(fnc_round(avg))
    # Enforce a minimum window size of 2
    return window_size if window_size > 1 else 2
def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format,
                            n_t, weight):
    '''
    Compute boundary similarity applying the weighting functions specified.

    :param segs_a: First segmentation.
    :param segs_b: Second segmentation (same length as ``segs_a``).
    :param boundary_types: Boundary types (recomputed internally).
    :param boundary_format: Format of the segmentations provided.
    :param n_t: Maximum transposition spanning distance.
    :param weight: Triple of weighting functions for additions,
                   substitutions, and transpositions.
    :returns: Dict of edit counts, matches, misses, and potential
              boundaries.
    :raises SegmentationMetricError: On unsupported format or length
                                     mismatch.
    '''
    # Normalise every supported input format to boundary-set strings
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    if boundary_format == BoundaryFormat.position:
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    if boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Boundary strings must align position by position
    if len(segs_a) != len(segs_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(segs_a), len(segs_b)))
    # Boundary types present in either segmentation
    boundary_types = identify_types(segs_a, segs_b)
    # Total number of potential boundaries
    pbs = len(segs_b) * len(boundary_types)
    # Edit operations between the two boundary strings
    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)
    # Weight each edit class and total them
    fnc_weight_a, fnc_weight_s, fnc_weight_t = weight
    count_edits = (fnc_weight_a(additions) +
                   fnc_weight_s(substitutions,
                                max(boundary_types), min(boundary_types)) +
                   fnc_weight_t(transpositions, n_t))
    # Tally per-position matches, full misses, and placed boundaries
    matches = []
    full_misses = []
    boundaries_all = 0
    for position_a, position_b in zip(segs_a, segs_b):
        matches.extend(position_a.intersection(position_b))
        full_misses.extend(position_a.symmetric_difference(position_b))
        boundaries_all += len(position_a) + len(position_b)
    return {
        'count_edits': count_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': full_misses,
        'boundaries_all': boundaries_all,
        'matches': matches,
        'pbs': pbs,
        'boundary_types': boundary_types,
    }
def test_convert_positions_to_masses_all(self):
    '''
    Test segment position sequence conversion to masses when every unit
    begins a new segment (docstring previously misstated the direction
    as "from masses").
    '''
    self.assertEqual((1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
                     convert_positions_to_masses(
                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))
def test_convert_positions_to_masses_none(self):
    '''
    Test segment position sequence conversion to masses when no internal
    boundaries are present, i.e. a single segment (docstring previously
    misstated the direction as "from masses").
    '''
    self.assertEqual((11,),
                     convert_positions_to_masses(
                         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
def test_convert_positions_to_masses(self):
    '''
    Test segment position sequence conversion to masses.
    '''
    # Three segments of lengths 5, 3, and 5
    positions = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]
    self.assertEqual((5, 3, 5), convert_positions_to_masses(positions))
def __window_diff__(hypothesis, reference, window_size, one_minus,
                    boundary_format, return_parts, fnc_round,
                    lamprier_et_al_2007_fix):
    '''
    Calculates the WindowDiff segmentation evaluation metric score for a
    hypothetical segmentation against a reference segmentation for a given
    window size.  The standard method of calculating the window size is
    performed if a window size is not specified.

    :param hypothesis: Hypothesis segmentation section labels sequence.
    :param reference: Reference segmentation section labels sequence.
    :param window_size: The size of the window that is slid over \
                        the two segmentations used to count \
                        mismatches (default is None and will \
                        use the average window size)
    :param one_minus: Return 1-WindowDiff to make it no longer \
                      a penalty-metric.
    :param boundary_format: Format of the segmentations provided \
                            (NLTK, mass, or position).
    :param return_parts: Return the numerator and denominator \
                         instead of the final score.
    :param fnc_round: Rounding function used when computing the \
                      default window size.
    :param lamprier_et_al_2007_fix: Apply a fix for improperly counted errors \
                                    at the beginning and end of \
                                    segmentations, provided by \
                                    _[LamprierEtAl2007].
    :type hypothesis: list
    :type reference: list
    :type window_size: int
    :type one_minus: bool
    :type lamprier_et_al_2007_fix: bool

    .. note:: See :func:`segeval.convert_masses_to_positions` for an example
              of the input format.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors (implicit concatenation replaces a backslash
    # continuation that embedded raw indentation into the message)
    if len(reference) != len(hypothesis):
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position '
            'length (%(ref)i is not %(hyp)i).'
            % {'ref': len(reference), 'hyp': len(hypothesis)})
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Create a set of pairs of units from each segmentation to go over using a
    # window
    units_ref_hyp = __create_paired_window__(hypothesis, reference,
                                             window_size,
                                             lamprier_et_al_2007_fix)[0]
    # Slide window over and sum the number of varying windows
    sum_differences = 0
    measurements = len(units_ref_hyp) - window_size
    for i in range(0, measurements):
        window = units_ref_hyp[i:i + window_size + 1]
        ref_boundaries = 0
        hyp_boundaries = 0
        # Check that the number of loops is correct
        assert len(window) == window_size + 1
        # For pair in window
        for j in range(0, len(window) - 1):
            ref_part, hyp_part = zip(*window[j:j + 2])
            # Use value equality (!=) rather than identity (`is not`):
            # identity only works for interned small ints and fails for
            # arbitrary section labels
            # Boundary exists in the reference segmentation
            if ref_part[0] != ref_part[1]:
                ref_boundaries += 1
            # Boundary exists in the hypothesis segmentation
            if hyp_part[0] != hyp_part[1]:
                hyp_boundaries += 1
        # If the number of boundaries per segmentation in the window differs
        if ref_boundaries != hyp_boundaries:
            sum_differences += 1
    # Perform final division
    n = sum(convert_positions_to_masses(reference))
    denominator = n - window_size
    if lamprier_et_al_2007_fix:
        denominator = measurements + 1
    win_diff = Decimal(sum_differences) / denominator
    # Check normalization
    assert denominator == measurements or lamprier_et_al_2007_fix
    # Check value
    assert win_diff <= 1
    if not one_minus:
        if return_parts:
            return sum_differences, denominator
        else:
            return win_diff
    else:
        return Decimal('1.0') - win_diff
def __window_diff__(hypothesis, reference, window_size, one_minus,
                    boundary_format, return_parts, fnc_round,
                    lamprier_et_al_2007_fix):
    '''
    Calculates the WindowDiff segmentation evaluation metric score for a
    hypothetical segmentation against a reference segmentation for a given
    window size.  The standard method of calculating the window size is
    performed if a window size is not specified.

    :param hypothesis: Hypothesis segmentation section labels sequence.
    :param reference: Reference segmentation section labels sequence.
    :param window_size: The size of the window that is slid over \
                        the two segmentations used to count \
                        mismatches (default is None and will \
                        use the average window size)
    :param one_minus: Return 1-WindowDiff to make it no longer \
                      a penalty-metric.
    :param boundary_format: Format of the segmentations provided \
                            (NLTK, mass, or position).
    :param return_parts: Return the numerator and denominator \
                         instead of the final score.
    :param fnc_round: Rounding function used when computing the \
                      default window size.
    :param lamprier_et_al_2007_fix: Apply a fix for improperly counted errors \
                                    at the beginning and end of \
                                    segmentations, provided by \
                                    _[LamprierEtAl2007].
    :type hypothesis: list
    :type reference: list
    :type window_size: int
    :type one_minus: bool
    :type lamprier_et_al_2007_fix: bool

    .. note:: See :func:`segeval.convert_masses_to_positions` for an example
              of the input format.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors (implicit concatenation replaces a backslash
    # continuation that embedded raw indentation into the message)
    if len(reference) != len(hypothesis):
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position '
            'length (%(ref)i is not %(hyp)i).'
            % {'ref': len(reference), 'hyp': len(hypothesis)})
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Create a set of pairs of units from each segmentation to go over using a
    # window
    units_ref_hyp = __create_paired_window__(hypothesis, reference,
                                             window_size,
                                             lamprier_et_al_2007_fix)[0]
    # Slide window over and sum the number of varying windows
    sum_differences = 0
    measurements = len(units_ref_hyp) - window_size
    for i in range(0, measurements):
        window = units_ref_hyp[i:i + window_size + 1]
        ref_boundaries = 0
        hyp_boundaries = 0
        # Check that the number of loops is correct (value equality, not
        # the identity comparison `is` the original used)
        assert len(window) == window_size + 1
        # For pair in window
        for j in range(0, len(window) - 1):
            ref_part, hyp_part = zip(*window[j:j + 2])
            # Use value equality (!=) rather than identity (`is not`):
            # identity only works for interned small ints and fails for
            # arbitrary section labels
            # Boundary exists in the reference segmentation
            if ref_part[0] != ref_part[1]:
                ref_boundaries += 1
            # Boundary exists in the hypothesis segmentation
            if hyp_part[0] != hyp_part[1]:
                hyp_boundaries += 1
        # If the number of boundaries per segmentation in the window differs
        if ref_boundaries != hyp_boundaries:
            sum_differences += 1
    # Perform final division
    n = sum(convert_positions_to_masses(reference))
    denominator = n - window_size
    if lamprier_et_al_2007_fix:
        denominator = measurements + 1
    win_diff = Decimal(sum_differences) / denominator
    # Check normalization
    assert denominator == measurements or lamprier_et_al_2007_fix
    # Check value
    assert win_diff <= 1
    if not one_minus:
        if return_parts:
            return sum_differences, denominator
        else:
            return win_diff
    else:
        return Decimal('1.0') - win_diff