def __potential_boundaries__(segmentation_a, segmentation_b, **kwargs):
    '''
    Count the potential boundaries between a pair of segmentations.

    Both segmentations are converted into boundary-string (sets) form, and
    the result is the number of potential boundary positions multiplied by
    the number of boundary types observed across the pair.

    :param segmentation_a: segmentation in the format named by
        ``kwargs['boundary_format']``
    :param segmentation_b: segmentation in the same format as
        ``segmentation_a``
    :param kwargs: must contain ``boundary_format``, a ``BoundaryFormat``
        value (``nltk``, ``sets``, ``mass``, or ``position``)
    :returns: ``len(boundary_string) * len(boundary_types)``
    :raises SegmentationMetricError: if the boundary format is unsupported,
        or if the converted segmentations differ in length
    '''
    boundary_format = kwargs['boundary_format']
    boundary_string_a = segmentation_a
    boundary_string_b = segmentation_b
    # Convert from NLTK types to masses first, then fall through to the
    # mass-format handling below
    if boundary_format == BoundaryFormat.nltk:
        boundary_string_a = convert_nltk_to_masses(segmentation_a)
        boundary_string_b = convert_nltk_to_masses(segmentation_b)
        boundary_format = BoundaryFormat.mass
    # Check format and normalise everything to boundary strings
    if boundary_format == BoundaryFormat.sets:
        pass  # already boundary strings
    elif boundary_format == BoundaryFormat.mass:
        boundary_string_a = boundary_string_from_masses(boundary_string_a)
        boundary_string_b = boundary_string_from_masses(boundary_string_b)
    elif boundary_format == BoundaryFormat.position:
        boundary_string_a = convert_positions_to_masses(boundary_string_a)
        boundary_string_b = convert_positions_to_masses(boundary_string_b)
        boundary_string_a = boundary_string_from_masses(boundary_string_a)
        boundary_string_b = boundary_string_from_masses(boundary_string_b)
    else:
        raise SegmentationMetricError('Unsupported boundary format')
    # Fix: check length consistency, mirroring __boundary_statistics__, so
    # that a mismatched pair fails loudly instead of silently yielding a
    # misleading potential-boundary count
    if len(boundary_string_a) != len(boundary_string_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(boundary_string_a), len(boundary_string_b)))
    # Determine the boundary types present across both segmentations
    boundary_types = identify_types(boundary_string_a, boundary_string_b)
    return len(boundary_string_a) * len(boundary_types)
def __boundary_statistics__(
        segs_a, segs_b, boundary_types, boundary_format, n_t, weight):
    '''
    Compute boundary similarity applying the weighting functions specified.
    '''
    # Normalise NLTK input to masses, then handle it as mass format below
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Position format is reduced to masses first, then both fall through to
    # the shared mass-to-boundary-string conversion
    if boundary_format == BoundaryFormat.position:
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    if boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Both boundary strings must cover the same number of positions
    if len(segs_a) != len(segs_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(segs_a), len(segs_b)))
    # NOTE(review): the boundary_types parameter is never read — it is
    # unconditionally recomputed here; confirm whether callers rely on that
    boundary_types = identify_types(segs_a, segs_b)
    # Total number of potential boundaries
    pbs = len(segs_b) * len(boundary_types)
    # Edit operations between the two boundary strings
    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)
    # Unpack the per-edit-class weighting functions and combine their counts
    fnc_weight_a, fnc_weight_s, fnc_weight_t = weight
    count_edits = (
        fnc_weight_a(additions) +
        fnc_weight_s(substitutions,
                     max(boundary_types), min(boundary_types)) +
        fnc_weight_t(transpositions, n_t))
    # Tally exact matches, full misses, and the overall boundary count,
    # position by position
    matches = []
    full_misses = []
    boundaries_all = 0
    for position_a, position_b in zip(segs_a, segs_b):
        matches.extend(position_a & position_b)
        full_misses.extend(position_a ^ position_b)
        boundaries_all += len(position_a) + len(position_b)
    return {'count_edits': count_edits,
            'additions': additions,
            'substitutions': substitutions,
            'transpositions': transpositions,
            'full_misses': full_misses,
            'boundaries_all': boundaries_all,
            'matches': matches,
            'pbs': pbs,
            'boundary_types': boundary_types}
def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format,
                            n_t, weight):
    '''
    Compute boundary similarity applying the weighting functions specified.

    :param segs_a: first segmentation, in the format named by
        ``boundary_format``
    :param segs_b: second segmentation, in the same format as ``segs_a``
    :param boundary_types: ignored — recomputed below from the two
        segmentations
    :param boundary_format: a ``BoundaryFormat`` value (``nltk``, ``sets``,
        ``mass``, or ``position``)
    :param n_t: maximum transposition distance passed to
        ``boundary_edit_distance``
    :param weight: 3-tuple of weighting functions applied to additions,
        substitutions, and transpositions respectively
    :returns: dict of edit counts, matches, misses, and totals
    :raises SegmentationMetricError: on an unsupported format or a length
        mismatch between the two segmentations
    '''
    # NOTE(review): duplicate definition — an identical
    # __boundary_statistics__ appears earlier in this file, and this later
    # definition shadows it at import time. Confirm which copy is intended
    # and remove the other.
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Check format
    if boundary_format == BoundaryFormat.sets:
        pass  # Correct boundary format
    elif boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format == BoundaryFormat.position:
        # Position format is reduced to masses, then to boundary strings
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    else:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check length
    if len(segs_a) != len(segs_b):
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                len(segs_a), len(segs_b)))
    # Determine the boundary types (overwrites the parameter of the same
    # name — see note in the parameter docs above)
    boundary_types = identify_types(segs_a, segs_b)
    # Calculate the total pbs (potential boundaries)
    pbs = len(segs_b) * len(boundary_types)
    # Compute edits
    additions, substitutions, transpositions = \
        boundary_edit_distance(segs_a, segs_b, n_t=n_t)
    # Apply weighting functions: one per edit class
    fnc_weight_a, fnc_weight_s, fnc_weight_t = weight
    count_additions = fnc_weight_a(additions)
    count_substitutions = fnc_weight_s(substitutions,
                                       max(boundary_types),
                                       min(boundary_types))
    count_transpositions = fnc_weight_t(transpositions, n_t)
    count_edits = count_additions + count_substitutions + count_transpositions
    # Compute per-position matches, full misses, and the total boundary count
    matches = list()
    full_misses = list()
    boundaries_all = 0
    for set_a, set_b in zip(segs_a, segs_b):
        matches.extend(set_a.intersection(set_b))
        full_misses.extend(set_a.symmetric_difference(set_b))
        boundaries_all += len(set_a) + len(set_b)
    return {
        'count_edits': count_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': full_misses,
        'boundaries_all': boundaries_all,
        'matches': matches,
        'pbs': pbs,
        'boundary_types': boundary_types
    }