예제 #1
0
def __potential_boundaries__(segmentation_a, segmentation_b, **kwargs):
    '''
    Count the potential boundaries for a pair of segmentations.

    Both segmentations are normalised to boundary strings; the result is the
    length of the first boundary string multiplied by the number of boundary
    types reported by ``identify_types`` for the pair.

    :param segmentation_a: First segmentation, encoded as ``boundary_format``.
    :param segmentation_b: Second segmentation, encoded as ``boundary_format``.
    :param kwargs: Must contain ``boundary_format``, one of the
        ``BoundaryFormat`` values ``nltk``, ``sets``, ``mass`` or ``position``.
    :raises SegmentationMetricError: if the boundary format is not supported.
    '''
    fmt = kwargs['boundary_format']
    string_a, string_b = segmentation_a, segmentation_b
    # NLTK input is reduced to masses and handled as masses from here on
    if fmt == BoundaryFormat.nltk:
        string_a = convert_nltk_to_masses(segmentation_a)
        string_b = convert_nltk_to_masses(segmentation_b)
        fmt = BoundaryFormat.mass
    # Boundary strings (sets) are used as given; everything else is routed
    # through masses before being turned into a boundary string
    if fmt != BoundaryFormat.sets:
        if fmt == BoundaryFormat.mass:
            pass
        elif fmt == BoundaryFormat.position:
            string_a = convert_positions_to_masses(string_a)
            string_b = convert_positions_to_masses(string_b)
        else:
            raise SegmentationMetricError('Unsupported boundary format')
        string_a = boundary_string_from_masses(string_a)
        string_b = boundary_string_from_masses(string_b)
    types_found = identify_types(string_a, string_b)
    return len(string_a) * len(types_found)
예제 #2
0
def __boundary_statistics__(
        segs_a, segs_b, boundary_types, boundary_format, n_t, weight):
    '''
    Collect the edit counts and boundary tallies for a pair of segmentations.

    Both segmentations are normalised to boundary strings, the boundary edit
    distance between them is computed, and the supplied weighting functions
    are applied to the resulting edits.

    :param segs_a: First segmentation, encoded as ``boundary_format``.
    :param segs_b: Second segmentation, encoded as ``boundary_format``.
    :param boundary_types: Not consulted; the boundary types are always
        re-derived from the two segmentations with ``identify_types``.
    :param boundary_format: One of the ``BoundaryFormat`` values ``nltk``,
        ``sets``, ``mass`` or ``position``.
    :param n_t: Forwarded to ``boundary_edit_distance`` and to the
        transposition weighting function.
    :param weight: Three weighting functions, in the order additions,
        substitutions, transpositions.
    :returns: dict with the keys ``count_edits``, ``additions``,
        ``substitutions``, ``transpositions``, ``full_misses``,
        ``boundaries_all``, ``matches``, ``pbs`` and ``boundary_types``.
    :raises SegmentationMetricError: if the boundary format is unsupported or
        the two boundary strings differ in length.
    '''
    # NLTK segmentations are reduced to masses and then handled as masses
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Anything that is not already a boundary string goes through masses
    if boundary_format != BoundaryFormat.sets:
        if boundary_format == BoundaryFormat.mass:
            pass
        elif boundary_format == BoundaryFormat.position:
            segs_a = convert_positions_to_masses(segs_a)
            segs_b = convert_positions_to_masses(segs_b)
        else:
            raise SegmentationMetricError('Unsupported boundary format')
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    # The two boundary strings must line up position by position
    length_a, length_b = len(segs_a), len(segs_b)
    if length_a != length_b:
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                length_a, length_b))
    # The boundary types are derived from the data, replacing the argument
    boundary_types = identify_types(segs_a, segs_b)
    # Total potential boundaries
    pbs = len(segs_b) * len(boundary_types)
    # Raw edits between the two boundary strings
    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)
    # Weight each kind of edit
    weigh_additions, weigh_substitutions, weigh_transpositions = weight
    weighted_additions = weigh_additions(additions)
    weighted_substitutions = weigh_substitutions(
        substitutions, max(boundary_types), min(boundary_types))
    weighted_transpositions = weigh_transpositions(transpositions, n_t)
    count_edits = (weighted_additions + weighted_substitutions +
                   weighted_transpositions)
    # Tally matching and non-matching boundaries position by position
    matches = []
    full_misses = []
    boundaries_all = 0
    for bounds_a, bounds_b in zip(segs_a, segs_b):
        matches.extend(bounds_a.intersection(bounds_b))
        full_misses.extend(bounds_a.symmetric_difference(bounds_b))
        boundaries_all += len(bounds_a) + len(bounds_b)
    return {
        'count_edits': count_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': full_misses,
        'boundaries_all': boundaries_all,
        'matches': matches,
        'pbs': pbs,
        'boundary_types': boundary_types,
    }
예제 #3
0
def __boundaries__(segmentation, **kwargs):
    '''
    Count the boundaries placed in a single segmentation.

    The segmentation is normalised to a boundary string and the sizes of all
    of its positions are summed.

    :param segmentation: Segmentation, encoded as ``boundary_format``.
    :param kwargs: Must contain ``boundary_format``, one of the
        ``BoundaryFormat`` values ``nltk``, ``sets``, ``mass`` or ``position``.
    :raises SegmentationMetricError: if the boundary format is not supported.
    '''
    fmt = kwargs['boundary_format']
    positions = segmentation
    # NLTK input is reduced to masses and handled as masses from here on
    if fmt == BoundaryFormat.nltk:
        positions = convert_nltk_to_masses(segmentation)
        fmt = BoundaryFormat.mass
    # Boundary strings (sets) are used as given; everything else is routed
    # through masses before being turned into a boundary string
    if fmt != BoundaryFormat.sets:
        if fmt == BoundaryFormat.mass:
            pass
        elif fmt == BoundaryFormat.position:
            positions = convert_positions_to_masses(positions)
        else:
            raise SegmentationMetricError('Unsupported boundary format')
        positions = boundary_string_from_masses(positions)
    return sum(len(boundaries_here) for boundaries_here in positions)
예제 #4
0
def input_linear_positions_tsv(filepath, delimiter=DEFAULT_DELIMITER):
    '''
    Read a delimited file of segment position codings and return them as
    segment masses.

    The file is loaded with :func:`input_linear_mass_tsv`; every coder's
    position sequence in the loaded data is then replaced by the result of
    ``convert_positions_to_masses``.

    :param filepath:  path to the file containing segment position codings.
    :param delimiter: the delimiter used when reading the file (by default
                      ``DEFAULT_DELIMITER``).
    :type filepath: str
    :type delimiter: str
    :returns: the object produced by :func:`input_linear_mass_tsv`, with its
              codings converted in place.

    .. deprecated:: 1.0

    .. warning:: This I/O function is for legacy files only and will be removed
        in later versions.
    '''
    loaded = input_linear_mass_tsv(filepath, delimiter)
    # Only values of existing keys are reassigned, so iterating while
    # updating is safe here
    for item_name, codings in loaded.items():
        for coder_name, position_sequence in codings.items():
            loaded[item_name][coder_name] = \
                convert_positions_to_masses(position_sequence)
    return loaded
예제 #5
0
def input_linear_positions_tsv(filepath, delimiter=DEFAULT_DELIMITER):
    '''
    Read a delimited file of segment position codings and return them as
    segment masses.

    The file is loaded with :func:`input_linear_mass_tsv`; every coder's
    position sequence in the loaded data is then replaced by the result of
    ``convert_positions_to_masses``.

    :param filepath:  path to the file containing segment position codings.
    :param delimiter: the delimiter used when reading the file (by default
                      ``DEFAULT_DELIMITER``).
    :type filepath: str
    :type delimiter: str
    :returns: the object produced by :func:`input_linear_mass_tsv`, with its
              codings converted in place.

    .. deprecated:: 1.0

    .. warning:: This I/O function is for legacy files only and will be removed
        in later versions.
    '''
    loaded = input_linear_mass_tsv(filepath, delimiter)
    # Only values of existing keys are reassigned, so iterating while
    # updating is safe here
    for item_name, codings in loaded.items():
        for coder_name, position_sequence in codings.items():
            loaded[item_name][coder_name] = \
                convert_positions_to_masses(position_sequence)
    return loaded
예제 #6
0
def __compute_window_size__(reference, fnc_round, boundary_format):
    '''
    Compute a window size as half of the mean segment mass of a reference.

    :param reference: Either a segmentation (a list-like object of masses) or
        a dict-like, possibly nested, collection of such segmentations.  When
        ``boundary_format`` is ``BoundaryFormat.position`` it is first passed
        through ``convert_positions_to_masses``.
    :param fnc_round: Callable applied to half of the mean mass; its result
        is then converted with ``int``.
    :param boundary_format: ``BoundaryFormat`` value describing ``reference``.
    :returns: The window size; values of 1 or less are replaced by 2.
    :rtype: int
    :raises SegmentationMetricError: if a value that is neither dict-like nor
        a non-string iterable is encountered.
    '''

    def __flatten_masses__(node):
        '''
        Recursively gather every mass below ``node`` into a flat list.

        :param node: Either a dict-like object whose values are walked, or a
            list-like object of masses.
        :type node: dict or list
        '''
        if hasattr(node, 'items'):
            gathered = list()
            for child in node.values():
                gathered.extend(__flatten_masses__(child))
            return gathered
        # Strings are iterable but are never a valid segmentation
        if hasattr(node, '__iter__') and not isinstance(node, str):
            return list(node)
        raise SegmentationMetricError(
            'Expected either a dict-like '
            'collection of segmentations or a segmentation as a list-like '
            'object')

    if boundary_format == BoundaryFormat.position:
        reference = convert_positions_to_masses(reference)
    # Flatten first, then convert, so that structural errors surface before
    # any numeric conversion is attempted
    flat_masses = __flatten_masses__(reference)
    decimal_masses = [Decimal(value) for value in flat_masses]
    # Half of the average segment mass, rounded by the caller's function
    half_mean = mean(decimal_masses) / Decimal('2')
    size = int(fnc_round(half_mean))
    return max(size, 2)
예제 #7
0
def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format,
                            n_t, weight):
    '''
    Collect the edit counts and boundary tallies for a pair of segmentations.

    Both segmentations are normalised to boundary strings, the boundary edit
    distance between them is computed, and the supplied weighting functions
    are applied to the resulting edits.

    :param segs_a: First segmentation, encoded as ``boundary_format``.
    :param segs_b: Second segmentation, encoded as ``boundary_format``.
    :param boundary_types: Not consulted; the boundary types are always
        re-derived from the two segmentations with ``identify_types``.
    :param boundary_format: One of the ``BoundaryFormat`` values ``nltk``,
        ``sets``, ``mass`` or ``position``.
    :param n_t: Forwarded to ``boundary_edit_distance`` and to the
        transposition weighting function.
    :param weight: Three weighting functions, in the order additions,
        substitutions, transpositions.
    :returns: dict with the keys ``count_edits``, ``additions``,
        ``substitutions``, ``transpositions``, ``full_misses``,
        ``boundaries_all``, ``matches``, ``pbs`` and ``boundary_types``.
    :raises SegmentationMetricError: if the boundary format is unsupported or
        the two boundary strings differ in length.
    '''
    # NLTK segmentations are reduced to masses and then handled as masses
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Anything that is not already a boundary string goes through masses
    if boundary_format != BoundaryFormat.sets:
        if boundary_format == BoundaryFormat.mass:
            pass
        elif boundary_format == BoundaryFormat.position:
            segs_a = convert_positions_to_masses(segs_a)
            segs_b = convert_positions_to_masses(segs_b)
        else:
            raise SegmentationMetricError('Unsupported boundary format')
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    # The two boundary strings must line up position by position
    size_a, size_b = len(segs_a), len(segs_b)
    if size_a != size_b:
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                size_a, size_b))
    # The boundary types are derived from the data, replacing the argument
    boundary_types = identify_types(segs_a, segs_b)
    # Total potential boundaries
    pbs = len(segs_b) * len(boundary_types)
    # Raw edits between the two boundary strings
    edits = boundary_edit_distance(segs_a, segs_b, n_t=n_t)
    additions, substitutions, transpositions = edits
    # Weight each kind of edit
    weigh_additions, weigh_substitutions, weigh_transpositions = weight
    weighted_additions = weigh_additions(additions)
    weighted_substitutions = weigh_substitutions(
        substitutions, max(boundary_types), min(boundary_types))
    weighted_transpositions = weigh_transpositions(transpositions, n_t)
    count_edits = (weighted_additions + weighted_substitutions +
                   weighted_transpositions)
    # Tally matching and non-matching boundaries position by position
    matches, full_misses = [], []
    boundaries_all = 0
    for bounds_a, bounds_b in zip(segs_a, segs_b):
        matches.extend(bounds_a.intersection(bounds_b))
        full_misses.extend(bounds_a.symmetric_difference(bounds_b))
        boundaries_all += len(bounds_a) + len(bounds_b)
    return dict(
        count_edits=count_edits,
        additions=additions,
        substitutions=substitutions,
        transpositions=transpositions,
        full_misses=full_misses,
        boundaries_all=boundaries_all,
        matches=matches,
        pbs=pbs,
        boundary_types=boundary_types)
예제 #8
0
 def test_convert_positions_to_masses_all(self):
     '''
     Test conversion of a position sequence to masses when every unit has
     its own position label.
     '''
     positions = list(range(1, 12))
     expected_masses = (1,) * 11
     self.assertEqual(expected_masses,
                      convert_positions_to_masses(positions))
예제 #9
0
 def test_convert_positions_to_masses_none(self):
     '''
     Test conversion of a position sequence to masses when every unit
     shares the same position label.
     '''
     positions = [1] * 11
     expected_masses = (11,)
     self.assertEqual(expected_masses,
                      convert_positions_to_masses(positions))
예제 #10
0
 def test_convert_positions_to_masses(self):
     '''
     Test conversion of a position sequence with three labels to masses.
     '''
     positions = [1] * 5 + [2] * 3 + [3] * 5
     expected_masses = (5, 3, 5)
     self.assertEqual(expected_masses,
                      convert_positions_to_masses(positions))
예제 #11
0
def __window_diff__(hypothesis, reference, window_size, one_minus,
                    boundary_format, return_parts, fnc_round,
                    lamprier_et_al_2007_fix):
    '''
    Calculates the WindowDiff segmentation evaluation metric score for a
    hypothetical segmentation against a reference segmentation for a given
    window size.  When a window size is not specified, one is computed from
    the reference segmentation.

    :param hypothesis:               Hypothesis segmentation, encoded as
                                     ``boundary_format``.
    :param reference:                Reference segmentation, encoded as
                                     ``boundary_format``.
    :param window_size:              The size of the window that is slid over
                                     the two segmentations used to count
                                     mismatches; if None, it is computed with
                                     ``__compute_window_size__``.
    :param one_minus:                Return 1-WindowDiff to make it no longer
                                     a penalty-metric.
    :param boundary_format:          ``BoundaryFormat.nltk``,
                                     ``BoundaryFormat.mass`` or
                                     ``BoundaryFormat.position``.
    :param return_parts:             When true (and ``one_minus`` is false),
                                     return the numerator and denominator
                                     instead of their quotient.
    :param fnc_round:                Rounding function forwarded to
                                     ``__compute_window_size__``.
    :param lamprier_et_al_2007_fix:  Apply a fix for improperly counted errors
                                     at the beginning and end of
                                     segmentations, provided by
                                     _[LamprierEtAl2007].
    :type hypothesis: list
    :type reference: list
    :type window_size: int
    :type one_minus: bool
    :type return_parts: bool
    :type lamprier_et_al_2007_fix: bool

    :returns: ``1 - WindowDiff`` when ``one_minus`` is true; otherwise the
              tuple ``(sum_differences, denominator)`` when ``return_parts``
              is true, else the WindowDiff value.
    :raises SegmentationMetricError: if the boundary format is unsupported or
              the two position sequences differ in length.

    .. note:: See :func:`segeval.convert_masses_to_positions` for an example of
              the input format.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors
    if len(reference) != len(hypothesis):
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position \
length (%(ref)i is not %(hyp)i).' % {
                'ref': len(reference),
                'hyp': len(hypothesis)
            })
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Create a set of pairs of units from each segmentation to go over using a
    # window
    units_ref_hyp = __create_paired_window__(hypothesis, reference,
                                             window_size,
                                             lamprier_et_al_2007_fix)[0]
    # Slide window over and sum the number of varying windows
    sum_differences = 0
    measurements = len(units_ref_hyp) - window_size
    for i in range(0, measurements):
        window = units_ref_hyp[i:i + window_size + 1]
        ref_boundaries = 0
        hyp_boundaries = 0
        # Check that the number of loops is correct
        assert len(window) == window_size + 1
        # For pair in window
        for j in range(0, len(window) - 1):
            ref_part, hyp_part = zip(*window[j:j + 2])
            # Position labels are compared by value: identity (`is not`) only
            # coincides with equality for interned objects, so equal labels
            # held in distinct objects would be miscounted as a boundary.
            # Boundary exists in the reference segmentation
            if ref_part[0] != ref_part[1]:
                ref_boundaries += 1
            # Boundary exists in the hypothesis segmentation
            if hyp_part[0] != hyp_part[1]:
                hyp_boundaries += 1
        # If the number of boundaries per segmentation in the window differs
        if ref_boundaries != hyp_boundaries:
            sum_differences += 1
    # Perform final division
    n = sum(convert_positions_to_masses(reference))
    denominator = n - window_size
    if lamprier_et_al_2007_fix:
        denominator = measurements + 1
    win_diff = Decimal(sum_differences) / denominator
    # Check normalization
    assert denominator == measurements or lamprier_et_al_2007_fix
    # Check value
    assert win_diff <= 1
    # one_minus takes precedence over return_parts
    if one_minus:
        return Decimal('1.0') - win_diff
    if return_parts:
        return sum_differences, denominator
    return win_diff
예제 #12
0
def __window_diff__(hypothesis, reference, window_size, one_minus,
                    boundary_format, return_parts, fnc_round,
                    lamprier_et_al_2007_fix):
    '''
    Calculates the WindowDiff segmentation evaluation metric score for a
    hypothetical segmentation against a reference segmentation for a given
    window size.  When a window size is not specified, one is computed from
    the reference segmentation.

    :param hypothesis:               Hypothesis segmentation, encoded as
                                     ``boundary_format``.
    :param reference:                Reference segmentation, encoded as
                                     ``boundary_format``.
    :param window_size:              The size of the window that is slid over
                                     the two segmentations used to count
                                     mismatches; if None, it is computed with
                                     ``__compute_window_size__``.
    :param one_minus:                Return 1-WindowDiff to make it no longer
                                     a penalty-metric.
    :param boundary_format:          ``BoundaryFormat.nltk``,
                                     ``BoundaryFormat.mass`` or
                                     ``BoundaryFormat.position``.
    :param return_parts:             When true (and ``one_minus`` is false),
                                     return the numerator and denominator
                                     instead of their quotient.
    :param fnc_round:                Rounding function forwarded to
                                     ``__compute_window_size__``.
    :param lamprier_et_al_2007_fix:  Apply a fix for improperly counted errors
                                     at the beginning and end of
                                     segmentations, provided by
                                     _[LamprierEtAl2007].
    :type hypothesis: list
    :type reference: list
    :type window_size: int
    :type one_minus: bool
    :type return_parts: bool
    :type lamprier_et_al_2007_fix: bool

    :returns: ``1 - WindowDiff`` when ``one_minus`` is true; otherwise the
              tuple ``(sum_differences, denominator)`` when ``return_parts``
              is true, else the WindowDiff value.
    :raises SegmentationMetricError: if the boundary format is unsupported or
              the two position sequences differ in length.

    .. note:: See :func:`segeval.convert_masses_to_positions` for an example of
              the input format.
    '''
    # Convert from NLTK types
    if boundary_format == BoundaryFormat.nltk:
        reference = convert_nltk_to_masses(reference)
        hypothesis = convert_nltk_to_masses(hypothesis)
        boundary_format = BoundaryFormat.mass
    # Convert from masses into positions
    if boundary_format == BoundaryFormat.mass:
        reference = convert_masses_to_positions(reference)
        hypothesis = convert_masses_to_positions(hypothesis)
    elif boundary_format != BoundaryFormat.position:
        raise SegmentationMetricError('Unsupported boundary format')
    # Check for input errors
    if len(reference) != len(hypothesis):
        raise SegmentationMetricError(
            'Reference and hypothesis segmentations differ in position \
length (%(ref)i is not %(hyp)i).' % {'ref': len(reference),
                                     'hyp': len(hypothesis)})
    # Compute window size to use if unspecified
    if window_size is None:
        window_size = __compute_window_size__(reference, fnc_round,
                                              BoundaryFormat.position)
    # Create a set of pairs of units from each segmentation to go over using a
    # window
    units_ref_hyp = __create_paired_window__(hypothesis, reference,
                                             window_size,
                                             lamprier_et_al_2007_fix)[0]
    # Slide window over and sum the number of varying windows
    sum_differences = 0
    measurements = len(units_ref_hyp) - window_size
    for i in range(0, measurements):
        window = units_ref_hyp[i: i + window_size + 1]
        ref_boundaries = 0
        hyp_boundaries = 0
        # Check that the number of loops is correct.  Compared by value: two
        # equal ints are not guaranteed to be the same object, so an identity
        # test (`is`) can fail spuriously for larger window sizes.
        assert len(window) == window_size + 1
        # For pair in window
        for j in range(0, len(window) - 1):
            ref_part, hyp_part = zip(*window[j:j + 2])
            # Position labels are compared by value: identity (`is not`) only
            # coincides with equality for interned objects, so equal labels
            # held in distinct objects would be miscounted as a boundary.
            # Boundary exists in the reference segmentation
            if ref_part[0] != ref_part[1]:
                ref_boundaries += 1
            # Boundary exists in the hypothesis segmentation
            if hyp_part[0] != hyp_part[1]:
                hyp_boundaries += 1
        # If the number of boundaries per segmentation in the window differs
        if ref_boundaries != hyp_boundaries:
            sum_differences += 1
    # Perform final division
    n = sum(convert_positions_to_masses(reference))
    denominator = n - window_size
    if lamprier_et_al_2007_fix:
        denominator = measurements + 1
    win_diff = Decimal(sum_differences) / denominator
    # Check normalization
    assert denominator == measurements or lamprier_et_al_2007_fix
    # Check value
    assert win_diff <= 1
    # one_minus takes precedence over return_parts
    if one_minus:
        return Decimal('1.0') - win_diff
    if return_parts:
        return sum_differences, denominator
    return win_diff