Exemplo n.º 1
0
def iob_transitions_legality(tags: Set[str],
                             start: str = TokenFunction.GO,
                             end: str = TokenFunction.EOS) -> List[Transition]:
    """Build the table of legal and illegal transitions for `IOB` tags.

    In `IOB` a span may open with an `I-` tag, so most of the restrictions
    found in other encodings do not apply. The rules enforced here all
    concern the `B-` prefix, which is only allowed directly after a tag
    (`B-` or `I-`) of the same type:

    * nothing may transition into ``start`` and nothing may leave ``end``;
    * ``start`` and outside tags may not be followed by a `B-` tag;
    * a `B-` or `I-` tag may only be followed by a `B-` tag of its own type.

    Every other pair is marked legal. One `Transition` is produced for each
    ordered pair drawn from ``tags`` plus ``start`` and ``end``.

    Args:
        tags: The tags that we can assign to tokens.
        start: A special tag representing the start of all sequences.
        end: A special tag representing the end of all sequences.

    Returns:
        The list of transitions, each flagged as legal or not.
    """
    table = []
    for source in chain(tags, [start, end]):
        source_role = extract_function(source)
        source_kind = extract_type(source)
        for target in chain(tags, [start, end]):
            target_role = extract_function(target)
            target_kind = extract_type(target)
            target_is_begin = target_role == TokenFunction.BEGIN
            if target == start or source == end:
                # Nothing enters the start tag and nothing leaves the end tag
                allowed = False
            elif source == start:
                # A `B` needs a preceding span, so it can't open the sequence
                allowed = not target_is_begin
            elif source_role in (TokenFunction.BEGIN, TokenFunction.INSIDE):
                # From inside a span a `B` is only legal for the same type
                allowed = not (target_is_begin and source_kind != target_kind)
            elif source_role == TokenFunction.OUTSIDE:
                # A `B` can't follow an outside tag, there is no span to touch
                allowed = not target_is_begin
            else:
                allowed = True
            table.append(Transition(source, target, allowed))
    return table
Exemplo n.º 2
0
def bio_transitions_legality(tags: Set[str],
                             start: str = TokenFunction.GO,
                             end: str = TokenFunction.EOS) -> List[Transition]:
    """Build the table of legal and illegal transitions for `BIO` tags.

    The rules enforced here all concern the `I-` prefix, which may only
    continue a span that is already open and has the same type:

    * nothing may transition into ``start`` and nothing may leave ``end``;
    * ``start`` and outside tags may not be followed by an `I-` tag;
    * a `B-` or `I-` tag may only be followed by an `I-` tag of its own type.

    Every other pair is marked legal. One `Transition` is produced for each
    ordered pair drawn from ``tags`` plus ``start`` and ``end``.

    Args:
        tags: The tags that we can assign to tokens.
        start: A special tag representing the start of all sequences.
        end: A special tag representing the end of all sequences.

    Returns:
        The list of transitions, each flagged as legal or not.
    """

    def _is_legal(source, source_role, source_kind, target, target_role,
                  target_kind):
        # Nothing enters the start tag and nothing leaves the end tag
        if target == start or source == end:
            return False
        # Every remaining restriction is about moving onto an `I`
        if not target_role == BIO.INSIDE:
            return True
        # A sequence can't open with an `I`
        if source == start:
            return False
        # Inside a span an `I` has to keep the type of the span
        if source_role == BIO.BEGIN or source_role == BIO.INSIDE:
            return not source_kind != target_kind
        # An `I` can't open a span after an outside tag
        if source_role == TokenFunction.OUTSIDE:
            return False
        return True

    table = []
    for source in chain(tags, [start, end]):
        source_role = extract_function(source)
        source_kind = extract_type(source)
        for target in chain(tags, [start, end]):
            target_role = extract_function(target)
            target_kind = extract_type(target)
            table.append(
                Transition(
                    source, target,
                    _is_legal(source, source_role, source_kind, target,
                              target_role, target_kind)))
    return table
Exemplo n.º 3
0
def parse_spans_with_end_with_errors(
        seq: List[str],
        span_format: SpanFormat) -> Tuple[List[Span], List[Error]]:
    """Parse a sequence of labels into a list of spans but return any violations of the encoding scheme.

    The sequence is walked once, left to right, while tracking the type and
    token indices of the span currently being built. Every violation is both
    logged as a warning and recorded as an `Error`.

    Note:
        In the case where labels violate the span encoding scheme, for example the
        tag is a new type (like ``I-ORG``) in the middle of a span of another type
        (like ``PER``) without a proper starting token (``B-ORG``) we will finish
        the initial span and start a new one, resulting in two spans. This follows
        the ``conlleval.pl`` script.

    Note:
        Spans are returned sorted by their starting location. Due to the fact that
        spans are not allowed to overlap there is no resolution policy when two
        spans have the same starting location.

    Note:
        Errors are returned sorted by the location where the violation occurred. In the
        case a single transition triggered multiple errors they are sorted lexically based
        on the error type.

    Note:
        This is a generic function that can parse IOBES, BILOU, and BMEWO formats.

    Args:
        seq: The sequence of labels
        span_format: The span format in use; its ``BEGIN``, ``INSIDE``, ``END`` and
            ``SINGLE`` attributes are compared against the function of each label.

    Returns:
        A list of spans and a list of errors.
    """
    errors = []
    spans = []
    # The type of the span we are building
    span = None
    # The tokens of the span we are building
    tokens = []
    for i, s in enumerate(seq):
        func = extract_function(s)
        _type = extract_type(s)
        # A `B` ends any current span and starts a new span
        if func == span_format.BEGIN:
            if span is not None:
                # There was a previously active span. This is an error, the span should have been closed by
                # either an `E` or an `S` before starting a new one.
                if i > 0:
                    prev_func = extract_function(seq[i - 1])
                    if prev_func not in (span_format.END, span_format.SINGLE):
                        LOGGER.warning("Illegal Label: `%s` ends span at %d",
                                       prev_func, i - 1)
                        # The error is attributed to the previous token, the one that failed to close the span
                        errors.append(
                            Error(i - 1, "Illegal End", safe_get(seq, i - 1),
                                  safe_get(seq, i - 2), s))
                spans.append(
                    Span(span,
                         start=tokens[0],
                         end=tokens[-1] + 1,
                         tokens=tuple(tokens)))
            span = _type
            tokens = [i]
            # Checking if this `B` causes errors.
            if i < len(seq) - 1:
                next_func = extract_function(seq[i + 1])
                # Look ahead to see if this `B` token should actually be an `S` because it is only a single token.
                # We only check for `B`, `S` and `O` because an illegal transition to an `I` or `E` will get
                # warned about when we actually process that token
                if next_func in (span_format.BEGIN, span_format.SINGLE,
                                 TokenFunction.OUTSIDE):
                    LOGGER.warning(
                        "Illegal Label: Single `B` token span at %d", i)
                    errors.append(
                        Error(i, "Illegal Single", s, safe_get(seq, i - 1),
                              safe_get(seq, i + 1)))
            # A `B` as the last token is an error because it would result in a single token span of a `B`
            elif i == len(seq) - 1:
                LOGGER.warning("Illegal Label: `B` as final token %d", i)
                errors.append(
                    Error(i, "Illegal Final", s, safe_get(seq, i - 1),
                          safe_get(seq, i + 1)))
        # An `S` ends any active span and creates a new single token span
        elif func == span_format.SINGLE:
            # There was a previously active span. This is an error, the span should have been closed by
            # either an `E` or an `S` before starting a new one.
            if span is not None:
                if i > 0:
                    prev_func = extract_function(seq[i - 1])
                    if prev_func not in (span_format.END, span_format.SINGLE):
                        LOGGER.warning("Illegal Label: `%s` ends span at %d",
                                       prev_func, i - 1)
                        errors.append(
                            Error(i - 1, "Illegal End", safe_get(seq, i - 1),
                                  safe_get(seq, i - 2), s))
                # Flush this current span
                spans.append(
                    Span(span,
                         start=tokens[0],
                         end=tokens[-1] + 1,
                         tokens=tuple(tokens)))
            # Create a new span that covers this `S`
            spans.append(Span(_type, start=i, end=i + 1, tokens=(i, )))
            # Set the active span to None
            span = None
            tokens = []
        # An `I` will continue a span when the types match and start a new one otherwise.
        elif func == span_format.INSIDE:
            if span is not None:
                # Continue the entity
                if _type == span:
                    tokens.append(i)
                # Our types mismatch, save the current span and start a new one
                else:
                    LOGGER.warning(
                        "Illegal Label: `I` doesn't match previous token at %d",
                        i)
                    errors.append(
                        Error(i, "Illegal Transition", s, safe_get(seq, i - 1),
                              safe_get(seq, i + 1)))
                    spans.append(
                        Span(span,
                             start=tokens[0],
                             end=tokens[-1] + 1,
                             tokens=tuple(tokens)))
                    span = _type
                    tokens = [i]
            # There was no previous entity so we start one with this `I`, but this is an error
            else:
                LOGGER.warning("Illegal Label: starting a span with `I` at %d",
                               i)
                errors.append(
                    Error(i, "Illegal Start", s, safe_get(seq, i - 1),
                          safe_get(seq, i + 1)))
                span = _type
                tokens = [i]
            # Check if this `I` is the last token. This will cause an illegal span because we
            # won't close the span, so log this error.
            if i == len(seq) - 1:
                LOGGER.warning("Illegal Label: `I` as final token at %d", i)
                errors.append(
                    Error(i, "Illegal Final", s, safe_get(seq, i - 1),
                          safe_get(seq, i + 1)))
        # An `E` will close the currently active span if the type matches. Otherwise we close the current span,
        # create a new span, and immediately close it because we are an `E`
        elif func == span_format.END:
            if span is not None:
                # Type matches so the span is closed correctly
                if _type == span:
                    tokens.append(i)
                    spans.append(
                        Span(span,
                             start=tokens[0],
                             end=tokens[-1] + 1,
                             tokens=tuple(tokens)))
                    span = None
                    tokens = []
                # Type mismatch
                else:
                    # Log an error that the `E` doesn't match
                    LOGGER.warning(
                        "Illegal Label: `E` doesn't match previous token at %d",
                        i)
                    errors.append(
                        Error(i, "Illegal Transition", s, safe_get(seq, i - 1),
                              safe_get(seq, i + 1)))
                    # Save out the active span; this `E` is not part of it so its exclusive end is `i`
                    spans.append(
                        Span(span,
                             start=tokens[0],
                             end=i,
                             tokens=tuple(tokens)))
                    # Save out the new span this `E` opens and closes
                    spans.append(Span(_type, start=i, end=i + 1, tokens=(i, )))
                    # Set the active span to None
                    span = None
                    tokens = []
            # There was no span so start and end it with this `E`
            else:
                LOGGER.warning("Illegal Label: starting a span with `E` at %d",
                               i)
                errors.append(
                    Error(i, "Illegal Start", s, safe_get(seq, i - 1),
                          safe_get(seq, i + 1)))
                spans.append(Span(_type, start=i, end=i + 1, tokens=(i, )))
                span = None
                tokens = []
        # Anything else is handled as an `O`, which cuts off the active entity
        else:
            # There was a previously active span. This is an error, the span should have been closed by
            # either an `E` or an `S` before having an `O`
            if span is not None:
                if i > 0:
                    prev_func = extract_function(seq[i - 1])
                    if prev_func not in (span_format.END, span_format.SINGLE):
                        LOGGER.warning("Illegal Label: `%s` ends span at %d",
                                       prev_func, i - 1)
                        errors.append(
                            Error(i - 1, "Illegal End", safe_get(seq, i - 1),
                                  safe_get(seq, i - 2), s))
                spans.append(
                    Span(span,
                         start=tokens[0],
                         end=tokens[-1] + 1,
                         tokens=tuple(tokens)))
                span = None
                tokens = []
    if span is not None:
        # There was an active entity that fell off the end of the sequence. This should be an error because
        # it means that the span hasn't ended with an `E` or an `S`, but we catch these errors in the
        # `B` and `I` sections (the "Illegal Final" checks) instead of doing it here.
        spans.append(
            Span(span,
                 start=tokens[0],
                 end=tokens[-1] + 1,
                 tokens=tuple(tokens)))
        span = None
        tokens = []
    return sort_spans(spans), sort_errors(errors)
Exemplo n.º 4
0
def parse_spans_bio_with_errors(
        seq: List[str]) -> Tuple[List[Span], List[Error]]:
    """Extract the spans from `BIO` labels and report every encoding violation found.

    The labels are scanned once from left to right. A `B` always opens a new
    span, an `I` extends the open span when the types agree, and any other
    label closes the open span.

    Note:
        Two situations are logged and recorded as errors: an `I` whose type
        differs from the open span ("Illegal Transition") and an `I` that
        appears while no span is open ("Illegal Start"). In both cases a new
        span is started at that `I`, so a type change in the middle of a span
        yields two spans. This follows the ``conlleval.pl`` script.

    Note:
        Spans are returned sorted by their starting location. Spans are not
        allowed to overlap so no tie-breaking policy is needed.

    Note:
        Errors are returned sorted by the location where the violation
        occurred. When one transition triggers several errors they are
        sorted lexically based on the error type.

    Args:
        seq: The sequence of labels

    Returns:
        A list of spans and a list of errors.
    """
    found = []
    violations = []
    # Type of the span under construction, `None` when no span is open
    open_type = None
    # Indices of the tokens collected for the span under construction
    open_tokens = []
    for idx, label in enumerate(seq):
        role = extract_function(label)
        kind = extract_type(label)
        if role == BIO.BEGIN:
            # A `B` closes whatever is open and begins a fresh span
            opens_span = True
        elif role == BIO.INSIDE:
            if open_type is None:
                # Nothing is open, this `I` begins a span, which is a violation
                LOGGER.warning("Illegal Label: starting a span with `I` at %d",
                               idx)
                violations.append(
                    Error(idx, "Illegal Start", label, safe_get(seq, idx - 1),
                          safe_get(seq, idx + 1)))
                opens_span = True
            elif open_type == kind:
                # Same type, the open span simply grows by one token
                open_tokens.append(idx)
                continue
            else:
                # Different type, report it and cut the span at this point
                LOGGER.warning(
                    "Illegal Label: I doesn't match previous token at %d",
                    idx)
                violations.append(
                    Error(idx, "Illegal Transition", label,
                          safe_get(seq, idx - 1), safe_get(seq, idx + 1)))
                opens_span = True
        else:
            # Any other label (an `O`) only terminates the open span
            opens_span = False
        # Store the span that was open before this label
        if open_type is not None:
            found.append(
                Span(open_type,
                     start=open_tokens[0],
                     end=open_tokens[-1] + 1,
                     tokens=tuple(open_tokens)))
        if opens_span:
            open_type = kind
            open_tokens = [idx]
        else:
            open_type = None
            open_tokens = []
    # A span still open when the labels run out is stored as well
    if open_type is not None:
        found.append(
            Span(open_type,
                 start=open_tokens[0],
                 end=open_tokens[-1] + 1,
                 tokens=tuple(open_tokens)))
    return sort_spans(found), sort_errors(violations)
Exemplo n.º 5
0
def parse_spans_iob_with_errors(
        seq: List[str]) -> Tuple[List[Span], List[Error]]:
    """Parse a sequence of IOB encoded labels into a list of spans but return any violations of the encoding scheme.

    The sequence is walked once, left to right, while tracking the type and
    token indices of the span currently being built. In `IOB` a span may open
    with an `I`, and an `I` of a different type silently closes the open span
    and starts a new one. Only the `B` prefix is restricted, so only `B`
    labels can produce errors:

    * a `B` as the first label, or directly after an outside label, is an
      "Illegal Start" because there is no preceding span for it to touch;
    * a `B` directly after a label of a different type is an
      "Illegal Transition" because an `I` should have been used instead.

    In both cases the `B` still starts a new span. Every violation is both
    logged as a warning and recorded as an `Error`.

    Note:
        Spans are returned sorted by their starting location. Due to the fact that
        spans are not allowed to overlap there is no resolution policy when two
        spans have the same starting location.

    Note:
        Errors are returned sorted by the location where the violation occurred. In the
        case a single transition triggered multiple errors they are sorted lexically based
        on the error type.

    Args:
        seq: The sequence of labels

    Returns:
        A list of spans and a list of errors.
    """
    errors = []
    spans = []
    # This tracks the type of the span we are currently building
    span = None
    # This tracks the tokens that make up the span we are building
    tokens = []
    for i, s in enumerate(seq):
        func = extract_function(s)
        _type = extract_type(s)
        # A `B` ends a current span but starts a new one
        if func == TokenFunction.BEGIN:
            # Whether the previous label is an outside is a property of its function, not of its type.
            # Checking the function keeps a span type that happens to be spelled like the outside marker
            # from being mistaken for an outside label.
            prev_func = extract_function(seq[i - 1]) if i > 0 else None
            prev_type = extract_type(seq[i - 1]) if i > 0 else None
            # In `iob` a `B` is only allowed to mark the boundary between two spans of the same type that
            # touch. A `B` isn't allowed to arbitrarily start an entity, which would happen when the `B` is
            # the first token or the previous token was an outside
            if i == 0 or prev_func == TokenFunction.OUTSIDE:
                LOGGER.warning("Invalid label: `B` starting an entity at %d",
                               i)
                errors.append(
                    Error(i, "Illegal Start", s, safe_get(seq, i - 1),
                          safe_get(seq, i + 1)))
            # If the previous type isn't the same as our type we should have just used an `I` to transition
            elif prev_type != _type:
                LOGGER.warning(
                    "Invalid label: `B` starting and entity after a %s at %d",
                    prev_type, i)
                errors.append(
                    Error(i, "Illegal Transition", s, safe_get(seq, i - 1),
                          safe_get(seq, i + 1)))
            # If there is a span getting built save it out.
            if span is not None:
                spans.append(
                    Span(span,
                         start=tokens[0],
                         end=tokens[-1] + 1,
                         tokens=tuple(tokens)))
            # Create a new span starting with this B
            span = _type
            tokens = [i]
        # An `I` will continue a span when the types match and force a new one otherwise
        elif func == TokenFunction.INSIDE:
            # There is already a span being built
            if span is not None:
                # If the types match we are a continuation of that span
                if span == _type:
                    tokens.append(i)
                # If we don't match types then we are starting a new span. Save the old one and start a new
                # one. This is legal in `iob` so no error is recorded.
                else:
                    spans.append(
                        Span(span,
                             start=tokens[0],
                             end=tokens[-1] + 1,
                             tokens=tuple(tokens)))
                    span = _type
                    tokens = [i]
            # This `I` starts a new entity, which `iob` allows
            else:
                span = _type
                tokens = [i]
        # Anything else is handled as an `O`, which will end an entity being built
        else:
            # If a span was being made cut it here and save the span out.
            if span is not None:
                spans.append(
                    Span(span,
                         start=tokens[0],
                         end=tokens[-1] + 1,
                         tokens=tuple(tokens)))
            span = None
            tokens = []
    # If we fell off the end save the span that was being made
    if span is not None:
        spans.append(
            Span(span,
                 start=tokens[0],
                 end=tokens[-1] + 1,
                 tokens=tuple(tokens)))
    return sort_spans(spans), sort_errors(errors)
Exemplo n.º 6
0
def with_end_transitions_legality(
        tags: Set[str],
        span_format: SpanFormat,
        start: str = TokenFunction.GO,
        end: str = TokenFunction.EOS) -> List[Transition]:
    """Get transition legality when processing tags when the encoding scheme has an `end` token function.

    Span encoding schemes that have special token prefixes for tokens that are the start, middle, and end
    of a span (and a specific prefix for a token that represents a single token span) have quite a few more
    rules. These can mostly be summed up as: spans need to start with the starting prefix and end with the
    ending prefix. The rules enforced here are:

    * nothing may transition into ``start`` and nothing may leave ``end``;
    * ``start`` may not be followed by an inside or end token;
    * a begin or inside token leaves a span open, so it may not be followed by a begin, single, or
      outside token, nor by ``end``, and an inside or end token that follows it must have the same type;
    * an end, single, or outside token may not be followed by an inside or end token.

    Every other pair is marked legal. One `Transition` is produced for each ordered pair drawn from
    ``tags`` plus ``start`` and ``end``.

    Note:
        Several span formats like `IOBES`, `BILOU`, and `BMEWO` are the same except for the value
        of some of the `TokenFunction` (`IOBES` has `E` for the end while `BILOU` has `L`). Other
        than these differences these all behave the same way. This function handles all of these
        formats by comparing to things like `span_format.BEGIN` instead of a literal
        string. This is the underlying implementation but the user facing function to get the
        transitions for a specific encoding scheme should be used.

    Args:
        tags: The tags that we can assign to tokens.
        span_format: The `SpanFormat` we are using for these tags.
        start: A special tag representing the start of all sequences.
        end: A special tag representing the end of all sequences.

    Returns:
        The list of transitions.
    """
    transitions = []
    for src in chain(tags, [start, end]):
        src_func = extract_function(src)
        src_type = extract_type(src)
        for tgt in chain(tags, [start, end]):
            tgt_func = extract_function(tgt)
            tgt_type = extract_type(tgt)
            # Can't transition to start
            if tgt == start:
                transitions.append(Transition(src, tgt, False))
                continue
            # Can't transition from end
            if src == end:
                transitions.append(Transition(src, tgt, False))
                continue
            elif src == start:
                # Can't start a span with I or E
                if tgt_func in (span_format.INSIDE, span_format.END):
                    transitions.append(Transition(src, tgt, False))
                    continue
            elif src_func in (span_format.BEGIN, span_format.INSIDE):
                # Can't go from B or I to B, S, O, or the end of the sequence because we didn't close the
                # entity. The end tag is matched on the tag itself (not on its extracted function) so that
                # B and I are treated the same way no matter how the end tag is spelled.
                if tgt_func in (span_format.BEGIN, span_format.SINGLE,
                                TokenFunction.OUTSIDE) or tgt == end:
                    transitions.append(Transition(src, tgt, False))
                    continue
                # Can only go from B or I to an I or E of the same type
                elif tgt_func in (span_format.INSIDE, span_format.END):
                    if src_type != tgt_type:
                        transitions.append(Transition(src, tgt, False))
                        continue
            elif src_func in (span_format.END, span_format.SINGLE,
                              TokenFunction.OUTSIDE):
                # Going from outside an entity (or ending it) to one that was inside the entity (I/E) is illegal
                if tgt_func in (span_format.INSIDE, span_format.END):
                    transitions.append(Transition(src, tgt, False))
                    continue
            # Other transitions are allowed
            transitions.append(Transition(src, tgt, True))
    return transitions