Example #1
def _spacy_token_predictions(raw_text, tokens, probas, positions):
    """
    Go from GPT subtoken level predictions, to spacy token predictions
    """
    to_combine = []

    spacy_token_starts, spacy_token_ends = zip(*[(token.idx,
                                                  token.idx + len(token.text))
                                                 for token in NLP(raw_text)])
    spacy_token_idx = 0
    spacy_results = []

    for token, prob, (start, end) in zip(tokens, probas, positions):
        to_combine.append({
            "start": start,
            "end": end,
            "token": token,
            "probabilities": prob
        })

        try:
            # a subtoken end that lines up with a spaCy token end marks the point
            # where the accumulated subtokens form complete spaCy token(s)
            end_match = spacy_token_ends.index(end, spacy_token_idx)
            start = spacy_token_starts[end_match]
            spacy_token_idx = end_match
        except ValueError:
            continue

        spacy_results.append(
            _combine_and_format(to_combine,
                                start=start,
                                end=end,
                                raw_text=raw_text))
        to_combine = []

    return spacy_results
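A minimal usage sketch, assuming `NLP` is the module-level spaCy pipeline and `_combine_and_format` is the module's aggregation helper; the subtokens, probability dicts, and character spans below are invented for illustration:

raw_text = "Alice went home"
gpt_tokens = ["alice</w>", "went</w>", "home</w>"]          # hypothetical GPT subtokens
gpt_probas = [{"PERSON": 0.9, "PAD": 0.1},
              {"PERSON": 0.2, "PAD": 0.8},
              {"PERSON": 0.1, "PAD": 0.9}]
gpt_positions = [(0, 5), (6, 10), (11, 15)]                 # character span per subtoken

# Each result entry corresponds to one spaCy token and aggregates the
# probabilities of the subtokens whose character end aligns with it.
spacy_level = _spacy_token_predictions(
    raw_text, gpt_tokens, gpt_probas, gpt_positions)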
Example #2
    def _encode(self, texts, labels=None):
        """
        Convert a batch of raw text to a batch of byte-pair encoded token indices.
        """
        self._lazy_init()
        batch_tokens = []
        batch_token_idxs = []
        batch_label_idxs = []
        batch_character_locs = []
        label = None

        for i, text in enumerate(texts):
            if labels is not None:
                label = labels[i]
            raw_text = text.lower()
            tokens = NLP(_text_standardize(text))
            subtokens = []
            subtoken_idxs = []
            tok_pos = []
            token_start = 0

            for j, token in enumerate(tokens):
                bpe_toks = self.bpe(token.text).split(' ')

                try:
                    if token.text.strip():
                        token_start = raw_text.index(token.text.strip(),
                                                     token_start)
                except ValueError:
                    # text_standardization oddity
                    continue

                subtokens.extend(bpe_toks)
                subtoken_idxs.extend([
                    self.encoder.get(SUBS.get(t, t), self.UNK_IDX)
                    for t in bpe_toks
                ])

                assert len("".join(bpe_toks).replace("</w>", "")) == len(
                    token.text.replace(' ', ''))
                subtoken_positions = np.cumsum(
                    [len(tok.replace("</w>", ''))
                     for tok in bpe_toks]) + token_start

                token_start += len(token.text.strip())

                tok_pos.extend(subtoken_positions)

            batch_tokens.append(subtokens)
            batch_token_idxs.append(subtoken_idxs)
            batch_character_locs.append(tok_pos)
            if labels is not None:
                batch_label_idxs.append([label] * len(subtoken_idxs))

        return EncodedOutput(
            token_ids=batch_token_idxs,
            tokens=batch_tokens,
            labels=batch_label_idxs,
            char_locs=batch_character_locs,
        )
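The character-offset bookkeeping above relies on `np.cumsum` over the BPE pieces with the `</w>` end-of-word marker stripped. A self-contained illustration of that arithmetic (the token and its BPE split are made up for the example):

import numpy as np

token_text = "jumped"
bpe_toks = ["jump", "ed</w>"]   # hypothetical BPE split of the token
token_start = 10                # character offset of the token in the raw text

# Cumulative character lengths of the stripped pieces give each subtoken's end offset.
subtoken_ends = np.cumsum(
    [len(tok.replace("</w>", "")) for tok in bpe_toks]) + token_start
print(subtoken_ends.tolist())   # [14, 16] -> "jump" ends at 14, "ed" at 16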
Example #3
def _convert_to_token_list(annotations, doc_idx=None):
    tokens = []

    for annotation in annotations:
        start_idx = annotation.get('start')
        tokens.extend([{
            'start': start_idx + token.idx,
            'end': start_idx + token.idx + len(token.text),
            'text': token.text,
            'label': annotation.get('label'),
            'doc_idx': doc_idx
        } for token in NLP(annotation.get('text'))])

    return tokens
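A usage sketch, assuming `NLP` is the module-level spaCy pipeline; the annotation below is invented and the resulting offsets depend on spaCy's tokenisation:

annotation = {
    "start": 16,                # character offset of the span in the full document
    "end": 26,
    "label": "person",
    "text": "Jane Smith",       # the labeled span as it appears in the document
}

# Splits the annotated span into spaCy tokens, re-anchoring each token's
# character offsets against the full document via the annotation's `start`.
tokens = _convert_to_token_list([annotation], doc_idx=0)
# e.g. [{'start': 16, 'end': 20, 'text': 'Jane', 'label': 'person', 'doc_idx': 0},
#       {'start': 21, 'end': 26, 'text': 'Smith', 'label': 'person', 'doc_idx': 0}]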
Example #4
def finetune_to_indico_sequence(
    raw_texts,
    subseqs,
    labels,
    probs=None,
    none_value=None,
    subtoken_predictions=False,
    associations=None,
):
    """
    Maps from the labeled substring format into the 'indico' format. This is the exact inverse operation to
    :meth:`indico_to_finetune_sequence`.

    The indico format is as follows:
        Raw text for X,
        Labels as a list of dicts, with each dict in the form:
        {
            'start': <Character index of the start_token of the labeled sequence>,
            'end': <Character index of the end of the labeled sequence>,
            'label': <A categorical label (int or string) that represents the category of the subsequence>,
            'text': <Optionally, a field with the subsequence contained between the start and end>.
        }

    The Labeled substring, or finetune internal, format is as follows.
    Each item of the data is a list of strings of the form:
        ["The quick brown", "fox", "jumped over the lazy", ...]
    With the corresponding labels:
        ["PAD", "animal", "PAD", ...]

    The :param none_value: is used to populate the PAD labels.
    :param raw_texts: The raw text of each document.
    :param subseqs: A list of segmented text of the form list(list(str)).
    :param labels: Categorical labels for each sub-string in subseqs.
    :param none_value: The none value used to encode the input format.
    :return: Texts and annotations, both in the 'indico' format.
    """
    annotations = []
    loop_vals = zip(raw_texts, subseqs, labels, probs
                    or [None] * len(raw_texts))
    for doc_idx, (raw_text, doc_seq, label_seq,
                  prob_seq) in enumerate(loop_vals):
        spacy_tokens = NLP(raw_text)
        spacy_token_starts = [token.idx for token in spacy_tokens]
        spacy_token_ends = [
            token.idx + len(token.text) for token in spacy_tokens
        ]
        doc_annotations = []
        annotation_ranges = set()
        raw_annotation_start = 0
        subtoken_to_label_idx = []
        for i, (sub_str, raw_label, confidences) in enumerate(
                zip(doc_seq, label_seq, prob_seq or [None] * len(doc_seq))):
            subtoken_to_label_idx.append(len(doc_annotations))
            if not isinstance(raw_label, tuple):
                label_list = [raw_label]
            else:
                label_list = raw_label

            for label_idx, label in enumerate(label_list):
                stripped_text = sub_str.strip()

                if subtoken_predictions:
                    raw_annotation_start = raw_text.find(
                        sub_str, raw_annotation_start)
                    raw_annotation_end = raw_annotation_start + len(sub_str)
                else:
                    raw_annotation_start = raw_text.find(
                        stripped_text, raw_annotation_start)
                    raw_annotation_end = raw_annotation_start + len(
                        stripped_text)

                if raw_annotation_start == -1:
                    warnings.warn(
                        "Failed to find predicted sequence: {} in text: {}.".
                        format(sub_str, raw_text))
                    continue

                extended_existing_label = False
                for item in doc_annotations:
                    # handle case where we extend existing annotation
                    if (
                            # same label
                            item["label"] == label
                            # and only separated by whitespace
                            and item["end"] <= raw_annotation_end and
                            not raw_text[item["end"]:raw_annotation_start].
                            strip()):
                        item["end"] = raw_annotation_end
                        item["text"] = raw_text[
                            item["start"]:raw_annotation_end]
                        if "confidence" in item and confidences is not None:
                            item["confidence"].append(confidences)
                        extended_existing_label = True
                        break

                if extended_existing_label or label == none_value:
                    continue

                annotation_start, annotation_end = (
                    int(raw_annotation_start),
                    int(raw_annotation_end),
                )

                annotation = {
                    "start": int(annotation_start),
                    "end": int(annotation_end),
                    "label": label,
                    "text": raw_text[annotation_start:annotation_end],
                }

                # if we don't want to allow subtoken predictions, adjust start and end to match
                # the start and ends of the nearest full tokens
                if not subtoken_predictions:
                    round_to_nearest_start_and_end(annotation,
                                                   spacy_token_starts,
                                                   spacy_token_ends, raw_text)

                if confidences is not None:
                    annotation["confidence"] = [confidences]

                if annotation["start"] >= annotation["end"]:
                    continue

                # prevent duplicate annotation edge case
                annotation_tuple = (annotation["start"], annotation["end"],
                                    label)
                if annotation_tuple not in annotation_ranges:
                    annotation_ranges.add(annotation_tuple)
                    doc_annotations.append(annotation)

        if associations:
            associations_seq = assign_associations(associations[doc_idx],
                                                   none_value,
                                                   subtoken_to_label_idx)
            for label_i, annotation in enumerate(doc_annotations):
                if label_i in associations_seq:
                    index, relationship, prob = associations_seq[label_i]
                    annotation["associations"] = {
                        "index": index,
                        "relationship": relationship,
                        "prob": prob,
                    }

        doc_annotations = sorted([dict(items) for items in doc_annotations],
                                 key=lambda x: span(x))

        for annotation in doc_annotations:
            _merge_confidences(annotation)

        annotations.append(doc_annotations)
    return raw_texts, annotations
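A sketch of the mapping with invented inputs, assuming `none_value="<PAD>"` and that the module-level `NLP` pipeline and helpers (`round_to_nearest_start_and_end`, `span`, `_merge_confidences`) are available; the exact offsets depend on spaCy's tokenisation:

raw_texts = ["The quick brown fox jumped over the lazy dog"]
subseqs = [["The quick brown ", "fox", " jumped over the lazy dog"]]
labels = [["<PAD>", "animal", "<PAD>"]]

# Substrings labeled with the none value are dropped; everything else becomes
# a character-offset annotation over the raw text.
texts, annotations = finetune_to_indico_sequence(
    raw_texts, subseqs, labels, none_value="<PAD>")
# annotations might look like:
# [[{'start': 16, 'end': 19, 'label': 'animal', 'text': 'fox'}]]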
Example #5
def overlap_handler(current_annotation, annotation, text, multi_label):
    """
    Scenarios:
        <> --> current_annotation
        [] --> annotation
        
    1) < [ > ]
    2) [ < > ]
    3) < [ ] >
    """
    if current_annotation["start"] <= annotation["start"]:
        first, second = current_annotation, annotation
    else:
        first, second = annotation, current_annotation

    final_delimiter = min(first["end"], second["end"])
    final_label = second["label"] if (
        second["end"] > first["end"]) else first["label"]
    overlapping_text = text[second["start"]:final_delimiter]
    end = max(first["end"], second["end"])

    first_chunk = {
        "start": first["start"],
        "end": second["start"],
        "label": first["label"],
        "text": text[first["start"]:second["start"]],
    }

    if multi_label:
        second_label = first["label"] | second["label"]
    else:
        if first["label"] != second["label"] and (len(overlapping_text.strip())
                                                  > 1):
            warnings.warn(
                "Found overlapping annotations: {} and {}. \n"
                "Consider setting `multi_label_sequences` to `True` in your config."
                .format(annotation, current_annotation))
        spacy_tokens = NLP(text)
        spacy_token_starts = [token.idx for token in spacy_tokens]
        if second["label"] in spacy_token_starts:
            second_label = second["label"]
        elif final_delimiter in spacy_token_starts:
            second_label = first["label"]
        else:
            second_label = first["label"]

    second_chunk = {
        "start": second["start"],
        "end": final_delimiter,
        "label": second_label,
        "text": overlapping_text,
    }

    third_chunk = {
        "start": final_delimiter,
        "end": end,
        "label": final_label,
        "text": text[final_delimiter:end],
    }
    chunks = [first_chunk, second_chunk, third_chunk]
    chunks = [c for c in chunks if c["start"] != c["end"]]
    return chunks
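An illustration of scenario 1 above with invented annotations, assuming the module-level `NLP` pipeline is available (with `multi_label=True` the labels would need to be set-like so that `|` is defined):

text = "Alice visited New York City"
current_annotation = {"start": 6, "end": 22, "label": "verb_phrase",
                      "text": text[6:22]}          # "visited New York"
annotation = {"start": 14, "end": 27, "label": "location",
              "text": text[14:27]}                 # "New York City"

# With multi_label=False the overlap is warned about and the two spans are
# split into non-overlapping chunks at the overlap boundaries.
chunks = overlap_handler(current_annotation, annotation, text, multi_label=False)
# -> roughly three chunks: [6, 14) "verb_phrase", [14, 22) the overlap, [22, 27) "location"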
Example #6
def overlap_handler(
    current_annotation,
    annotation,
    text,
    multi_label,
):
    """
    Scenarios:
        <> --> current_annotation
        [] --> annotation
        
    1) < [ > ]
    2) [ < > ]
    3) < [ ] >
    """
    if current_annotation['start'] <= annotation['start']:
        first, second = current_annotation, annotation
    else:
        first, second = annotation, current_annotation

    final_delimiter = min(first['end'], second['end'])
    final_label = second['label'] if (
        second['end'] > first['end']) else first['label']
    overlapping_text = text[second['start']:final_delimiter]
    end = max(first['end'], second['end'])

    first_chunk = {
        'start': first['start'],
        'end': second['start'],
        'label': first['label'],
        'text': text[first['start']:second['start']]
    }

    if multi_label:
        second_label = first['label'] | second['label']
    else:
        if first['label'] != second['label'] and (len(overlapping_text.strip())
                                                  > 1):
            warnings.warn(
                "Found overlapping annotations: {} and {}. \n"
                "Consider setting `multi_label_sequences` to `True` in your config."
                .format(annotation, current_annotation))
        spacy_tokens = NLP(text)
        spacy_token_starts = [token.idx for token in spacy_tokens]
        if second['start'] in spacy_token_starts:
            second_label = second['label']
        elif final_delimiter in spacy_token_starts:
            second_label = first['label']
        else:
            second_label = first['label']

    second_chunk = {
        'start': second['start'],
        'end': final_delimiter,
        'label': second_label,
        'text': overlapping_text
    }

    third_chunk = {
        'start': final_delimiter,
        'end': end,
        'label': final_label,
        'text': text[final_delimiter:end]
    }
    chunks = [first_chunk, second_chunk, third_chunk]
    chunks = [c for c in chunks if c['start'] != c['end']]
    return chunks
Example #7
File: encoder.py    Project: tc-wolf/finetune
    def _encode(self, texts, labels=None):
        """
        Convert a batch of raw text to a batch of byte-pair encoded token indices.
        """

        self._lazy_init()
        batch_tokens = []
        batch_token_idxs = []
        batch_label_idxs = []
        # char ends are tracked separately because some BPEs differ in length from
        # their original tokens (e.g. special characters such as bullets)
        batch_char_ends = []
        batch_char_starts = []
        label = None
        skipped = 0
        for i, text in enumerate(texts):
            if labels is not None:
                label = labels[i]

            raw_text = text.lower()
            
            # Only fine to apply this fix because it preserves character locations
            ftfy_text = uncurl_quotes(raw_text)
            tokens = NLP(_text_standardize(text))
            if not tokens:
                skipped += 1
                continue
            i -= skipped
            subtokens = []
            subtoken_idxs = []
            char_starts = []
            char_ends = []
            token_start = 0

            for j, token in enumerate(tokens):
                bpe_toks = self.bpe(token.text).split(" ")

                try:
                    if token.text.strip():
                        token_start = ftfy_text.index((token.text.strip()), token_start)
                except ValueError:
                    warnings.warn(
                        "Failed to find token `{}` in text.".format(token.text)
                    )
                    continue

                subtokens.extend(bpe_toks)
                subtoken_idxs.extend(
                    [self.encoder.get(SUBS.get(t, t), self.UNK_IDX) for t in bpe_toks]
                )

                assert len("".join(bpe_toks).replace("</w>", "")) == len(
                    token.text.replace(" ", "")
                )

                if np.sum([len(tok.replace("</w>", "")) for tok in bpe_toks]) > len(
                    token
                ):  # the BPEs comprising a token are longer than the token itself
                    token_char_ends = (
                        np.asarray([len(token.text.strip()) for tok in bpe_toks])
                        + token_start
                    )
                else:
                    token_char_ends = (
                        np.cumsum([len(tok.replace("</w>", "")) for tok in bpe_toks])
                        + token_start
                    )
                
                token_char_starts = [token_start] + token_char_ends[:-1].tolist()
                token_start += len(token.text.strip())
                char_ends.extend(token_char_ends)
                char_starts.extend(token_char_starts)

            batch_tokens.append(subtokens)
            batch_token_idxs.append(subtoken_idxs)
            batch_char_ends.append(char_ends)
            batch_char_starts.append(char_starts)
            if labels is not None:
                batch_label_idxs.append([label] * len(subtoken_idxs))

        return EncodedOutput(
            token_ids=batch_token_idxs,
            tokens=batch_tokens,
            labels=batch_label_idxs,
            char_locs=batch_char_ends,
            char_starts=batch_char_starts,
        )
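A sketch of how the per-subtoken character spans might be consumed, assuming `encoder` is an instance of this encoder class and that `EncodedOutput` exposes its fields as attributes (both assumptions, based on the constructor call above):

texts = ["Sentiment analysis is fun."]
encoded = encoder._encode(texts)

# `char_starts` and `char_locs` (the char ends) run parallel to `tokens`, so each
# BPE subtoken can be mapped back to the span of lowercased text it covers.
for tok, start, end in zip(encoded.tokens[0],
                           encoded.char_starts[0],
                           encoded.char_locs[0]):
    print(tok, repr(texts[0].lower()[start:end]))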