def _spacy_token_predictions(raw_text, tokens, probas, positions):
    """ Go from GPT subtoken-level predictions to spaCy token-level predictions """
    to_combine = []
    spacy_token_starts, spacy_token_ends = zip(
        *[(token.idx, token.idx + len(token.text)) for token in NLP(raw_text)]
    )
    spacy_token_idx = 0
    spacy_results = []

    for token, prob, (start, end) in zip(tokens, probas, positions):
        to_combine.append({
            "start": start,
            "end": end,
            "token": token,
            "probabilities": prob
        })
        try:
            end_match = spacy_token_ends.index(end, spacy_token_idx)
            start = spacy_token_starts[end_match]
            spacy_token_idx = end_match
        except ValueError:
            continue
        spacy_results.append(
            _combine_and_format(to_combine, start=start, end=end, raw_text=raw_text)
        )
        to_combine = []

    return spacy_results
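
# A minimal, standalone sketch (not the library's implementation) of the alignment idea
# used by _spacy_token_predictions: subtoken predictions are buffered until a subtoken's
# end offset lands on a token boundary, at which point the buffer is emitted as one
# token-level result. The helper name, spans, and boundaries below are hypothetical.
def _group_subtokens_by_token_ends(subtoken_spans, token_ends):
    """Group (start, end, value) subtoken spans whenever `end` hits a token boundary."""
    grouped, buffer = [], []
    for _start, end, value in subtoken_spans:
        buffer.append(value)
        if end in token_ends:
            grouped.append(buffer)
            buffer = []
    return grouped


if __name__ == "__main__":
    # "questioning it": BPE splits "questioning" into "question" + "ing", which should
    # collapse back into a single token-level prediction.
    spans = [(0, 8, "question"), (8, 11, "ing"), (12, 14, "it")]
    print(_group_subtokens_by_token_ends(spans, token_ends={11, 14}))
    # [['question', 'ing'], ['it']]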
def _encode(self, texts, labels=None):
    """
    Convert a batch of raw text to a batch of byte-pair encoded token indices.
    """
    self._lazy_init()
    batch_tokens = []
    batch_token_idxs = []
    batch_label_idxs = []
    batch_character_locs = []
    label = None

    for i, text in enumerate(texts):
        if labels is not None:
            label = labels[i]

        raw_text = text.lower()
        tokens = NLP(_text_standardize(text))
        subtokens = []
        subtoken_idxs = []
        tok_pos = []
        token_start = 0

        for j, token in enumerate(tokens):
            bpe_toks = self.bpe(token.text).split(' ')
            try:
                if token.text.strip():
                    token_start = raw_text.index(token.text.strip(), token_start)
            except ValueError:
                # text_standardization oddity
                continue

            subtokens.extend(bpe_toks)
            subtoken_idxs.extend([
                self.encoder.get(SUBS.get(t, t), self.UNK_IDX)
                for t in bpe_toks
            ])
            assert len("".join(bpe_toks).replace("</w>", "")) == len(
                token.text.replace(' ', ''))
            subtoken_positions = np.cumsum(
                [len(tok.replace("</w>", '')) for tok in bpe_toks]) + token_start
            token_start += len(token.text.strip())
            tok_pos.extend(subtoken_positions)

        batch_tokens.append(subtokens)
        batch_token_idxs.append(subtoken_idxs)
        batch_character_locs.append(tok_pos)
        if labels is not None:
            batch_label_idxs.append([label] * len(subtoken_idxs))

    return EncodedOutput(
        token_ids=batch_token_idxs,
        tokens=batch_tokens,
        labels=batch_label_idxs,
        char_locs=batch_character_locs,
    )
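
# Standalone sketch (hypothetical values, not the library's BPE) of how the character
# locations above are derived: drop the "</w>" end-of-word marker, take a cumulative sum
# of the piece lengths, and offset the result by the token's start index in the raw text.
import numpy as np


def _subtoken_char_ends(bpe_pieces, token_start):
    """Character end offset of each BPE piece, relative to the full document."""
    lengths = [len(piece.replace("</w>", "")) for piece in bpe_pieces]
    return (np.cumsum(lengths) + token_start).tolist()


if __name__ == "__main__":
    # Suppose "questioning" starts at character 16 and is split into two BPE pieces.
    print(_subtoken_char_ends(["question", "ing</w>"], token_start=16))
    # [24, 27]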
def _convert_to_token_list(annotations, doc_idx=None):
    tokens = []
    for annotation in annotations:
        start_idx = annotation.get('start')
        tokens.extend([
            {
                'start': start_idx + token.idx,
                'end': start_idx + token.idx + len(token.text),
                'text': token.text,
                'label': annotation.get('label'),
                'doc_idx': doc_idx
            }
            for token in NLP(annotation.get('text'))
        ])
    return tokens
def finetune_to_indico_sequence(
    raw_texts,
    subseqs,
    labels,
    probs=None,
    none_value=None,
    subtoken_predictions=False,
    associations=None,
):
    """
    Maps from the labeled substring format into the 'indico' format. This is the exact inverse
    operation to :meth:`indico_to_finetune_sequence`.

    The indico format is as follows:
        Raw text for X,
        Labels as a list of dicts, with each dict in the form:
        {
            'start': <Character index of the start of the labeled sequence>,
            'end': <Character index of the end of the labeled sequence>,
            'label': <A categorical label (int or string) that represents the category of the subsequence>,
            'text': <Optionally, a field with the subsequence contained between the start and end>
        }

    The labeled substring, or finetune internal, format is as follows.
    Each item of the data is a list of strings of the form:
        ["The quick brown", "fox", "jumped over the lazy", ...]
    With the corresponding labels:
        ["PAD", "animal", "PAD", ...]

    It is the :param none_value: that is used to populate the PAD labels.

    :param data: A list of segmented text of the form list(list(str))
    :param labels: Categorical labels for each sub-string in data.
    :param none_value: The none value used to encode the input format.
    :return: Texts, annotations both in the 'indico' format.
    """
    annotations = []
    loop_vals = zip(raw_texts, subseqs, labels, probs or [None] * len(raw_texts))
    for doc_idx, (raw_text, doc_seq, label_seq, prob_seq) in enumerate(loop_vals):
        spacy_tokens = NLP(raw_text)
        spacy_token_starts = [token.idx for token in spacy_tokens]
        spacy_token_ends = [
            token.idx + len(token.text) for token in spacy_tokens
        ]
        doc_annotations = []
        annotation_ranges = set()
        raw_annotation_start = 0
        subtoken_to_label_idx = []

        for i, (sub_str, raw_label, confidences) in enumerate(
                zip(doc_seq, label_seq, prob_seq or [None] * len(doc_seq))):
            subtoken_to_label_idx.append(len(doc_annotations))
            if not isinstance(raw_label, tuple):
                label_list = [raw_label]
            else:
                label_list = raw_label

            for label_idx, label in enumerate(label_list):
                stripped_text = sub_str.strip()

                if subtoken_predictions:
                    raw_annotation_start = raw_text.find(
                        sub_str, raw_annotation_start)
                    raw_annotation_end = raw_annotation_start + len(sub_str)
                else:
                    raw_annotation_start = raw_text.find(
                        stripped_text, raw_annotation_start)
                    raw_annotation_end = raw_annotation_start + len(
                        stripped_text)

                if raw_annotation_start == -1:
                    warnings.warn(
                        "Failed to find predicted sequence: {} in text: {}.".
                        format(sub_str, raw_text))
                    continue

                extended_existing_label = False
                for item in doc_annotations:
                    # handle case where we extend existing annotation
                    if (
                            # same label
                            item["label"] == label
                            # and only separated by whitespace
                            and item["end"] <= raw_annotation_end
                            and not raw_text[item["end"]:raw_annotation_start].strip()):
                        item["end"] = raw_annotation_end
                        item["text"] = raw_text[item["start"]:raw_annotation_end]
                        if "confidence" in item and confidences is not None:
                            item["confidence"].append(confidences)
                        extended_existing_label = True
                        break

                if extended_existing_label or label == none_value:
                    continue

                annotation_start, annotation_end = (
                    int(raw_annotation_start),
                    int(raw_annotation_end),
                )

                annotation = {
                    "start": int(annotation_start),
                    "end": int(annotation_end),
                    "label": label,
                    "text": raw_text[annotation_start:annotation_end],
                }
                # if we don't want to allow subtoken predictions, adjust start and end to match
                # the start and ends of the nearest full tokens
                if not subtoken_predictions:
                    round_to_nearest_start_and_end(annotation, spacy_token_starts,
                                                   spacy_token_ends, raw_text)

                if confidences is not None:
                    annotation["confidence"] = [confidences]

                if annotation["start"] >= annotation["end"]:
                    continue

                # prevent duplicate annotation edge case
                annotation_tuple = (annotation["start"], annotation["end"], label)
                if annotation_tuple not in annotation_ranges:
                    annotation_ranges.add(annotation_tuple)
                    doc_annotations.append(annotation)

        if associations:
            associations_seq = assign_associations(associations[doc_idx],
                                                   none_value, subtoken_to_label_idx)
            for label_i, annotation in enumerate(doc_annotations):
                if label_i in associations_seq:
                    index, relationship, prob = associations_seq[label_i]
                    annotation["associations"] = {
                        "index": index,
                        "relationship": relationship,
                        "prob": prob,
                    }

        doc_annotations = sorted([dict(items) for items in doc_annotations],
                                 key=lambda x: span(x))

        for annotation in doc_annotations:
            _merge_confidences(annotation)

        annotations.append(doc_annotations)
    return raw_texts, annotations
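
# Illustrative data only (a hypothetical example, not taken from the library), spelling
# out the two formats that finetune_to_indico_sequence converts between, using the
# strings from the docstring above with none_value="PAD".
#
# finetune internal ("labeled substring") format: parallel lists of substrings and
# labels, with the none_value marking unlabeled spans.
FINETUNE_SUBSEQS = [["The quick brown ", "fox", " jumped over the lazy dog"]]
FINETUNE_LABELS = [["PAD", "animal", "PAD"]]

# 'indico' format: the raw text plus one list of character-offset annotations per document.
INDICO_RAW_TEXTS = ["The quick brown fox jumped over the lazy dog"]
INDICO_ANNOTATIONS = [[
    {"start": 16, "end": 19, "label": "animal", "text": "fox"},
]]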
def overlap_handler(current_annotation, annotation, text, multi_label):
    """
    Scenarios:
        <>  --> current_annotation
        []  --> annotation
        1) < [ > ]
        2) [ < > ]
        3) < [ ] >
    """
    if current_annotation["start"] <= annotation["start"]:
        first, second = current_annotation, annotation
    else:
        first, second = annotation, current_annotation

    final_delimiter = min(first["end"], second["end"])
    final_label = second["label"] if (
        second["end"] > first["end"]) else first["label"]
    overlapping_text = text[second["start"]:final_delimiter]
    end = max(first["end"], second["end"])

    first_chunk = {
        "start": first["start"],
        "end": second["start"],
        "label": first["label"],
        "text": text[first["start"]:second["start"]],
    }

    if multi_label:
        second_label = first["label"] | second["label"]
    else:
        if first["label"] != second["label"] and (len(overlapping_text.strip()) > 1):
            warnings.warn(
                "Found overlapping annotations: {} and {}. \n"
                "Consider setting `multi_label_sequences` to `True` in your config."
                .format(annotation, current_annotation))
        spacy_tokens = NLP(text)
        spacy_token_starts = [token.idx for token in spacy_tokens]
        if second["start"] in spacy_token_starts:
            second_label = second["label"]
        elif final_delimiter in spacy_token_starts:
            second_label = first["label"]
        else:
            second_label = first["label"]

    second_chunk = {
        "start": second["start"],
        "end": final_delimiter,
        "label": second_label,
        "text": overlapping_text,
    }

    third_chunk = {
        "start": final_delimiter,
        "end": end,
        "label": final_label,
        "text": text[final_delimiter:end],
    }

    chunks = [first_chunk, second_chunk, third_chunk]
    chunks = [c for c in chunks if c["start"] != c["end"]]
    return chunks
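
# Hedged usage sketch of overlap_handler for scenario 1) "< [ > ]": two overlapping
# annotations are split into three non-overlapping chunks. Labels are sets here because
# the multi-label path combines them with `|`; the text and offsets are hypothetical.
if __name__ == "__main__":
    text = "acme corporation ltd"
    current = {"start": 0, "end": 16, "label": {"company"}, "text": "acme corporation"}
    new = {"start": 5, "end": 20, "label": {"org"}, "text": "corporation ltd"}
    for chunk in overlap_handler(current, new, text, multi_label=True):
        print(chunk)
    # {'start': 0, 'end': 5, 'label': {'company'}, 'text': 'acme '}
    # {'start': 5, 'end': 16, 'label': {'company', 'org'}, 'text': 'corporation'}  (set order may vary)
    # {'start': 16, 'end': 20, 'label': {'org'}, 'text': ' ltd'}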
def _encode(self, texts, labels=None):
    """
    Convert a batch of raw text to a batch of byte-pair encoded token indices.
    """
    self._lazy_init()
    batch_tokens = []
    batch_token_idxs = []
    batch_label_idxs = []
    # char ends are tracked separately from char starts to account for the fact that some BPEs
    # have different lengths than their original tokens (e.g. special characters such as bullets)
    batch_char_ends = []
    batch_char_starts = []
    label = None
    skipped = 0

    for i, text in enumerate(texts):
        if labels is not None:
            label = labels[i]

        raw_text = text.lower()
        # Only fine to apply this fix because it preserves character locations
        ftfy_text = uncurl_quotes(raw_text)
        tokens = NLP(_text_standardize(text))
        if not tokens:
            skipped += 1
            continue
        i -= skipped
        subtokens = []
        subtoken_idxs = []
        char_starts = []
        char_ends = []
        token_start = 0

        for j, token in enumerate(tokens):
            bpe_toks = self.bpe(token.text).split(" ")
            try:
                if token.text.strip():
                    token_start = ftfy_text.index(token.text.strip(), token_start)
            except ValueError:
                warnings.warn(
                    "Failed to find token `{}` in text.".format(token.text)
                )
                continue

            subtokens.extend(bpe_toks)
            subtoken_idxs.extend(
                [self.encoder.get(SUBS.get(t, t), self.UNK_IDX) for t in bpe_toks]
            )
            assert len("".join(bpe_toks).replace("</w>", "")) == len(
                token.text.replace(" ", "")
            )
            if np.sum([len(tok.replace("</w>", "")) for tok in bpe_toks]) > len(token):
                # the BPEs comprising a token are longer than the token itself
                token_char_ends = (
                    np.asarray([len(token.text.strip()) for tok in bpe_toks])
                    + token_start
                )
            else:
                token_char_ends = (
                    np.cumsum([len(tok.replace("</w>", "")) for tok in bpe_toks])
                    + token_start
                )
            token_char_starts = [token_start] + token_char_ends[:-1].tolist()
            token_start += len(token.text.strip())
            char_ends.extend(token_char_ends)
            char_starts.extend(token_char_starts)

        batch_tokens.append(subtokens)
        batch_token_idxs.append(subtoken_idxs)
        batch_char_ends.append(char_ends)
        batch_char_starts.append(char_starts)
        if labels is not None:
            batch_label_idxs.append([label] * len(subtoken_idxs))

    return EncodedOutput(
        token_ids=batch_token_idxs,
        tokens=batch_tokens,
        labels=batch_label_idxs,
        char_locs=batch_char_ends,
        char_starts=batch_char_starts,
    )
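
# Standalone sketch (hypothetical pieces, not the library's BPE) of the char_starts /
# char_ends bookkeeping above. When the BPE pieces are collectively longer than the raw
# token (as can happen for special characters such as bullets), every piece is clamped
# to the token's end offset; otherwise the ends accumulate piece by piece.
import numpy as np


def _char_spans(bpe_pieces, token_text, token_start):
    lengths = [len(p.replace("</w>", "")) for p in bpe_pieces]
    if sum(lengths) > len(token_text):
        # pieces overshoot the raw token: give every piece the token's end offset
        ends = [token_start + len(token_text.strip())] * len(bpe_pieces)
    else:
        ends = (np.cumsum(lengths) + token_start).tolist()
    starts = [token_start] + ends[:-1]
    return starts, ends


if __name__ == "__main__":
    # ordinary token: the pieces tile the characters exactly
    print(_char_spans(["question", "ing</w>"], "questioning", token_start=0))
    # ([0, 8], [8, 11])
    # special character whose pieces are longer than the single raw character
    print(_char_spans(["â", "€", "¢</w>"], "•", token_start=42))
    # ([42, 43, 43], [43, 43, 43])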