import logging
from typing import Dict, List, Optional
from spacy.tokens import Doc, Span

logger = logging.getLogger(__name__)


def _make_span(self, doc: Doc, start: int, end: int, label: str,
               is_char: bool, retok: bool):
    span: Span
    if is_char:
        if label is None:
            span = doc.char_span(start, end)
        else:
            span = doc.char_span(start, end, label=label)
    else:
        if label is None:
            span = Span(doc, start, end)
        else:
            span = Span(doc, start, end, label=label)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'span ({start}, {end}) for {label}: {span}')
    # span is a Span object, or None if the match does not map to a valid
    # token sequence
    if span is not None:
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'match: {span.text}')
        if label is not None:
            doc.ents += (span,)
        if retok:
            # https://github.com/explosion/spaCy/discussions/4806
            with doc.retokenize() as retokenizer:
                # Merge the span into a single token. This is done after
                # setting the entities – otherwise, it would cause
                # mismatched indices!
                retokenizer.merge(span)
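
# A minimal sketch of the two addressing modes _make_span supports, using
# only public spaCy APIs; the host class is not shown in the snippet, so
# this exercises the underlying calls directly.
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is large")
char_based = doc.char_span(0, 8, label="GPE")  # is_char=True path: character offsets
token_based = Span(doc, 0, 2, label="GPE")     # is_char=False path: token indices
assert char_based.text == token_based.text == "New York"
# Misaligned character offsets return None, which is why _make_span checks
# `span is not None` before touching doc.ents.
assert doc.char_span(0, 6) is None
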
def extract_entity(self, doc: Doc) -> List[Span]:
    food_spans = []
    for food in self.food_names:
        food_index = doc.text.lower().find(food)
        if food_index > -1:
            span = doc.char_span(food_index, food_index + len(food))
            # char_span returns None when the offsets do not align with
            # token boundaries; skip those to match the List[Span] return
            # type
            if span is not None:
                food_spans.append(span)
    return food_spans
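
# A minimal usage sketch, assuming a hypothetical host object whose only
# relevant attribute is `food_names` (the snippet does not show the class).
import spacy
from types import SimpleNamespace

nlp = spacy.blank("en")
extractor = SimpleNamespace(food_names=["pizza", "ice cream"])
doc = nlp("I had Pizza and ice cream.")
print([s.text for s in extract_entity(extractor, doc)])
# ['Pizza', 'ice cream']: matching is lowercased, spans keep original casing
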
def split_span(doc: Doc, span: Span) -> List[Span]:
    """Split a span into multiple spans (one token per span)."""
    s = doc.text
    new_spans = list()
    label = span.label_
    start_search = span.start_char
    for word in span:
        start = s.index(word.text, start_search, span.end_char)
        end = start + len(word.text)
        new_spans.append(doc.char_span(start, end, label=label))
        # resume the search right after this token; advancing by the word
        # length alone lags behind whenever tokens are separated by
        # whitespace and can re-match text inside a previous token
        start_search = end
    return new_spans
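
# A small usage sketch: split a multi-token entity span into per-token
# spans that all carry the original label.
import spacy

nlp = spacy.blank("en")
doc = nlp("New York City is large")
span = Span(doc, 0, 3, label="GPE")
print([(s.text, s.label_) for s in split_span(doc, span)])
# [('New', 'GPE'), ('York', 'GPE'), ('City', 'GPE')]
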
def offsets_from_tags(
    doc: Doc,
    tags: List[str],
    label_encoding: Optional[str] = "BIOUL",
    only_token_spans: bool = False,
) -> List[Dict]:
    """Converts BIOUL or BIO tags to offsets

    Parameters
    ----------
    doc
        A spaCy Doc created with `text` and the backbone tokenizer
    tags
        A list of BIOUL or BIO tags
    label_encoding
        The label encoding of the tags: BIOUL or BIO
    only_token_spans
        If True, offsets contain only token index references. Default is False

    Returns
    -------
    offsets
        A list of dicts with start and end character/token index with respect
        to the doc and the span label:
        `{"start": int, "end": int, "start_token": int, "end_token": int,
        "label": str}`
    """
    # spacy's biluo_tags_to_offsets surprisingly does not check this ...
    if len(doc) != len(tags):
        raise ValueError(
            f"Number of tokens and tags must be the same, "
            f"but len({list(doc)}) != len({tags})"
        )

    if label_encoding == "BIO":
        tags = to_bioul(tags, encoding="BIO")

    offsets = []
    for start, end, label in biluo_tags_to_offsets(doc, tags):
        span = doc.char_span(start, end)
        data = {
            "start_token": span.start,
            "end_token": span.end,
            "label": label,
        }
        if not only_token_spans:
            data.update({"start": start, "end": end})
        offsets.append(data)
    return offsets
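
# A usage sketch: the two helpers the function relies on are assumed to be
# spaCy v3's `biluo_tags_to_offsets` (imported below so the call resolves)
# and AllenNLP's `to_bioul`, which is only reached for BIO-encoded input.
import spacy
from spacy.training import biluo_tags_to_offsets

nlp = spacy.blank("en")
doc = nlp("Alice visited Berlin")
print(offsets_from_tags(doc, ["U-PER", "O", "U-LOC"]))
# [{'start_token': 0, 'end_token': 1, 'label': 'PER', 'start': 0, 'end': 5},
#  {'start_token': 2, 'end_token': 3, 'label': 'LOC', 'start': 14, 'end': 20}]
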
def __call__(self, doc: Doc):
    """Apply the pipeline component to a Doc object and modify it if matches
    are found. Return the Doc, so it can be processed by the next component
    in the pipeline, if available.

    References:
        - ``https://spacy.io/usage/processing-pipelines#component-example2``.

    Args:
        doc (Doc): spaCy document.

    Returns:
        doc
    """
    if not self.crf_extractor:
        raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                           "Did you call the `.from_disk()` method?")

    example = {"doc": doc, "text": doc.text}
    self.spacy_tokenizer.tokenize(example, attribute="doc")
    spans = [
        doc.char_span(entity_dict["start"], entity_dict["end"],
                      label=entity_dict["entity"])
        for entity_dict in self.crf_extractor.process(example)
    ]
    doc.ents = list(doc.ents) + spans
    # Iterate over all spans and merge each into one token. This is done
    # after setting the entities – otherwise, it would cause mismatched
    # indices! (`Span.merge()` was removed in spaCy v3; the retokenizer is
    # the supported way to merge.)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc
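
# A hedged usage sketch: the construction and `from_disk` chaining below are
# assumptions based on the RuntimeError message, and the model path is
# hypothetical; only the `__call__` contract comes from the snippet itself.
import spacy

nlp = spacy.blank("en")
crf = CRFEntityExtractor().from_disk("path/to/crf_model")  # hypothetical path
doc = crf(nlp("book a table at a restaurant tonight"))
print([(ent.text, ent.label_) for ent in doc.ents])
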
def parse_read_doc(self, doc: Doc = None) -> List:
    return [doc.char_span(span[0], span[1]) for span in doc._.entities]
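
# A usage sketch, assuming `entities` is a custom Doc extension holding
# (start_char, end_char) pairs, as the method body implies; the host class
# is not shown, so the function is called with a placeholder `self`.
import spacy

Doc.set_extension("entities", default=None, force=True)
nlp = spacy.blank("en")
doc = nlp("Paris is in France")
doc._.entities = [(0, 5), (12, 18)]
print([s.text for s in parse_read_doc(None, doc)])  # `self` is unused
# ['Paris', 'France']
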
def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    doc_rels = []
    doc_evs = []
    # store events as relations; include confidence scores in the relation
    # tuple (TODO: add relation property)
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                # example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0],
                #                 [45, 45, 'ARG0', 11.3498, 1.0]]
                arg1s = [r for r in ev if r[2] == "ARG1"]
                e_trig = doc[trig[0][0]:trig[0][0] + 1]
                for arg0 in arg0s:
                    e_arg0 = doc[arg0[0]:arg0[1] + 1]
                    for arg1 in arg1s:
                        e_arg1 = doc[arg1[0]:arg1[1] + 1]
                        # confidence is set to the minimum among
                        # {trigger, args}, as a conservative measure
                        sent_evs.append({
                            "ARG0": e_arg0,
                            "ARG1": e_arg1,
                            "RELATION_TRIGGER": e_trig,
                            "CONF": min([arg0[4], arg1[4], trig[0][3]]),
                        })
        doc_evs.append(sent_evs)
        ds._.events = sent_evs
    doc._.events = doc_evs

    # TODO: add doc._.span_ents too.
    for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents):
        sent_rels = []
        for rel in rels:
            e1 = doc[rel[0]:rel[1] + 1]
            e2 = doc[rel[2]:rel[3] + 1]
            tag = rel[4]
            sent_rels.append((e1, e2, tag))
        doc_rels.append(sent_rels)
        ds._.rels = sent_rels
    doc._.rels = doc_rels

    if "predicted_ner" not in prediction:
        return doc
    preds = [p for r in prediction.get("predicted_ner", []) for p in r]

    # store all span-based entities in doc._.span_ents
    span_ents = []
    for sent in prediction["predicted_ner"]:
        ent_sent = []
        for ent in sent:
            d = doc[ent[0]:ent[1] + 1]
            d._.label_ = ent[2]
            ent_sent.append(d)
        span_ents.append(ent_sent)
    doc._.span_ents = span_ents

    # store entities in doc.ents; spaCy does not support overlapping
    # entities, so overlapping predictions are merged into the longest one
    dist_ents = []
    prc = []
    for i, p1 in enumerate(preds):
        t = [p1]
        if i in prc:
            continue
        for j, p2 in enumerate(preds[i + 1:]):
            if p2[0] <= p1[1]:
                # collect the overlapping prediction itself so the merge
                # below can extend the span bounds
                t.append(p2)
                prc.append(j + i + 1)
        dist_ents.append(t)
    res = []
    for t in dist_ents:
        if len(t) == 1:
            res.append(t[0])
        elif len(t) > 1:
            mn = t[0][0]
            mx = t[0][1]
            for p in t[1:]:
                if p[0] < mn:
                    mn = p[0]
                if p[1] > mx:
                    mx = p[1]
            res.append([mn, mx, t[0][2], t[0][3], t[0][4]])
    sel_ents = []
    for ent in res:
        try:
            d = doc[ent[0]:ent[1] + 1]
            s = doc.char_span(d.start_char, d.end_char, label=ent[2])
            if s:
                sel_ents.append(s)
        except Exception as e:
            print("error in spacy span", e)
            raise e
    doc.ents = sel_ents
    return doc
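
# A hedged end-to-end sketch: the extension registrations below are implied
# by the function body, and the prediction layout (token-index spans plus
# scores) is an assumption modelled on DyGIE-style output.
import spacy

for name in ("events", "rels", "span_ents"):
    Doc.set_extension(name, default=None, force=True)
for name in ("events", "rels"):
    Span.set_extension(name, default=None, force=True)
Span.set_extension("label_", default=None, force=True)

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # provide doc.sents for the per-sentence loops
doc = nlp("Acme hired Bob.")
prediction = {"predicted_ner": [[[0, 0, "ORG", 10.0, 1.0],
                                 [2, 2, "PER", 9.0, 1.0]]]}
doc = prepare_spacy_doc(doc, prediction)
print([(e.text, e.label_) for e in doc.ents])
# [('Acme', 'ORG'), ('Bob', 'PER')]
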