def match_regex(rgx, span):
    """Return a Span for the first regex match in ``span.text``, or None.

    Parameters
    ----------
    rgx : str or compiled pattern
        A string pattern is searched case-insensitively; a precompiled
        pattern is used as-is with its own flags (``re.I`` is NOT added).
    span : Span or sentence-like object
        Must expose ``.text``. When a Span is given, match offsets are
        translated back to sentence-level character offsets.

    Returns
    -------
    Span or None
    """
    if isinstance(rgx, str):
        m = re.search(rgx, span.text, re.I)
    else:
        m = rgx.search(span.text)
    if not m:
        return None
    i, j = m.span()
    # NOTE(review): exact-type check (not isinstance) kept from the
    # original — confirm Span subclasses shouldn't take this path.
    if type(span) is Span:
        # Translate span-local offsets to sentence-level offsets.
        i += span.char_start
        j += span.char_start
        return Span(i, j - 1, span.sentence)
    return Span(i, j - 1, span)
def tag(self, document, ngrams=None):
    """Project existing labeled data for ``document`` onto Span objects.

    Annotations whose parent sentence cannot be located are counted and
    skipped; entities overlapping a HEADER span are dropped. Results are
    merged into ``document.annotations`` keyed by sentence index.
    """
    if document.name not in self.annotations:
        return
    skipped = 0
    per_sent = {sent.i: {} for sent in document.sentences}

    for anno in self.annotations[document.name]:
        # locate the parent sentence for this annotation
        sent = self._get_span_sentence(anno.abs_char_start,
                                       anno.abs_char_end,
                                       document.sentences)
        if not sent:
            skipped += 1
            continue

        base = sent.abs_char_offsets[0]
        span = Span(anno.abs_char_start - base,
                    anno.abs_char_end - base,
                    sentence=sent)

        # HACK -- exclude all entities that are overlapping/nested
        # within header spans (TODO move to separate pipeline module)
        sent_annos = document.annotations[sent.i]
        if 'HEADER' in sent_annos and any(
                h is not None and self._is_overlapping(h, span)
                for h in sent_annos['HEADER']):
            continue

        per_sent[sent.i].setdefault(self.type_name, []).append(span)

    for idx in per_sent:
        document.annotations[idx].update(per_sent[idx])

    if skipped > 0:
        print(f'Skipped {document.name}({skipped}) entities')
def match_rgx(rgx: Pattern, sentence: Sentence) -> Dict[Tuple, Span]:
    """Find every match of a compiled regex in a sentence's text.

    TODO: search over ngrams vs. entire sentence by default

    Parameters
    ----------
    rgx
        Compiled regular expression.
    sentence
        Sentence whose ``.text`` is scanned.

    Returns
    -------
    Dict mapping ``(char_start, char_end, length)`` keys to Span objects
    (char_end is inclusive).
    """
    found = {}
    for hit in rgx.finditer(sentence.text):
        i, j = hit.span()
        last = j - 1
        found[(i, last, last - i)] = Span(char_start=i,
                                          char_end=last,
                                          sentence=sentence)
    return found
def apply(self, s):
    """Yield candidate ngram Spans (1..max_ngrams words) over sentence ``s``.

    Whitespace-only tokens are skipped both as ngram starts and ends.
    Character offsets refer to the source sentence; Span char_end is
    inclusive.

    Parameters
    ----------
    s : sentence-like object exposing ``.words`` and ``.char_offsets``

    Yields
    ------
    Span
    """
    # convert to source char offsets
    # NOTE(review): `text` is unused below — kept in case get_text
    # validates its inputs; confirm it is side-effect free before removing.
    text = get_text(s.words, s.char_offsets)

    # apply alternate tokenization if configured
    if self.split_on:
        words, char_offsets = retokenize(s, self.split_on)
    else:
        words, char_offsets = s.words, s.char_offsets

    for i, word in enumerate(words):
        # ignore leading whitespace
        if not word.strip():
            continue
        start = char_offsets[i]
        for j in range(i + 1, min(i + self.max_ngrams + 1, len(words) + 1)):
            # ignore trailing whitespace
            if not words[j - 1].strip():
                continue
            end = char_offsets[j - 1] + len(words[j - 1])
            yield Span(start, end - 1, s)
def extract_long_form(i, sentence, max_dup_chars=2):
    """Search the left window of token ``i`` for a candidate long-form sequence.

    Uses the heuristic of "match first character" to guess the long form
    of an abbreviation (e.g. the short form at index ``i`` in
    "heart failure (HF)").

    Parameters
    ----------
    i : int
        Index of the short-form token in ``sentence.words``.
    sentence : sentence-like object with ``.words``, ``.pos_tags``,
        ``.char_offsets``.
    max_dup_chars : int
        Max number of duplicated leading characters of the short form to
        align against left-window tokens.

    Returns
    -------
    Span or None
    """
    short_form = sentence.words[i]
    left_window = [w for w in sentence.words[0:i]]

    # strip brackets/parentheses
    while left_window and left_window[-1] in ["(", "[", ":"]:
        left_window.pop()

    if len(left_window) == 0:
        return None

    # match longest seq to the left of our short form
    # that matches on starting character
    long_form = []
    char = short_form[0].lower()
    letters = [t[0].lower() for t in short_form]
    letters = [t for t in letters if t == char]
    letters = letters[0:min(len(letters), max_dup_chars)]

    matched = False
    for t in left_window[::-1]:
        if t[0] in "()[]-+,":
            break
        if len(letters) == 1 and t[0].lower() == letters[0]:
            long_form += [t]
            matched = True
            break
        elif len(letters) > 1 and t[0].lower() == letters[0]:
            long_form += [t]
            matched = True
            letters.pop(0)
        else:
            long_form += [t]

    # We didn't find the first letter of our short form, so
    # back-off and choose the longest contiguous noun phrase
    if (len(left_window) == len(long_form) and \
        letters[0] != t[0].lower() and \
        len(long_form) > 1) or not matched:

        tags = list(zip(sentence.words[0:i - 1],
                        sentence.pos_tags[0:i - 1]))[::-1]
        noun_phrase = []
        while tags:
            t = tags.pop(0)
            if re.search("^(NN[PS]*|JJ)$", t[1]):
                noun_phrase.append(t)
            else:
                break
        if noun_phrase:
            long_form = list(zip(*noun_phrase))[0]

    # create candidate span from source character offsets
    # NOTE(review): if long_form ends up empty, n == 0 makes [-n:] take the
    # whole slice (and min() can raise on an empty window) — confirm callers
    # guarantee a non-trivial left window.
    n = len(long_form)
    offsets = sentence.char_offsets[0:i - 1][-n:]
    char_start = min(offsets)
    words = sentence.words[0:i - 1][-n:]
    offsets = map(lambda x: len(x[0]) + x[1], zip(words, offsets))
    char_end = max(offsets)
    return Span(char_start, char_end - 1, sentence)
def __call__(self, sentence: Sentence) -> Dict[int, int]:
    """Label each word of ``sentence`` via ontology and slot-filled regexes.

    Ontology matches are merged, then slot regex templates are instantiated
    with consecutive matched terms and re-matched against the sentence;
    slot-filled matches replace the argument matches they subsume.

    Parameters
    ----------
    sentence

    Returns
    -------
    Dict mapping word index -> integer label; unmatched words are omitted
    (treated as abstains).
    """
    matches = apply_matcher(sentence.words,
                            sentence.char_offsets,
                            self.ontology,
                            max_ngrams=self.max_ngrams,
                            longest_match_only=True,
                            case_sensitive=self.case_sensitive)
    matches = sorted(matches, key=lambda x: x[0], reverse=False)
    if not matches:
        return {}

    matches, labels = self._merge_matches(matches)
    terms = [m[-1] for m in matches]

    # Slot-filled matches
    f_matches = []
    mask = np.array([0] * len(matches))
    for slot in self.slot_rgxs:
        n_args = slot.count('{}')
        args = list(zip(terms, labels))
        for i in range(len(args) - n_args + 1):
            # skip arguments that are already matched
            if 1 in mask[i:i + n_args]:
                continue
            xs, ys = zip(*args[i:i + n_args])
            # HACK - positive classes only
            if None in ys or 2 in ys:
                continue
            rgx = re.compile(slot.format(*xs), re.I)
            m = match_rgx(rgx, sentence)
            if m:
                m = list(m.items())[0]
                # convert inclusive char_end back to an exclusive offset
                span = list(m[0][0:2])
                span[-1] += 1
                m = tuple([span, m[-1].text])
                # expand the argument matches to this span
                mask[i:i + n_args] = 1
                f_matches.append((m, np.unique(ys)[0]))

    # drop matches subsumed by slot fills, then add the slot-filled matches
    matches = [m for i, m in zip(mask, matches) if i == 0]
    labels = [y for i, y in zip(mask, labels) if i == 0]
    for m, y in f_matches:
        matches.append(m)
        labels.append(y)

    L = {}
    for ((char_start, char_end), term), label in zip(matches, labels):
        # None labels are treated as abstains
        if not label:
            continue
        # check span-specific rules: a positive match may be flipped negative
        if self.span_rule and label == 1:
            span = Span(char_start, char_end - 1, sentence)
            if self.span_rule(span):
                label = 2
        # stopworded terms are forced negative
        if term.lower() in self.stopwords or term in self.stopwords:
            label = 2
        start, end = get_word_index_span((char_start, char_end - 1),
                                         sentence)
        for i in range(start, end + 1):
            L[i] = label
    return L