Example #1
File: matchers.py  Project: gkovaig/trove
def match_regex(rgx, span):
    """Return Span object for regex match"""
    m = (re.search(rgx, span.text, re.I) if type(rgx) is str
         else rgx.search(span.text))
    if not m:
        return None
    i, j = m.span()
    if type(span) is Span:
        i += span.char_start
        j += span.char_start
        return Span(i, j - 1, span.sentence)
    return Span(i, j - 1, span)
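A minimal usage sketch (not from the project): the bare object below stands in for a trove Sentence and exposes only the .text attribute read by the non-Span branch, and it assumes the returned Span stores its inclusive char_start/char_end as attributes, as the code above does.

from types import SimpleNamespace

# Hypothetical sentence stand-in; only .text is read by the non-Span branch.
sent = SimpleNamespace(text="Patient denies chest pain today.")

hit = match_regex(r"chest pain", sent)
if hit is not None:
    # char_end is inclusive, hence the +1 in the slice
    print(sent.text[hit.char_start:hit.char_end + 1])  # -> chest pain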
Example #2
    def tag(self, document, ngrams=None):
        """
        Use existing labeled data to generate Span objects
        """
        if document.name not in self.annotations:
            return

        n_errs = 0
        entities = {sent.i: {} for sent in document.sentences}
        for anno in self.annotations[document.name]:
            # get parent sentence for this span
            sent = self._get_span_sentence(anno.abs_char_start,
                                           anno.abs_char_end,
                                           document.sentences)
            if not sent:
                n_errs += 1
                continue

            offset = sent.abs_char_offsets[0]
            span = Span(anno.abs_char_start - offset,
                        anno.abs_char_end - offset,
                        sentence=sent)

            # HACK -- exclude all entities that are overlapping/nested
            # within header spans (TODO: move to separate pipeline module)
            ignore_span = False
            if 'HEADER' in document.annotations[sent.i]:
                for h in document.annotations[sent.i]['HEADER']:
                    if h is not None and self._is_overlapping(h, span):
                        ignore_span = True
                        break

            if ignore_span:
                continue

            if self.type_name not in entities[sent.i]:
                entities[sent.i][self.type_name] = []
            entities[sent.i][self.type_name].append(span)

        for i in entities:
            document.annotations[i].update(entities[i])

        if n_errs > 0:
            print(f'Skipped {n_errs} entities in {document.name}')
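The _is_overlapping helper is not shown in the snippet; a plausible sketch, assuming both arguments expose inclusive char_start/char_end offsets as in the other examples:

    def _is_overlapping(self, a, b):
        # Two inclusive character spans overlap iff neither one
        # ends strictly before the other begins.
        return a.char_start <= b.char_end and b.char_start <= a.char_end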
Example #3
File: matchers.py  Project: gkovaig/trove
def match_rgx(rgx: Pattern, sentence: Sentence) -> Dict[Tuple, Span]:
    """Match a regular expression to a sentence
    TODO: search over ngrams vs. entire sentence by default

    Parameters
    ----------
    rgx
    sentence

    Returns
    -------

    """
    matches = {}
    for match in rgx.finditer(sentence.text):
        start, end = match.span()
        span = Span(char_start=start, char_end=end - 1, sentence=sentence)
        matches[(start, end - 1, end - 1 - start)] = span
    return matches
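A usage sketch with a hypothetical sentence stand-in (trove's Sentence carries more state than .text, and Span is assumed to accept any sentence-like object); each key packs (char_start, inclusive char_end, char_end - char_start):

import re
from types import SimpleNamespace

sent = SimpleNamespace(text="BP 120/80, HR 72")

for (start, end, _), span in match_rgx(re.compile(r"\d+/\d+|\d+"), sent).items():
    print(start, end, sent.text[start:end + 1])
# 3 8 120/80
# 14 15 72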
Example #4
    def apply(self, s):
        # convert to source char offsets
        text = get_text(s.words, s.char_offsets)

        # apply alternate tokenization
        if self.split_on:
            words, char_offsets = retokenize(s, self.split_on)
        else:
            words, char_offsets = s.words, s.char_offsets

        for i in range(len(words)):
            start = char_offsets[i]
            # ignore leading whitespace
            if not words[i].strip():
                continue
            for j in range(i + 1, min(i + self.max_ngrams + 1,
                                      len(words) + 1)):
                # ignore trailing whitespace
                if not words[j - 1].strip():
                    continue
                end = char_offsets[j - 1] + len(words[j - 1])
                yield Span(start, end - 1, s)
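Stripped of the Span machinery, the ngram window arithmetic above reduces to the following standalone sketch (hypothetical helper, not project code):

def ngram_char_spans(words, char_offsets, max_ngrams):
    """Yield (char_start, inclusive char_end) for every ngram window."""
    for i in range(len(words)):
        if not words[i].strip():           # ignore leading whitespace tokens
            continue
        start = char_offsets[i]
        for j in range(i + 1, min(i + max_ngrams + 1, len(words) + 1)):
            if not words[j - 1].strip():   # ignore trailing whitespace tokens
                continue
            end = char_offsets[j - 1] + len(words[j - 1])
            yield start, end - 1

words = ["acute", "renal", "failure"]
print(list(ngram_char_spans(words, [0, 6, 12], max_ngrams=2)))
# [(0, 4), (0, 10), (6, 10), (6, 18), (12, 18)]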
Example #5
def extract_long_form(i, sentence, max_dup_chars=2):
    '''
    Search the left window for a candidate long-form sequence.
    Use the heuristic of "match first character" to guess long form
    '''
    short_form = sentence.words[i]
    left_window = list(sentence.words[0:i])

    # strip brackets/parentheses
    while left_window and left_window[-1] in ["(", "[", ":"]:
        left_window.pop()

    if len(left_window) == 0:
        return None

    # match longest seq to the left of our short form
    # that matches on starting character
    long_form = []
    char = short_form[0].lower()
    letters = [t[0].lower() for t in short_form]
    letters = [t for t in letters if t == char]
    letters = letters[0:min(len(letters), max_dup_chars)]

    matched = False

    for t in left_window[::-1]:
        if t[0] in "()[]-+,":
            break

        if len(letters) == 1 and t[0].lower() == letters[0]:
            long_form += [t]
            matched = True
            break

        elif len(letters) > 1 and t[0].lower() == letters[0]:
            long_form += [t]
            matched = True
            letters.pop(0)

        else:
            long_form += [t]

    # We didn't find the first letter of our short form, so
    # back-off and choose the longest contiguous noun phrase
    if (len(left_window) == len(long_form) and
            letters[0] != t[0].lower() and
            len(long_form) > 1) or not matched:

        tags = list(zip(sentence.words[0:i - 1],
                        sentence.pos_tags[0:i - 1]))[::-1]
        noun_phrase = []

        while tags:
            t = tags.pop(0)
            if re.search("^(NN[PS]*|JJ)$", t[1]):
                noun_phrase.append(t)
            else:
                break

        if noun_phrase:
            long_form = list(zip(*noun_phrase))[0]

    # create candidate
    n = len(long_form)
    offsets = sentence.char_offsets[0:i - 1][-n:]
    char_start = min(offsets)
    words = sentence.words[0:i - 1][-n:]

    offsets = map(lambda x: len(x[0]) + x[1], zip(words, offsets))
    char_end = max(offsets)

    return Span(char_start, char_end - 1, sentence)
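A worked example on a hypothetical sentence stand-in, assuming the returned Span exposes inclusive char_start/char_end attributes: with the short form "MRI" at word index 4, the first-character heuristic walks left and recovers the long form.

from types import SimpleNamespace

sent = SimpleNamespace(
    text="magnetic resonance imaging ( MRI )",
    words=["magnetic", "resonance", "imaging", "(", "MRI", ")"],
    char_offsets=[0, 9, 19, 27, 29, 33],
    pos_tags=["JJ", "NN", "NN", "-LRB-", "NNP", "-RRB-"],
)

span = extract_long_form(4, sent)
print(sent.text[span.char_start:span.char_end + 1])  # magnetic resonance imaging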
Example #6
    def __call__(self, sentence: Sentence) -> Dict[int, int]:
        """

        Parameters
        ----------
        sentence

        Returns
        -------

        """
        matches = apply_matcher(sentence.words,
                                sentence.char_offsets,
                                self.ontology,
                                max_ngrams=self.max_ngrams,
                                longest_match_only=True,
                                case_sensitive=self.case_sensitive)

        matches = sorted(matches, key=lambda x: x[0])
        if not matches:
            return {}

        matches, labels = self._merge_matches(matches)
        terms = [m[-1] for m in matches]

        # Slot-filled matches
        f_matches = []
        mask = np.array([0] * len(matches))
        for slot in self.slot_rgxs:
            n_args = slot.count('{}')
            args = list(zip(terms, labels))

            for i in range(len(args) - n_args + 1):

                # skip arguments that are already matched
                if 1 in mask[i:i + n_args]:
                    continue

                xs, ys = zip(*args[i:i + n_args])

                # HACK - positive classes only
                if None in ys or 2 in ys:
                    continue

                rgx = re.compile(slot.format(*xs), re.I)
                m = match_rgx(rgx, sentence)
                if m:
                    m = list(m.items())[0]
                    span = list(m[0][0:2])
                    span[-1] += 1
                    m = tuple([span, m[-1].text])
                    # expand the argument matches to this span
                    mask[i:i + n_args] = 1
                    f_matches.append((m, np.unique(ys)[0]))

        # add slot filled matches
        matches = [m for i, m in zip(mask, matches) if i == 0]
        labels = [y for i, y in zip(mask, labels) if i == 0]
        for m, y in f_matches:
            matches.append(m)
            labels.append(y)

        L = {}
        for ((char_start, char_end), term), label in zip(matches, labels):
            #key = term.lower() if term.lower() in self._labels else term

            # None labels are treated as abstains
            if not label:
                continue

            # check span-specific rules
            if self.span_rule and label == 1:
                span = Span(char_start, char_end - 1, sentence)
                if self.span_rule(span):
                    label = 2

            if term.lower() in self.stopwords or term in self.stopwords:
                label = 2
                #label = self.stopwords[key]

            start, end = get_word_index_span((char_start, char_end - 1),
                                             sentence)
            for i in range(start, end + 1):
                L[i] = label

        return L
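get_word_index_span is assumed from context: it maps an inclusive character span back to the first and last word indices it touches. A plausible sketch consistent with how the result is used above:

def get_word_index_span(char_span, sentence):
    # A word at offset o with length n covers characters [o, o + n).
    char_start, char_end = char_span
    idxs = [i for i, o in enumerate(sentence.char_offsets)
            if o <= char_end and o + len(sentence.words[i]) > char_start]
    return idxs[0], idxs[-1]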