Example #1
from spacy.attrs import intify_attrs
from spacy.util import filter_spans


def tag_noun_chunks(doc):
    # Merge named entities into single tokens first.
    spans = filter_spans(list(doc.ents))
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            retokenizer.merge(doc[span.start:span.end],
                              attrs=intify_attrs({'ent_type': 'NOUN_CHUNK'},
                                                 string_store))

    # Then merge the remaining noun chunks the same way.
    spans = filter_spans(list(doc.noun_chunks))
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            retokenizer.merge(doc[span.start:span.end],
                              attrs=intify_attrs({'ent_type': 'NOUN_CHUNK'},
                                                 string_store))
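
A minimal usage sketch of the function above (the model name en_core_web_sm and the sample sentence are assumptions, not part of the original snippet; any pipeline with a parser and NER works):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model
doc = nlp("Apple is buying a U.K. startup for one billion dollars.")
tag_noun_chunks(doc)
# Entities and noun chunks are now single tokens tagged NOUN_CHUNK.
print([(token.text, token.ent_type_) for token in doc])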
Example #2
import pytest

from spacy.attrs import IS_ALPHA, LEMMA, intify_attrs


@pytest.mark.parametrize("text", ["dog"])  # sample value; the original test is parametrized
def test_attrs_idempotence(text):
    int_attrs = intify_attrs({"lemma": text, "is_alpha": True}, strings_map={text: 10})
    assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
Example #3
import pytest

from spacy.attrs import ENT_IOB, intify_attrs


def test_attrs_ent_iob_intify():
    # spaCy stores IOB as integers: 0 = unset, 1 = "I", 2 = "O", 3 = "B".
    int_attrs = intify_attrs({"ENT_IOB": ""})
    assert int_attrs == {ENT_IOB: 0}

    int_attrs = intify_attrs({"ENT_IOB": "I"})
    assert int_attrs == {ENT_IOB: 1}

    int_attrs = intify_attrs({"ENT_IOB": "O"})
    assert int_attrs == {ENT_IOB: 2}

    int_attrs = intify_attrs({"ENT_IOB": "B"})
    assert int_attrs == {ENT_IOB: 3}

    int_attrs = intify_attrs({ENT_IOB: ""})
    assert int_attrs == {ENT_IOB: 0}

    int_attrs = intify_attrs({ENT_IOB: "I"})
    assert int_attrs == {ENT_IOB: 1}

    int_attrs = intify_attrs({ENT_IOB: "O"})
    assert int_attrs == {ENT_IOB: 2}

    int_attrs = intify_attrs({ENT_IOB: "B"})
    assert int_attrs == {ENT_IOB: 3}

    with pytest.raises(ValueError):
        int_attrs = intify_attrs({"ENT_IOB": "XX"})

    with pytest.raises(ValueError):
        int_attrs = intify_attrs({ENT_IOB: "XX"})
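
The uppercase names used in these tests are integer attribute IDs from spacy.attrs; a quick sketch of how they relate, grounded in the test above (IDS is the name-to-ID mapping exposed by spacy.attrs):

from spacy.attrs import ENT_IOB, IDS, intify_attrs

# The constants are plain integers; IDS maps attribute names to them.
assert IDS["ENT_IOB"] == ENT_IOB

# intify_attrs accepts either key form and normalizes IOB string values
# to spaCy's internal encoding (0 = unset, 1 = "I", 2 = "O", 3 = "B").
assert intify_attrs({"ENT_IOB": "B"}) == intify_attrs({ENT_IOB: "B"}) == {ENT_IOB: 3}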
Example #4
    def spacy_dependency_parse(self, charter_abstract):
        """
        Execute the spaCy NLP pipeline on a charter abstract.

        :param charter_abstract: the charter abstract as UTF-8-encoded bytes
        :return: the spaCy doc object
        """
        # Caution with sentence segmentation: spaCy's German model tends to
        # split off some elements as sentences too eagerly, see
        # https://github.com/explosion/spaCy/issues/1756 and
        # https://spacy.io/usage/linguistic-features#sbd

        # Lemmatisation issues:
        # Merge NEs into a single token: https://github.com/explosion/spaCy/issues/2193
        # Lemmatise NEs: https://github.com/explosion/spaCy/issues/1809
        # Issues with spaCy's German lemmatiser (1): https://github.com/explosion/spaCy/issues/2486
        # Issues with spaCy's German lemmatiser (2): https://github.com/explosion/spaCy/issues/2668

        doc = self.nlp(charter_abstract.decode('utf-8'))
        # Collect entity offsets, labels and lemmas before retokenizing.
        entities = [(ent.start, ent.end, ent.label, ent.lemma_)
                    for ent in doc.ents]

        # merge and retokenize named entities
        with doc.retokenize() as retokenizer:
            string_store = doc.vocab.strings
            for start, end, label, lemma in entities:
                retokenizer.merge(doc[start:end],
                                  attrs=intify_attrs(
                                      {
                                          'ent_type': label,
                                          'lemma': lemma
                                      }, string_store))

        return doc
Example #5
def merge_entities(doc):
    """
    Merge named entities into single tokens in ``doc``, *in-place*. Can be used
    as a stand-alone function, or as part of a spaCy language pipeline::

        >>> spacy_lang = textacy.load_spacy('en')
        >>> spacy_lang.add_pipe(merge_entities, after='ner')
        >>> doc = spacy_lang('The entity in this sentence is Burton DeWilde.')
        >>> doc[-2]
        Burton DeWilde

    Args:
        doc (``SpacyDoc``)

    Returns:
        ``SpacyDoc``: Input ``doc`` with merged entities.
    """
    try:  # retokenizer was added to spacy in v2.0.11
        with doc.retokenize() as retokenizer:
            string_store = doc.vocab.strings
            for ent in doc.ents:
                retokenizer.merge(doc[ent.start:ent.end],
                                  attrs=intify_attrs({'ent_type': ent.label},
                                                     string_store))
    except AttributeError:
        ents = [(ent.start_char, ent.end_char, ent.label) for ent in doc.ents]
        for start_char, end_char, label in ents:
            doc.merge(start_char, end_char, ent_type=label)
    return doc
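
A minimal stand-alone sketch of the same call, based on the docstring above (assumes spaCy v2.x and an installed en_core_web_sm model, which is not named in the original; whether the name is recognized depends on the NER model):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model
doc = nlp("The entity in this sentence is Burton DeWilde.")
merge_entities(doc)
# If the NER model tags the name, it is now a single token.
print([token.text for token in doc])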
Example #6
import pytest

from spacy.attrs import IS_ALPHA, ORTH, intify_attrs


@pytest.mark.parametrize("text", ["dog"])  # sample value; the original test is parametrized
def test_attrs_do_deprecated(text):
    int_attrs = intify_attrs(
        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
    )
    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
Example #7
from spacy.attrs import intify_attrs
from spacy.util import filter_spans


def tag_chunks_spans(doc, spans, span_type):
    # Merge the given spans into single tokens, tagging them with span_type.
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            retokenizer.merge(doc[span.start:span.end],
                              attrs=intify_attrs({'ent_type': span_type},
                                                 string_store))


def tag_chunks(doc):
    # Merge entities and noun chunks alike, tagging everything as ENTITY.
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            retokenizer.merge(doc[span.start:span.end],
                              attrs=intify_attrs({'ent_type': 'ENTITY'},
                                                 string_store))
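
Because tag_chunks_spans takes the spans and the tag as arguments, the same merge can be reused for other span types; a small sketch (the model name and the NP label are assumptions):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model
doc = nlp("The quick brown fox jumps over the lazy dog.")
# Merge only noun chunks, tagging them NP instead of ENTITY.
tag_chunks_spans(doc, list(doc.noun_chunks), "NP")
print([(token.text, token.ent_type_) for token in doc])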
Example #9
from typing import Iterable

from spacy import attrs
from spacy.tokens import Doc, Span


def merge_spans(spans: Iterable[Span], doc: Doc) -> None:
    """
    Merge spans into single tokens in ``doc``, *in-place*.

    Args:
        spans (Iterable[:class:`spacy.tokens.Span`])
        doc (:class:`spacy.tokens.Doc`)
    """
    with doc.retokenize() as retokenizer:
        string_store = doc.vocab.strings
        for span in spans:
            retokenizer.merge(
                doc[span.start:span.end],
                attrs=attrs.intify_attrs({"ent_type": span.label},
                                         string_store),
            )
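
Since the function accepts arbitrary spans, it works for noun chunks as well as entities; a brief sketch (the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
merge_spans(list(doc.noun_chunks), doc)
# Each noun chunk is now a single token.
print([token.text for token in doc])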
Example #10
from spacy import attrs


def merge_spans(spans, doc):
    """
    Merge spans into single tokens in ``doc``, *in-place*.

    Args:
        spans (Iterable[``spacy.Span``])
        doc (``spacy.Doc``)
    """
    try:  # retokenizer was added to spacy in v2.0.11
        with doc.retokenize() as retokenizer:
            string_store = doc.vocab.strings
            for span in spans:
                retokenizer.merge(doc[span.start:span.end],
                                  attrs=attrs.intify_attrs(
                                      {'ent_type': span.label}, string_store))
    except AttributeError:
        spans = [(span.start_char, span.end_char, span.label)
                 for span in spans]
        for start_char, end_char, label in spans:
            doc.merge(start_char, end_char, ent_type=label)
Example #11
import pytest

from spacy.attrs import LEMMA, NORM, ORTH, intify_attrs


@pytest.mark.parametrize("text", ["dog"])  # sample value; the original test is parametrized
def test_attrs_key(text):
    assert intify_attrs({"ORTH": text}) == {ORTH: text}
    assert intify_attrs({"NORM": text}) == {NORM: text}
    assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10}