Example #1
def test_has_annotation(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)

    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
    doc[0].set_morph("Feat=Val")
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")

    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
    doc[1].set_morph("")
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]

    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)
Example #2
def test_doc_from_array_sent_starts(en_vocab):
    # fmt: off
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
        new_doc.from_array(attrs, arr)
        assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.has_annotation("DEP")
    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.has_annotation("DEP")
Example #3
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")
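A related minimal sketch (not part of the example above; assumes spaCy v3, with the sample words chosen for illustration): entities set via Doc.set_ents with default="missing" leave the remaining tokens unannotated, so ENT_IOB is present but not complete, while default="outside" fills in O tags and makes it complete.

from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

words = ["I", "live", "in", "New", "York"]
doc = Doc(Vocab(), words=words)
# tokens outside the given spans stay unannotated ("missing")
doc.set_ents([Span(doc, 3, 5, label="GPE")], default="missing")
assert doc.has_annotation("ENT_IOB")
assert not doc.has_annotation("ENT_IOB", require_complete=True)

# tokens outside the given spans are tagged as O ("outside")
doc.set_ents([Span(doc, 3, 5, label="GPE")], default="outside")
assert doc.has_annotation("ENT_IOB", require_complete=True)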
Example #4
def _add_valid_doc(self, doc: Doc) -> None:
    self.docs.append(doc)
    self._doc_ids.append(id(doc))
    self.n_docs += 1
    self.n_tokens += len(doc)
    if doc.has_annotation("SENT_START"):
        self.n_sents += itertoolz.count(doc.sents)
Example #5
def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    assert [t.is_sent_start for t in doc] == sent_starts
    assert [t.is_sent_end for t in doc] == sent_ends
    assert len(list(doc.sents)) == n_sents
Example #6
def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    sent_starts = [t.is_sent_start for t in doc]
    sent_ends = [t.is_sent_end for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert sent_ends == [False, True, False, False, False, False, True]
    assert len(list(doc.sents)) == 2
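A small usage sketch following the tests above (assumes only that spaCy is installed; the custom punctuation set is illustrative): Sentencizer also accepts explicit punct_chars, and has_annotation("SENT_START") reports whether boundaries were set.

from spacy.pipeline import Sentencizer
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["Hello", "!", "This", "is", "a", "test", "."])
assert not doc.has_annotation("SENT_START")
# treat only "!" as sentence-final punctuation
sentencizer = Sentencizer(punct_chars=["!"])
doc = sentencizer(doc)
assert doc.has_annotation("SENT_START")
assert len(list(doc.sents)) == 2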
Example #7
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
    doc = Doc(Vocab(), words=words)
    doc[6].is_sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.has_annotation("DEP")
    assert not new_doc.has_annotation("TAG")
    doc = Doc(
        Vocab(),
        words=words,
        tags=["TAG"] * len(words),
        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
        deps=["dep"] * len(words),
    )
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert new_doc.has_annotation("DEP")
    assert new_doc.has_annotation("TAG")
Example #8
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.has_annotation("SENT_START")
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.has_annotation("SENT_START")
    assert len(list(new_doc.sents)) == 1
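The same round trip can be sketched with DocBin, which is commonly used when serializing many docs at once (a minimal sketch assuming spaCy v3 is installed; the sample text is illustrative):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Hello world. Another sentence.")

# store sentence boundaries explicitly alongside the token texts
doc_bin = DocBin(attrs=["SENT_START"])
doc_bin.add(doc)
data = doc_bin.to_bytes()

new_doc = list(DocBin().from_bytes(data).get_docs(nlp.vocab))[0]
assert new_doc.has_annotation("SENT_START")
assert len(list(new_doc.sents)) == 2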
Example #9
    def __call__(self, doc: Doc) -> Doc:
        """
        Slightly modified from spacy.pipeline.function.merge_entities to accommodate
        stopword trimming.
        """
        with doc.retokenize() as retokenizer:
            # Merge discovered entities / noun chunks.
            # Ones found via `PipedPhraseMatcher` have label "CUSTOM"
            ents = [
                ent for ent in doc.ents if self.filter_entities is None
                or ent.label_ in self.filter_entities
            ]
            custom = set(tok.i for ent in ents for tok in ent
                         if ent.label_ == "CUSTOM")

            noun_chunks = []
            if doc.has_annotation("DEP"):
                # ensure precedence of CUSTOM phrases
                noun_chunks = [
                    noun for noun in doc.noun_chunks
                    if not any(tok.i in custom for tok in noun)
                ]

            # eliminate overlapping spans, keeping the longest
            # NB: given earlier filtering, CUSTOM phrases should never be
            # subsumed or broken up
            phrases = filter_spans([
                p for p in ents + noun_chunks
                if p.label_ == "CUSTOM" or len(p) <= self.max_phrase_len
            ])

            for phrase in phrases:
                attrs = {
                    "tag": phrase.root.tag,
                    "dep": phrase.root.dep,
                    "ent_type": phrase.label,
                }
                # need to trim leading/trailing stopwords
                if phrase.label_ != "CUSTOM" and self.stopwords is not None:
                    while phrase and phrase[0].lower_ in self.stopwords:
                        phrase = phrase[1:]
                    while phrase and phrase[-1].lower_ in self.stopwords:
                        phrase = phrase[:-1]

                if not phrase:
                    continue

                retokenizer.merge(phrase, attrs=attrs)

        return doc
Example #10
def get_noun_phrases(doc: Doc) -> List[Span]:
    """Compile a list of noun phrases in sense2vec's format (without
    determiners). Separated out to make it easier to customize, e.g. for
    languages that don't implement a noun_chunks iterator out-of-the-box, or
    use different label schemes.

    doc (Doc): The Doc to get noun phrases from.
    RETURNS (list): The noun phrases as a list of Span objects.
    """
    trim_labels = ("advmod", "amod", "compound")
    spans = []
    if doc.has_annotation("DEP"):
        for np in doc.noun_chunks:
            while len(np) > 1 and np[0].dep_ not in trim_labels:
                np = np[1:]
            spans.append(np)
    return spans
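A hypothetical usage sketch (the "en_core_web_sm" pipeline name is an assumption; any pipeline with a dependency parser works, and without one the function simply returns an empty list):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for np in get_noun_phrases(doc):
    print(np.text, [t.dep_ for t in np])

# without a parser there is no DEP annotation, so nothing is returned
blank_doc = spacy.blank("en")("The quick brown fox")
assert not blank_doc.has_annotation("DEP")
assert get_noun_phrases(blank_doc) == []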
Example #11
def to_tokenized_text(doc: Doc) -> List[List[str]]:
    """
    Transform ``doc`` into an ordered, nested list of token-texts for each sentence.

    Args:
        doc

    Returns:
        A list of tokens' texts for each sentence in ``doc``.

    Note:
        If ``doc`` hasn't been segmented into sentences, the entire document
        is treated as a single sentence.
    """
    if doc.has_annotation("SENT_START"):
        return [[token.text for token in sent] for sent in doc.sents]
    else:
        return [[token.text for token in doc]]
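A short usage sketch (assumes spaCy is installed; the blank English pipeline plus the built-in sentencizer and the sample text are illustrative):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Hello world. Another sentence.")
print(to_tokenized_text(doc))
# [['Hello', 'world', '.'], ['Another', 'sentence', '.']]

# without sentence boundaries the whole doc becomes a single nested list
unsegmented = spacy.blank("en")("Hello world. Another sentence.")
print(to_tokenized_text(unsegmented))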
Example #12
def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_,
            doc[2].ent_type_) == expected
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_,
            doc[2].ent_type_) == expected
    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_,
            doc2[2].ent_type_) == expected
Example #13
    def apply_transforms(self, doc: Doc, lang: types.LangLike,
                         **kwargs) -> Doc:
        """
        Sequentially apply some subset of data augmentation transforms to ``doc``,
        then return a new ``Doc`` created from the augmented text using ``lang``.

        Args:
            doc
            lang
            **kwargs: If, for whatever reason, you have to pass keyword argument values
                into transforms that vary or depend on characteristics of ``doc``,
                specify them here. The transforms' call signatures will be inspected,
                and values will be passed along, as needed.

        Returns:
            :class:`spacy.tokens.Doc`
        """
        if doc.has_annotation("SENT_START"):
            nested_aug_toks = [
                aug_utils.to_aug_toks(sent) for sent in doc.sents
            ]
        else:
            nested_aug_toks = [aug_utils.to_aug_toks(doc)]
        tfs = self._get_random_transforms()
        new_nested_aug_toks = []
        for aug_toks in nested_aug_toks:
            # this is a bit of a hack, but whatchagonnado
            if kwargs:
                for tf in tfs:
                    tf_kwargs = utils.get_kwargs_for_func(tf, kwargs)
                    aug_toks = tf(aug_toks, **tf_kwargs)
            else:
                for tf in tfs:
                    aug_toks = tf(aug_toks)
            new_nested_aug_toks.append(aug_toks)
        return self._make_new_spacy_doc(new_nested_aug_toks, lang)
Example #14
def test_has_annotation_sents(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation

    doc[1].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    doc[2].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)
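A complementary sketch (assumes spaCy is installed; the blank pipeline plus sentencizer is illustrative): a component that assigns an explicit boundary value to every token makes the sentence-start annotation complete.

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Hello beautiful world.")
# the sentencizer sets is_sent_start on every token, so the check passes
# even with require_complete=True
assert doc.has_annotation("SENT_START", require_complete=True)
assert doc.has_annotation("IS_SENT_START", require_complete=True)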
Example #15
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.has_annotation("DEP")
Example #16
def test_tokenlast_has_sent_end_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_end is None
    assert doc[1].is_sent_end is True
    assert not doc.has_annotation("SENT_START")