Example #1
def test_doc_retokenize_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
Example #2
def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])

    # Too many heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
Example #3
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]
Example #4
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
Example #5
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Example #6
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
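For contrast, here is a minimal sketch (not taken from the test suite) of a split that satisfies the non-destructive policy described in the docstring above: the orths concatenate back to the original token text, so no error is raised.

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["LosAngeles", "start", "."])
with doc.retokenize() as retokenizer:
    # "Los" + "Angeles" == "LosAngeles", so the orths match the token text
    retokenizer.split(doc[0], ["Los", "Angeles"], [(doc[0], 1), doc[1]])
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]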
Example #7
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
Example #8
def test_doc_retokenize_split_dependencies(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    dep1 = doc.vocab.strings.add("amod")
    dep2 = doc.vocab.strings.add("subject")
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": [dep1, dep2]},
        )
    assert doc[0].dep == dep1
    assert doc[1].dep == dep2
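The dependency labels do not have to be interned by hand; other examples in this collection pass these attrs as plain strings, which the retokenizer interns automatically. A hedged equivalent of the split above, reusing the en_vocab fixture:

doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
with doc.retokenize() as retokenizer:
    retokenizer.split(
        doc[0],
        ["Los", "Angeles"],
        [(doc[0], 1), doc[1]],
        attrs={"dep": ["amod", "subject"]},  # plain strings instead of hashes
    )
assert doc[0].dep_ == "amod"
assert doc[1].dep_ == "subject"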
Example #9
def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after splitting
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"
Example #10
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
    assert doc[0].lemma_ == "hello world"
    assert doc[0].is_stop
    # Test bulk merging
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert not any(t.like_num for t in doc)
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"like_num": True})
        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
    assert doc[0].like_num
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num
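Because IS_STOP and LIKE_NUM are lexical flags, the docstring above implies the value is stored on the lexeme in the shared vocab, not just on the merged token. A small sketch under that assumption, checking the vocab directly:

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2], attrs={"is_stop": True})
# the flag lives on the lexeme "hello world", so it is visible vocab-wide
assert vocab["hello world"].is_stop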
Example #11
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
Example #12
def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    assert not Doc(en_vocab, words=["Los"])[0].is_stop
    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        attrs = {"is_stop": [True, False]}
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].is_stop
    assert not doc[1].is_stop
Example #13
def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12]
    deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
            "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
            "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1 = list(doc.sents)[0]
    init_len = len(list(sent1.root.subtree))
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "none", "ent_type": "none"}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert len(list(sent1.root.subtree)) == init_len - 1
Example #14
def test_doc_retokenize_spans_merge_heads(en_vocab):
    words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
    heads = [1, 1, 4, 6, 1, 4, 5, 1]
    deps = ["dep"] * len(heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert len(doc) == 8
    with doc.retokenize() as retokenizer:
        attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
        retokenizer.merge(doc[3:5], attrs=attrs)
    assert len(doc) == 7
    assert doc[0].head.i == 1
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 1
    assert doc[4].head.i in [1, 3]
    assert doc[5].head.i == 4
Example #15
    def __call__(self, text: str) -> Doc:
        dtokens = self.detailed_tokens(text)
        words = [x.surface for x in dtokens]
        spaces = [x.space for x in dtokens]
        doc = Doc(self.vocab, words=words, spaces=spaces)
        for token, dtoken in zip(doc, dtokens):
            token.tag_ = dtoken.pos
            token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
            token._.set(self.key_fstring, dtoken.fstring)

        with doc.retokenize() as retokenizer:
            for match in RE_URL.finditer(doc.text):
                span = doc.char_span(*match.span())
                if span:
                    retokenizer.merge(span)
        doc.is_tagged = True
        return doc
Example #16
def test_retokenized_docs(doc):
    a = doc.to_array(["TAG"])
    doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
    doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
    example = Example(doc1, doc2)
    # fmt: off
    expected1 = [
        "Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via",
        "London", "."
    ]
    expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
    # fmt: on
    assert example.get_aligned("ORTH", as_string=True) == expected1
    with doc1.retokenize() as retokenizer:
        retokenizer.merge(doc1[0:2])
        retokenizer.merge(doc1[5:7])
    assert example.get_aligned("ORTH", as_string=True) == expected2
Example #17
    def __call__(self, doc: Doc) -> Doc:
        """Apply the pipeline component to a `Doc` object.

        doc (Doc): The `Doc` returned by the previous pipeline component.
        RETURNS (Doc): The modified `Doc` object.
        """
        spans = self.matcher(doc, as_spans=True)
        for span in spans:
            for token in span:
                token._.set(self._is_emoji, True)

        if self.merge_spans:
            spans = filter_spans(spans)
            with doc.retokenize() as retokenizer:
                for span in spans:
                    if len(span) > 1:
                        retokenizer.merge(span)
        return doc
Example #18
def test_doc_retokenizer_split_norm(en_vocab):
    """#6060: reset norm in split"""
    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
    doc = Doc(en_vocab, words=text.split())

    # Set custom norm on the w/ token.
    doc[5].norm_ = "with"

    # Retokenize to split out the words in the token at doc[2].
    token = doc[2]
    with doc.retokenize() as retokenizer:
        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"],
                          heads=[(token, idx) for idx in range(5)])

    assert doc[9].text == "w/"
    assert doc[9].norm_ == "with"
    assert doc[5].text == "over"
    assert doc[5].norm_ == "over"
Example #19
def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    text = "Los Angeles start."
    heads = [1, 2, 2, 2]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab,
              words=[t.text for t in tokens],
              heads=heads,
              deps=deps)
    assert len(doc) == 4
    assert doc[0].head.text == "Angeles"
    assert doc[1].head.text == "start"
    with doc.retokenize() as retokenizer:
        attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert len(doc) == 3
    assert doc[0].text == "Los Angeles"
    assert doc[0].head.text == "start"
    assert doc[0].ent_type_ == "GPE"
Example #20
def merge_spans(spans: Iterable[Span], doc: Doc) -> None:
    """
    Merge spans into single tokens in ``doc``, *in-place*.

    Args:
        spans (Iterable[:class:`spacy.tokens.Span`])
        doc (:class:`spacy.tokens.Doc`)
    """
    try:  # retokenizer was added to spacy in v2.0.11
        with doc.retokenize() as retokenizer:
            string_store = doc.vocab.strings
            for span in spans:
                retokenizer.merge(
                    doc[span.start : span.end],
                    attrs=attrs.intify_attrs({"ent_type": span.label}, string_store),
                )
    except AttributeError:
        spans = [(span.start_char, span.end_char, span.label) for span in spans]
        for start_char, end_char, label in spans:
            doc.merge(start_char, end_char, ent_type=label)
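A possible usage sketch for this helper (assuming doc already carries entity annotations from an NER component): merge each entity span into a single token, in place.

spans = list(doc.ents)        # non-overlapping spans, e.g. named entities
merge_spans(spans, doc)
assert all(len(ent) == 1 for ent in doc.ents)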
Example #21
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
    heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9]
    deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'dobj', 'punct']
    # fmt: on
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
    init_len = len(sent1)
    init_len2 = len(sent2)
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "none", "ent_type": "none"}
        retokenizer.merge(doc[0:2], attrs=attrs)
        retokenizer.merge(doc[-2:], attrs=attrs)
    sent1, sent2 = list(doc.sents)
    assert len(sent1) == init_len - 1
    assert len(sent2) == init_len2 - 1
Example #22
def test_issue3540(en_vocab):

    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1],
         [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor

    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)

    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()

    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()
Example #23
def test_doc_api_runtime_error(en_tokenizer):
    # Example that caused run-time error while parsing Reddit
    # fmt: off
    text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
            "amod", "pobj", "acl", "prep", "prep", "pobj",
            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
    nps = []
    for np in doc.noun_chunks:
        while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
            np = np[1:]
        if len(np) > 1:
            nps.append(np)
    with doc.retokenize() as retokenizer:
        for np in nps:
            attrs = {
                "tag": np.root.tag_,
                "lemma": np.text,
                "ent_type": np.root.ent_type_,
            }
            retokenizer.merge(np, attrs=attrs)
Example #24
    def __call__(self, doc: Doc):
        if "sub_tokens" not in doc.user_data:
            return doc
        if self._split_mode is None:
            return doc
        elif self._split_mode == "C":
            del doc.user_data["sub_tokens"]
            return doc
        elif self._split_mode == "B":
            sub_tokens_index = 1
        elif self._split_mode == "A":
            sub_tokens_index = 0
        else:
            raise Exception("invalid split_mode: " + self._split_mode)

        sub_tokens_list = [
            sub_tokens[sub_tokens_index] if sub_tokens else None
            for sub_tokens in doc.user_data["sub_tokens"]
        ]

        for token_i, sub_tokens in reversed(
                tuple(zip(range(len(doc)), sub_tokens_list))):
            token = doc[token_i]
            token_ent_type = token.ent_type

            # edit token.dep_
            if token.head.i == token.i:
                dep = "ROOT"
            else:
                dep = token.dep_

            compounds = dep in {"compound", "nummod", "punct"}

            # retokenize
            if sub_tokens_index is not None and sub_tokens:
                deps = [tag_dep_map(dtoken.tag)
                        for dtoken in sub_tokens[:-1]] + [token.dep_]
                last = len(sub_tokens) - 1
                if token.head.i == token.i:
                    heads = [(token, last) for _ in range(last + 1)]
                elif compounds:
                    heads = [token.head for _ in range(len(sub_tokens))]
                else:
                    heads = [(token, last) for _ in range(last)] + [token.head]
                surfaces = [dtoken.surface for dtoken in sub_tokens]

                def morph(dtoken):
                    m = {}
                    if dtoken.inf:
                        m["Inflection"] = dtoken.inf
                    if dtoken.reading:
                        m["Reading"] = re.sub("[=|]", "_", dtoken.reading)
                    return "|".join(f"{k}={v}" for k, v in m.items())

                attrs = {
                    "TAG": [dtoken.tag for dtoken in sub_tokens],
                    "DEP": deps,
                    "POS": tag_to_pos(
                        sub_tokens,
                        doc[token.i + 1].tag_ if token.i < len(doc) - 1 else None,
                    ),
                    "LEMMA": [dtoken.lemma for dtoken in sub_tokens],
                    "NORM": [dtoken.norm for dtoken in sub_tokens],
                    "ENT_TYPE": [token_ent_type for dtoken in sub_tokens],
                    "MORPH": [morph(dtoken) for dtoken in sub_tokens],
                }
                try:
                    with doc.retokenize() as retokenizer:
                        retokenizer.split(token,
                                          surfaces,
                                          heads=heads,
                                          attrs=attrs)
                except Exception as e:
                    import sys
                    print("Retokenization error:", file=sys.stderr)
                    print(doc.text, file=sys.stderr)
                    print([(t.i, t.orth_) for t in doc], file=sys.stderr)
                    print(list(enumerate(doc.user_data["sub_tokens"])),
                          file=sys.stderr)
                    raise e

                # work-around: retokenize() does not update the heads of tokens outside the split
                if not compounds:
                    for t in doc:
                        if t.i < token_i or token_i + len(sub_tokens) <= t.i:
                            if t.head.i == token_i:
                                t.head = doc[token_i + last]

        del doc.user_data["sub_tokens"]
        return doc
Example #25
            c_start = c_match[1]
            c_end = c_match[2]

            if c_start <= end and start <= c_end:  # the two spans overlap
                ent_size = end - start
                c_ent_size = c_end - c_start

                if ent_size < c_ent_size:
                    skip_matches.add(i)
                elif c_ent_size < ent_size or start < c_start:
                    skip_matches.add(j)
                else:
                    skip_matches.add(i)

    with doc.retokenize() as retokenizer:
        for i, match in enumerate(matches):
            match_id = match[0]
            if i in skip_matches:
                continue
            start = match[1]
            end = match[2]
            merged += 1
            retokenizer.merge(doc[start:end])

    abstract_id_list.append(abstract_id)

    token_string = "\t\t".join(["\t".join([token.text.lower() for token in sentence if token_conditions(token)])
                                for sentence in doc.sents])

    word_list.append(token_string)
Example #26
def merge_spans(doc: Doc, spans: Iterable[Span]):
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
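filter_spans keeps only the longest span from each overlapping group, which is what lets this helper merge arbitrary candidate spans without the retokenizer raising an overlap error. A small sketch with hypothetical words, reusing the helper above:

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["New", "York", "City", "is", "big"])
candidates = [doc[0:2], doc[0:3]]   # "New York" overlaps "New York City"
merge_spans(doc, candidates)        # only the longer span is merged
assert doc[0].text == "New York City"
assert len(doc) == 3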
Example #27
def test_retokenize_disallow_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[1:1])
Example #28
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == len(words) - 1
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # Test that IOB stays consistent with provided IOB
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    with doc.retokenize() as retokenizer:
        attrs = {"ent_type": "ent-abc", "ent_iob": 1}
        retokenizer.merge(doc[0:3], attrs=attrs)
        retokenizer.merge(doc[3:5], attrs=attrs)
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # if no parse/heads, the first word in the span is the root and provides
    # default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
    ents = ["O"] * len(words)
    ents[3] = "B-ent-de"
    ents[4] = "I-ent-de"
    ents[5] = "B-ent-fg"
    ents[6] = "I-ent-fg"
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[2].ent_iob_ == "B"
    assert doc[2].ent_type_ == "ent-de"
    assert doc[3].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
    ents = ["O"] * len(words)
    ents[3] = "B-ent-de"
    ents[4] = "I-ent-de"
    ents[5] = "B-ent-de"
    ents[6] = "I-ent-de"
    deps = ["dep"] * len(words)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])
        retokenizer.merge(doc[5:7])
    assert len(doc) == 7
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-de"