def test_doc_retokenize_spans_entity_merge_iob():
    """Check that entity IOB tags stay consistent after merging tokens."""
    # Scenario 1: two adjacent entities, then a merge of the span doc[0:1].
    doc = Doc(Vocab(), words=["a", "b", "c", "d", "e"])
    label_abc = doc.vocab.strings.add("ent-abc")
    label_d = doc.vocab.strings.add("ent-d")
    doc.ents = [(label_abc, 0, 3), (label_d, 3, 4)]
    for index, expected_iob in enumerate(["B", "I", "I", "B"]):
        assert doc[index].ent_iob_ == expected_iob
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1])
    for index, expected_iob in enumerate(["B", "I"]):
        assert doc[index].ent_iob_ == expected_iob

    # Scenario 2: three merges in one retokenize pass, two of which
    # overlap entity boundaries.
    doc = Doc(Vocab(), words=["a", "b", "c", "d", "e", "f", "g", "h", "i"])
    label_de = doc.vocab.strings.add("ent-de")
    label_fg = doc.vocab.strings.add("ent-fg")
    doc.ents = [(label_de, 3, 5), (label_fg, 5, 7)]
    for index, expected_iob in zip(range(3, 7), ["B", "I", "B", "I"]):
        assert doc[index].ent_iob_ == expected_iob
    with doc.retokenize() as retokenizer:
        for start, end in [(2, 4), (4, 6), (7, 9)]:
            retokenizer.merge(doc[start:end])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
def test_doc_retokenize_split_heads_error(en_vocab):
    """Splitting into two orths must fail unless exactly two heads are given."""
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # 1 head -> not enough heads; 3 heads -> too many heads.
    for head_count in (1, 3):
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                heads = [doc[1] for _ in range(head_count)]
                retokenizer.split(doc[0], ["Los", "Angeles"], heads)
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    product_label = doc.vocab.strings["PRODUCT"]
    product = Span(doc, 6, 8, label=product_label)
    doc.ents = [product]
    # Merge a span that overlaps the start of the entity.
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    entity_texts = [ent.text for ent in doc.ents]
    # Only checks that at least one entity survives the merge.
    assert entity_texts
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    """Merging with the supplied underscore attrs must raise ValueError.

    NOTE(review): `underscore_attrs` is presumably provided by a
    parametrization that is not visible in this chunk.
    """

    def _identity(x):
        return x

    # "a" is getter-backed and "b" is a method extension.
    Token.set_extension("a", getter=_identity, force=True)
    Token.set_extension("b", method=_identity, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    merge_attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=merge_attrs)
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    """Splitting with the supplied underscore attrs must raise ValueError.

    NOTE(review): `underscore_attrs` is presumably provided by a
    parametrization that is not visible in this chunk.
    """

    def _identity(x):
        return x

    # "x" has a plain default; "a" is getter-backed; "b" is a method.
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=_identity, force=True)
    Token.set_extension("b", method=_identity, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    split_attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            new_heads = [(doc[0], 1), doc[1]]
            retokenizer.split(
                doc[0], ["Los", "Angeles"], new_heads, attrs=split_attrs
            )
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text.

    A separate method may allow this in future, but for the default use
    cases merging and splitting should always conform to spaCy's
    non-destructive tokenization policy; otherwise the results can be very
    confusing and unexpected.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # "L" + "A" does not reproduce the text "LosAngeles".
    mismatched_orths = ["L", "A"]
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 0), (doc[0], 0)]
            retokenizer.split(doc[0], mismatched_orths, heads)
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    """Custom extension values passed under "_" are set on merged tokens."""
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)

    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    with doc.retokenize() as retokenizer:
        merge_attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
    merged = doc[0]
    assert merged.lemma_ == "hello world"
    assert merged._.a is True
    assert merged._.b == "1"

    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    bulk = [
        ((0, 2), {"_": {"a": True, "b": "1"}}),
        ((2, 4), {"_": {"a": None, "b": "2"}}),
    ]
    with doc.retokenize() as retokenizer:
        for (start, end), merge_attrs in bulk:
            retokenizer.merge(doc[start:end], attrs=merge_attrs)
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
def test_doc_retokenize_split_dependencies(en_vocab):
    """Per-subtoken "dep" values passed to split() end up on the new tokens."""
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    strings = doc.vocab.strings
    expected_deps = [strings.add("amod"), strings.add("subject")]
    with doc.retokenize() as retokenizer:
        new_heads = [(doc[0], 1), doc[1]]
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            new_heads,
            attrs={"dep": list(expected_deps)},
        )
    for index, expected_dep in enumerate(expected_deps):
        assert doc[index].dep == expected_dep
def test_doc_retokenize_spans_entity_split_iob():
    """Check that entity IOB tags stay consistent after splitting a token."""
    doc = Doc(Vocab(), words=["abc", "d", "e"])
    entity_label = doc.vocab.strings.add("ent-abcd")
    doc.ents = [(entity_label, 0, 2)]
    for index, expected_iob in enumerate(["B", "I"]):
        assert doc[index].ent_iob_ == expected_iob
    # Split the entity's first token into three pieces.
    with doc.retokenize() as retokenizer:
        new_heads = [(doc[0], 1), (doc[0], 2), doc[1]]
        retokenizer.split(doc[0], ["a", "b", "c"], new_heads)
    for index, expected_iob in enumerate(["B", "I", "I", "I"]):
        assert doc[index].ent_iob_ == expected_iob
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes.

    If a user sets IS_STOP, for instance, it should mean that "all tokens
    with that lexeme" are marked as a stop word, so the ambiguity here is
    acceptable. Also see #2390.
    """
    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    assert not any(token.is_stop for token in doc)
    merge_attrs = {"lemma": "hello world", "is_stop": True}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
    merged = doc[0]
    assert merged.lemma_ == "hello world"
    assert merged.is_stop

    # Test bulk merging
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert not any(token.like_num for token in doc)
    assert not any(token.is_stop for token in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"like_num": True})
        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
    number_token, stop_token = doc[0], doc[1]
    assert number_token.like_num
    assert stop_token.is_stop
    # Each attribute must only land on the token it was given for.
    assert not number_token.is_stop
    assert not stop_token.like_num
def test_doc_retokenize_split_extension_attrs(en_vocab):
    """Per-subtoken extension values passed under "_" are applied by split()."""
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        new_heads = [(doc[0], 1), doc[1]]
        # The second subtoken omits "a", so it should keep the default.
        per_token_ext = [{"a": True, "b": "1"}, {"b": "2"}]
        split_attrs = {"lemma": ["los", "angeles"], "_": per_token_ext}
        retokenizer.split(
            doc[0], ["Los", "Angeles"], new_heads, attrs=split_attrs
        )
    first, second = doc[0], doc[1]
    assert first.lemma_ == "los"
    assert first._.a is True
    assert first._.b == "1"
    assert second.lemma_ == "angeles"
    assert second._.a is False
    assert second._.b == "2"
def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes.

    If a user sets IS_STOP, for instance, it should mean that "all tokens
    with that lexeme" are marked as a stop word, so the ambiguity here is
    acceptable. Also see #2390.
    """
    # Neither subtoken is a stop word before the split.
    for word in ("Los", "Angeles"):
        assert not Doc(en_vocab, words=[word])[0].is_stop
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        split_attrs = {"is_stop": [True, False]}
        new_heads = [(doc[0], 1), doc[1]]
        retokenizer.split(
            doc[0], ["Los", "Angeles"], new_heads, attrs=split_attrs
        )
    assert doc[0].is_stop
    assert not doc[1].is_stop
def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
    """Merging two tokens shrinks the sentence root's subtree by one."""
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12]
    deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr", "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound", "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
    words = [t.text for t in tokens]
    doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps)
    first_sent, *_ = doc.sents
    size_before = len(list(first_sent.root.subtree))
    merge_attrs = {"lemma": "none", "ent_type": "none"}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
    size_after = len(list(first_sent.root.subtree))
    assert size_after == size_before - 1
def test_doc_retokenize_spans_merge_heads(en_vocab):
    """Head indices are remapped correctly after merging "pilates class"."""
    words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
    heads = [1, 1, 4, 6, 1, 4, 5, 1]
    deps = ["dep"] * len(heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert len(doc) == 8
    with doc.retokenize() as retokenizer:
        merge_attrs = {
            "tag": doc[4].tag_,
            "lemma": "pilates class",
            "ent_type": "O",
        }
        retokenizer.merge(doc[3:5], attrs=merge_attrs)
    assert len(doc) == 7
    # (token index, acceptable head indices) after the merge.
    expected_heads = [
        (0, [1]),
        (1, [1]),
        (2, [3]),
        (3, [1]),
        (4, [1, 3]),
        (5, [4]),
    ]
    for index, allowed in expected_heads:
        assert doc[index].head.i in allowed
def __call__(self, text: str) -> Doc:
    """Tokenize `text` into a `Doc` and merge URL matches into single tokens.

    text (str): The raw text to tokenize.
    RETURNS (Doc): The tokenized document, with tag, lemma and the
        `self.key_fstring` extension set on every token.
    """
    detailed = self.detailed_tokens(text)
    surfaces = [dtoken.surface for dtoken in detailed]
    trailing_spaces = [dtoken.space for dtoken in detailed]
    doc = Doc(self.vocab, words=surfaces, spaces=trailing_spaces)
    for token, dtoken in zip(doc, detailed):
        token.tag_ = dtoken.pos
        # A lemma of "*" is replaced by the surface text.
        if dtoken.lemma != "*":
            token.lemma_ = dtoken.lemma
        else:
            token.lemma_ = token.text
        token._.set(self.key_fstring, dtoken.fstring)
    with doc.retokenize() as retokenizer:
        for url_match in RE_URL.finditer(doc.text):
            start_char, end_char = url_match.span()
            url_span = doc.char_span(start_char, end_char)
            # A falsy result from char_span is skipped.
            if not url_span:
                continue
            retokenizer.merge(url_span)
    doc.is_tagged = True
    return doc
def test_retokenized_docs(doc):
    """Alignment in an Example reflects retokenization of its first doc."""
    tag_array = doc.to_array(["TAG"])

    def _copy_with_tags():
        # Rebuild the doc from its words and restore the TAG column.
        words = [t.text for t in doc]
        return Doc(doc.vocab, words=words).from_array(["TAG"], tag_array)

    doc1 = _copy_with_tags()
    doc2 = _copy_with_tags()
    example = Example(doc1, doc2)
    # fmt: off
    expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
    expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
    # fmt: on
    assert example.get_aligned("ORTH", as_string=True) == expected1
    with doc1.retokenize() as retokenizer:
        for start, end in [(0, 2), (5, 7)]:
            retokenizer.merge(doc1[start:end])
    # The merged tokens align to None.
    assert example.get_aligned("ORTH", as_string=True) == expected2
def __call__(self, doc: Doc) -> Doc:
    """Flag matched tokens and optionally merge multi-token matches.

    doc (Doc): The `Doc` returned by the previous pipeline component.
    RETURNS (Doc): The same `Doc`, modified in place.
    """
    matches = self.matcher(doc, as_spans=True)
    # Set the `self._is_emoji` extension on every token in a match.
    for match in matches:
        for token in match:
            token._.set(self._is_emoji, True)
    if not self.merge_spans:
        return doc
    # Only spans longer than one token need merging.
    mergeable = [span for span in filter_spans(matches) if len(span) > 1]
    with doc.retokenize() as retokenizer:
        for span in mergeable:
            retokenizer.merge(span)
    return doc
def test_doc_retokenizer_split_norm(en_vocab):
    """#6060: reset norm in split"""
    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
    doc = Doc(en_vocab, words=text.split())

    # Set custom norm on the w/ token.
    doc[5].norm_ = "with"

    # Retokenize to split out the words in the token at doc[2].
    glued = doc[2]
    pieces = ["brown", "fox", "jumps", "over", "the"]
    piece_heads = [(glued, idx) for idx in range(5)]
    with doc.retokenize() as retokenizer:
        retokenizer.split(glued, pieces, heads=piece_heads)

    # "w/" moved from index 5 to 9 and must keep its custom norm.
    assert doc[9].text == "w/"
    assert doc[9].norm_ == "with"
    # The token now at index 5 must not inherit that norm.
    assert doc[5].text == "over"
    assert doc[5].norm_ == "over"
def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    """Merging "Los Angeles" yields one token with the given attrs and head."""
    text = "Los Angeles start."
    heads = [1, 2, 2, 2]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
    words = [t.text for t in tokens]
    doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps)
    assert len(doc) == 4
    assert doc[0].head.text == "Angeles"
    assert doc[1].head.text == "start"
    merge_attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
    assert len(doc) == 3
    merged = doc[0]
    assert merged.text == "Los Angeles"
    assert merged.head.text == "start"
    assert merged.ent_type_ == "GPE"
def merge_spans(spans: Iterable[Span], doc: Doc) -> None:
    """
    Merge spans into single tokens in ``doc``, *in-place*.

    Each merged token gets its ``ent_type`` set from the span's label.

    Args:
        spans (Iterable[:class:`spacy.tokens.Span`])
        doc (:class:`spacy.tokens.Doc`)

    Note:
        The retokenizer API is used when ``doc`` provides ``retokenize``;
        otherwise the legacy ``doc.merge`` API is used. The choice is made
        once, up front, so an ``AttributeError`` raised while merging is
        propagated rather than mistaken for a missing retokenizer.
    """
    # Materialize up front: ``spans`` may be a one-shot iterable, and it
    # must be traversed in full whichever API ends up being used.
    spans = list(spans)
    try:
        # retokenizer was added to spacy in v2.0.11
        retokenize = doc.retokenize
    except AttributeError:
        # Legacy API: collect the character offsets of every span before
        # performing any merge, then merge one by one.
        offsets = [(span.start_char, span.end_char, span.label) for span in spans]
        for start_char, end_char, label in offsets:
            doc.merge(start_char, end_char, ent_type=label)
        return
    string_store = doc.vocab.strings
    with retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(
                doc[span.start : span.end],
                attrs=attrs.intify_attrs({"ent_type": span.label}, string_store),
            )
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
    """Sentence lengths shrink by one after a merge inside each sentence."""
    # fmt: off
    text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
    heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9]
    deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'dobj', 'punct']
    # fmt: on
    tokens = en_tokenizer(text)
    words = [t.text for t in tokens]
    doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps)
    sent1, sent2 = doc.sents
    lengths_before = (len(sent1), len(sent2))
    merge_attrs = {"lemma": "none", "ent_type": "none"}
    with doc.retokenize() as retokenizer:
        # One merge in the first sentence, one at the end of the second.
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
        retokenizer.merge(doc[-2:], attrs=merge_attrs)
    sent1, sent2 = doc.sents
    assert len(sent1) == lengths_before[0] - 1
    assert len(sent2) == lengths_before[1] - 1
def test_issue3540(en_vocab):
    """Token vectors of untouched tokens are unchanged by splitting "NewYork"."""
    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor

    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_before = [token.vector for token in doc]
    assert len(vectors_before) == len(doc)

    with doc.retokenize() as retokenizer:
        new_heads = [(doc[3], 1), doc[2]]
        split_attrs = {
            "POS": ["PROPN", "PROPN"],
            "DEP": ["pobj", "compound"],
        }
        retokenizer.split(
            doc[3], ["New", "York"], heads=new_heads, attrs=split_attrs
        )

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_after = [token.vector for token in doc]
    assert len(vectors_after) == len(doc)

    # (index before split, index after split) for the untouched tokens;
    # tokens after the split point shift right by one.
    for old_index, new_index in [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]:
        before = vectors_before[old_index].tolist()
        after = vectors_after[new_index].tolist()
        assert before == after
def test_doc_api_runtime_error(en_tokenizer):
    """Merging trimmed noun chunks must not raise a run-time error.

    The example caused a run-time error while parsing Reddit.
    """
    # fmt: off
    text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
            "amod", "pobj", "acl", "prep", "prep", "pobj",
            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
    words = [t.text for t in tokens]
    doc = Doc(tokens.vocab, words=words, deps=deps)
    keep_deps = ("advmod", "amod", "compound")
    chunks_to_merge = []
    for chunk in doc.noun_chunks:
        # Drop leading tokens until the chunk starts with a modifier.
        while len(chunk) > 1 and chunk[0].dep_ not in keep_deps:
            chunk = chunk[1:]
        if len(chunk) > 1:
            chunks_to_merge.append(chunk)
    with doc.retokenize() as retokenizer:
        for chunk in chunks_to_merge:
            root = chunk.root
            merge_attrs = {
                "tag": root.tag_,
                "lemma": chunk.text,
                "ent_type": root.ent_type_,
            }
            retokenizer.merge(chunk, attrs=merge_attrs)
def __call__(self, doc: Doc):
    """Split tokens of `doc` into sub-tokens according to `self._split_mode`.

    Reads the per-token sub-token candidates from
    `doc.user_data["sub_tokens"]` and, for modes "A" and "B", splits each
    token that has candidates via `doc.retokenize()`.

    doc (Doc): The document to process; modified in place.
    RETURNS (Doc): The same `doc` object.
    RAISES: Exception if `self._split_mode` is not None, "A", "B" or "C";
        any exception raised by the retokenizer is re-raised after debug
        output has been written to stderr.
    """
    # Nothing to do when no sub-token information was attached.
    if "sub_tokens" not in doc.user_data:
        return doc
    if self._split_mode is None:
        # No split mode: leave the doc and its user_data untouched.
        return doc
    elif self._split_mode == "C":
        # Mode "C": no splitting, only discard the sub-token data.
        del doc.user_data["sub_tokens"]
        return doc
    elif self._split_mode == "B":
        sub_tokens_index = 1
    elif self._split_mode == "A":
        sub_tokens_index = 0
    else:
        raise Exception("invalid split_mode: " + self._split_mode)

    # Pick the candidates for the selected mode; None where a token has none.
    # NOTE(review): assumes each non-empty entry is indexable by 0 ("A") and
    # 1 ("B") -- confirm against the code that fills user_data.
    sub_tokens_list = [
        sub_tokens[sub_tokens_index] if sub_tokens else None for sub_tokens in doc.user_data["sub_tokens"]
    ]

    # Walk the tokens from last to first.
    # NOTE(review): presumably so that a split does not shift the indices of
    # tokens still to be processed -- confirm.
    for token_i, sub_tokens in reversed(
            tuple(zip(range(len(doc)), sub_tokens_list))):
        token = doc[token_i]
        token_ent_type = token.ent_type
        # edit token.dep_
        # A token that is its own head is treated as the root.
        if token.head.i == token.i:
            dep = "ROOT"
        else:
            dep = token.dep_
        compounds = dep in {"compound", "nummod", "punct"}
        # retokenize
        if sub_tokens_index is not None and sub_tokens:
            # All sub-tokens but the last get a dep derived from their tag;
            # the last one keeps the original token's dep.
            deps = [tag_dep_map(dtoken.tag) for dtoken in sub_tokens[:-1]] + [token.dep_]
            last = len(sub_tokens) - 1
            if token.head.i == token.i:
                # Root token: every sub-token points at the last sub-token.
                heads = [(token, last) for _ in range(last + 1)]
            elif compounds:
                # Compound-like token: every sub-token keeps the original head.
                heads = [token.head for _ in range(len(sub_tokens))]
            else:
                # Otherwise: leading sub-tokens attach to the last one, which
                # attaches to the original head.
                heads = [(token, last) for _ in range(last)] + [token.head]
            surfaces = [dtoken.surface for dtoken in sub_tokens]

            def morph(dtoken):
                # Build a "Key=Value|..." string from the inflection and
                # reading; "=" and "|" inside the reading become "_".
                m = {}
                if dtoken.inf:
                    m["Inflection"] = dtoken.inf
                if dtoken.reading:
                    m["Reading"] = re.sub("[=|]", "_", dtoken.reading)
                return "|".join(f"{k}={v}" for k, v in m.items())

            attrs = {
                "TAG": [dtoken.tag for dtoken in sub_tokens],
                "DEP": deps,
                # The tag of the following token (or None at the end of the
                # doc) is passed to tag_to_pos as context.
                "POS": tag_to_pos(
                    sub_tokens,
                    doc[token.i + 1].tag_ if token.i < len(doc) - 1 else None),
                "LEMMA": [dtoken.lemma for dtoken in sub_tokens],
                "NORM": [dtoken.norm for dtoken in sub_tokens],
                # Every sub-token inherits the original token's entity type.
                "ENT_TYPE": [token_ent_type for dtoken in sub_tokens],
                "MORPH": [morph(dtoken) for dtoken in sub_tokens],
            }
            try:
                with doc.retokenize() as retokenizer:
                    retokenizer.split(token, surfaces, heads=heads, attrs=attrs)
            except Exception as e:
                # Dump debugging context to stderr, then re-raise.
                import sys
                print("Retokenization error:", file=sys.stderr)
                print(doc.text, file=sys.stderr)
                print([(t.i, t.orth_) for t in doc],
                      file=sys.stderr)
                print(list(enumerate(doc.user_data["sub_tokens"])), file=sys.stderr)
                raise e

            # work-around: retokenize() does not consider the head of the splitted tokens
            # Tokens outside the split range whose head is the first
            # sub-token are re-attached to the last sub-token.
            if not compounds:
                for t in doc:
                    if t.i < token_i or token_i + len(sub_tokens) <= t.i:
                        if t.head.i == token_i:
                            t.head = doc[token_i + last]

    # The sub-token data has been consumed; remove it from the doc.
    del doc.user_data["sub_tokens"]
    return doc
c_start = c_match[1] c_end = c_match[2] if ((c_start <= end and start <= c_end) or (start <= c_end and c_start <= end)): ent_size = end - start c_ent_size = c_end - c_start if ent_size < c_ent_size: skip_matches.add(i) elif c_ent_size < ent_size or start < c_start: skip_matches.add(j) else: skip_matches.add(i) with doc.retokenize() as retokenizer: for i, match in enumerate(matches): match_id = match[0] if i in skip_matches: continue start = match[1] end = match[2] merged += 1 retokenizer.merge(doc[start:end]) abstract_id_list.append(abstract_id) token_string = "\t\t".join(["\t".join([token.text.lower() for token in sentence if token_conditions(token)]) for sentence in doc.sents]) word_list.append(token_string)
def merge_spans(doc: Doc, spans: Iterable[Span]):
    """Merge each span into a single token in `doc`, in place.

    The spans are first passed through `filter_spans`; every span it
    returns is merged within a single retokenize pass.
    """
    filtered = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for selected in filtered:
            retokenizer.merge(selected)
def test_retokenize_disallow_zero_length(en_vocab):
    """Merging an empty (zero-length) span must raise ValueError."""
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            empty_span = doc[1:1]
            retokenizer.merge(empty_span)
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    """Check that entity IOB tags and types stay consistent after merging."""
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    label_abc = doc.vocab.strings.add("ent-abc")
    label_d = doc.vocab.strings.add("ent-d")
    doc.ents = [(label_abc, 0, 3), (label_d, 3, 4)]
    for index, expected_iob in enumerate(["B", "I", "I", "B"]):
        assert doc[index].ent_iob_ == expected_iob
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == len(words) - 1
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # Test that IOB stays consistent with provided IOB
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    with doc.retokenize() as retokenizer:
        merge_attrs = {"ent_type": "ent-abc", "ent_iob": 1}
        for start, end in [(0, 3), (3, 5)]:
            retokenizer.merge(doc[start:end], attrs=merge_attrs)
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # if no parse/heads, the first word in the span is the root and provides
    # default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    label_de = doc.vocab.strings.add("ent-de")
    label_fg = doc.vocab.strings.add("ent-fg")
    doc.ents = [(label_de, 3, 5), (label_fg, 5, 7)]
    for index, expected_iob in zip(range(3, 7), ["B", "I", "B", "I"]):
        assert doc[index].ent_iob_ == expected_iob
    with doc.retokenize() as retokenizer:
        for start, end in [(2, 4), (4, 6), (7, 9)]:
            retokenizer.merge(doc[start:end])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
    ents = ["O", "O", "O", "B-ent-de", "I-ent-de", "B-ent-fg", "I-ent-fg", "O", "O"]
    deps = ["dep"] * len(words)
    for string in ("ent-de", "ent-fg", "dep"):
        en_vocab.strings.add(string)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:
        for start, end in [(2, 4), (4, 6), (7, 9)]:
            retokenizer.merge(doc[start:end])
    assert len(doc) == 6
    assert doc[2].ent_iob_ == "B"
    assert doc[2].ent_type_ == "ent-de"
    assert doc[3].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
    ents = ["O", "O", "O", "B-ent-de", "I-ent-de", "B-ent-de", "I-ent-de", "O", "O"]
    deps = ["dep"] * len(words)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        for start, end in [(3, 5), (5, 7)]:
            retokenizer.merge(doc[start:end])
    assert len(doc) == 7
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-de"