def test_iob_to_biluo():
    """Check IOB -> BILUO conversion on valid input and rejection of bad input."""
    valid_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
    expected_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
    # Contains a token ('"') that is not a valid IOB tag.
    malformed_iob = ["O", "O", '"', "B-LOC", "I-LOC"]

    assert expected_biluo == iob_to_biluo(valid_iob)

    with pytest.raises(ValueError):
        iob_to_biluo(malformed_iob)
def test_issue2385():
    """Test that IOB tags are correctly converted to BILUO tags."""
    cases = (
        # Regression: labels that contain a 'b' character.
        (
            ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER"),
            ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"],
        ),
        # IOB1 input must still be supported.
        (
            ("I-ORG", "I-ORG", "B-ORG"),
            ["B-ORG", "L-ORG", "U-ORG"],
        ),
        # IOB2 input must still be supported.
        (
            ("B-PERSON", "I-PERSON", "B-PERSON"),
            ["B-PERSON", "L-PERSON", "U-PERSON"],
        ),
    )
    for iob_tags, expected_biluo in cases:
        assert iob_to_biluo(iob_tags) == expected_biluo
def from_spacy_doc(cls, doc, map_spacy_entities_to_presidio=True, scheme="BILUO"):
    """Build a ``cls`` instance from the entity annotations of a spaCy doc.

    One ``Span`` is created per entity in ``doc.ents`` and one tag string is
    produced per token in ``doc``.

    :param doc: spaCy document; its ``ents``, ``text`` and tokens are read.
    :param map_spacy_entities_to_presidio: If true, each entity label is
        passed through ``cls.rename_from_spacy_tags``; otherwise the spaCy
        label is used unchanged.
    :param scheme: Tagging scheme for the token tags. One of ``"BILUO"``,
        ``"BILOU"``, ``"BIO"`` or ``"IOB"``.
    :return: The object returned by calling ``cls`` with the collected
        spans, the doc as tokens, and the token tags.
    :raises ValueError: If ``scheme`` is not one of the supported values.
    """
    if scheme not in ("BILUO", "BILOU", "BIO", "IOB"):
        raise ValueError(
            'scheme should be one of "BILUO","BILOU","BIO","IOB"')

    spans = []
    for ent in doc.ents:
        entity_type = (cls.rename_from_spacy_tags(ent.label_)
                       if map_spacy_entities_to_presidio else ent.label_)
        spans.append(
            Span(
                entity_type=entity_type,
                entity_value=ent.text,
                start_position=ent.start_char,
                end_position=ent.end_char,
            )
        )

    # spaCy reports ``ent_iob_ == ""`` for tokens with no entity annotation
    # set. Treat those like "O"; otherwise they would become the malformed
    # tag "-", which is not a valid IOB tag for the BILUO conversion below.
    tags = [
        "O" if token.ent_iob_ in ("O", "")
        else f"{token.ent_iob_}-{token.ent_type_}"
        for token in doc
    ]
    if scheme in ("BILUO", "BILOU"):
        tags = iob_to_biluo(tags)

    return cls(full_text=doc.text,
               masked=None,
               spans=spans,
               tokens=doc,
               tags=tags,
               create_tags_from_span=False,
               scheme=scheme)
def from_spacy_doc(cls, doc: "Doc", translate_tags: bool = True,
                   scheme: str = "BILUO") -> "InputSample":
    """Build a ``cls`` instance from the entity annotations of a spaCy doc.

    One ``Span`` is created per entity in ``doc.ents`` and one tag string is
    produced per token in ``doc``.

    :param doc: spaCy document; its ``ents``, ``text`` and tokens are read.
    :param translate_tags: If true, each entity label is passed through
        ``cls.rename_from_spacy_tag``; otherwise the spaCy label is used
        unchanged.
    :param scheme: Tagging scheme for the token tags. One of ``"BILUO"``,
        ``"BILOU"``, ``"BIO"`` or ``"IOB"``.
    :return: The object returned by calling ``cls`` with the collected
        spans, the doc as tokens, and the token tags.
    :raises ValueError: If ``scheme`` is not one of the supported values.
    """
    if scheme not in ("BILUO", "BILOU", "BIO", "IOB"):
        raise ValueError(
            'scheme should be one of "BILUO","BILOU","BIO","IOB"')

    spans = []
    for ent in doc.ents:
        entity_type = (cls.rename_from_spacy_tag(ent.label_)
                       if translate_tags else ent.label_)
        spans.append(
            Span(
                entity_type=entity_type,
                entity_value=ent.text,
                start_position=ent.start_char,
                end_position=ent.end_char,
            )
        )

    # spaCy reports ``ent_iob_ == ""`` for tokens with no entity annotation
    # set. Treat those like "O"; otherwise they would become the malformed
    # tag "-", which is not a valid IOB tag for the BILUO conversion below.
    tags = [
        "O" if token.ent_iob_ in ("O", "")
        else f"{token.ent_iob_}-{token.ent_type_}"
        for token in doc
    ]
    if scheme in ("BILUO", "BILOU"):
        tags = iob_to_biluo(tags)

    return cls(
        full_text=doc.text,
        masked=None,
        spans=spans,
        tokens=doc,
        tags=tags,
        create_tags_from_span=False,
        scheme=scheme,
    )
def test_issue2385_biluo(tags):
    """Check that tags already in BILUO form come back unchanged, as a list."""
    converted = iob_to_biluo(tags)
    unchanged = list(tags)
    assert converted == unchanged