Пример #1
0
def test_fusion_strategy_with_multiple_overlaps_highest_score_last():
    """Check `fusion` on three entities that all start at char 64.

    The expected result is only the EMAIL entity (score 1.0), which is the last entity in the input list.
    """
    misc_ent = NamedEntity(start_char=64, end_char=71, tag="MISC", text="han.solo", score=0.92,
                           recognizer="SomeRecognizer")
    org_ent = NamedEntity(start_char=64, end_char=85, tag="ORG", text="*****@*****.**", score=0.83,
                          recognizer="SomeRecognizer")
    email_ent = NamedEntity(start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0,
                            recognizer="AnotherRecognizer")

    # The expected entity is built as a separate instance rather than reusing the input object.
    surviving_ent = NamedEntity(start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0,
                                recognizer="AnotherRecognizer")

    fused = combine([misc_ent, org_ent, email_ent], strategy="fusion")
    assert fused == [surviving_ent]
Пример #2
0
def test_smart_fusion_strategy_with_double_match_and_overlap():
    """Check `smart-fusion` on a mix of isolated entities and entities sharing a start offset.

    The PER and LOC entities are expected unchanged. The MISC entity and the two EMAIL matches (score 0.5 each,
    from different recognizers), all starting at char 64, are expected to yield one EMAIL entity with score 1.0.
    """
    # Column order of the row tuples below; each row is expanded into NamedEntity keyword arguments.
    fields = ("start_char", "end_char", "tag", "text", "score", "recognizer")
    input_rows = [
        (0, 8, "PER", "Han Solo", 0.94, "SomeRecognizer"),
        (64, 71, "MISC", "han.solo", 0.99, "SomeRecognizer"),
        (64, 85, "EMAIL", "*****@*****.**", 0.5, "SomeRecognizer"),
        (64, 85, "EMAIL", "*****@*****.**", 0.5, "AnotherRecognizer"),
        (100, 108, "LOC", "Tatooine", 0.98, "SomeRecognizer"),
    ]
    expected_rows = [
        (0, 8, "PER", "Han Solo", 0.94, "SomeRecognizer"),
        (64, 85, "EMAIL", "*****@*****.**", 1.0, "AnotherRecognizer"),
        (100, 108, "LOC", "Tatooine", 0.98, "SomeRecognizer"),
    ]

    ents = [NamedEntity(**dict(zip(fields, row))) for row in input_rows]
    expected_ents = [NamedEntity(**dict(zip(fields, row))) for row in expected_rows]

    combined = combine(ents, strategy="smart-fusion")
    assert combined == expected_ents
Пример #3
0
def test_disjunctive_union_strategy_with_overlapping_ents():
    """Check that `disjunctive_union` raises an AssertionError for input with entities sharing a start offset."""
    person = NamedEntity(start_char=0, end_char=8, tag="PER", text="Han Solo", score=0.94,
                         recognizer="SomeRecognizer")
    # The next two entities both start at char 64.
    misc = NamedEntity(start_char=64, end_char=71, tag="MISC", text="han.solo", score=0.92,
                       recognizer="SomeRecognizer")
    email = NamedEntity(start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0,
                        recognizer="AnotherRecognizer")
    overlapping_ents = [person, misc, email]

    with pytest.raises(AssertionError):
        combine(overlapping_ents, strategy="disjunctive_union")
Пример #4
0
def recognize(text: str, config: Config, combination_strategy=None, context_words=False, return_tokens=True) -> dict:
    """Find personally identifiable data in the given text and return it.

    :param text: the text that is searched for named entities
    :param config: pass a config object to configure the recognition methods
    :param combination_strategy: choose from None, `disjunctive_union`, `fusion`, and `smart-fusion`; see the docs of
        `combination_strategies.combine` for more details
    :param context_words: if True, use context words to boost the score of entities: an entity is boosted if one of
        its recognizer's context words appears in the entity's sentence. Setting `context_words` to True will also
        align each entity's start/end to the nearest token's start/end
    :param return_tokens: compute and return the tokenization; this will also align each entity's start/end to the
        nearest token's start/end
    :return: a dict holding the combined entities under the key `ents` and, if `return_tokens` is True, the
        tokenization under the key `tokens`
    """
    analyzer.update_config(config)
    recognition_results = analyzer.run_recognition(text)

    # Flatten the nested recognition results into one list; empty results simply yield an empty list.
    ents = [ent for recognized_ents in recognition_results for ent in recognized_ents]

    result = {}
    tokens = []
    if return_tokens or context_words:
        # Tokens are needed both for the returned tokenization and for the context-word lookup below.
        analyzer.tokenizer.tokenize(text)
        tokens = analyzer.tokenizer.get_tokens()
        # Align entities with tokens.
        EntityAligner().align_entities_with_tokens(ents, tokens)

    # Combine entities only after they have been aligned.
    ents = combine(ents, strategy=combination_strategy)

    if return_tokens:
        result["tokens"] = tokens

    if context_words:
        for ent in ents:
            # Fetch the entity's sentence, excluding the entity's own tokens.
            sentence_tokens = analyzer.tokenizer.get_sentence_for_token(
                ent.start_tok, exclude_tokens=list(range(ent.start_tok, ent.end_tok))
            )
            # A set gives constant-time membership checks for the context-word test.
            sentence_words = {token.text for token in sentence_tokens}
            # Use a separate name so the boolean `context_words` parameter is not rebound to a word list.
            recognizer_context_words = analyzer.recognizer_lookup[ent.recognizer].CONTEXT_WORDS
            if any(word in sentence_words for word in recognizer_context_words):
                # Boost the score in place, capped at 1.0.
                ent.score = min(ent.score * analyzer.config.context_word_confidence_boost_factor, 1.0)

    result["ents"] = ents
    return result
Пример #5
0
def test_fusion_strategy_with_disjoint_ents():
    """Check that `fusion` returns a list equal to its input when no two entities share any characters."""
    fields = ("start_char", "end_char", "tag", "text", "score", "recognizer")
    # Character spans 0-8, 47-59 and 64-85 do not intersect.
    rows = [
        (0, 8, "PER", "Han Solo", 0.94, "SomeRecognizer"),
        (47, 59, "MISC", "Han's E-Mail", 0.83, "SomeRecognizer"),
        (64, 85, "EMAIL", "*****@*****.**", 1.0, "AnotherRecognizer"),
    ]
    ents = [NamedEntity(**dict(zip(fields, row))) for row in rows]

    fused = combine(ents, strategy="fusion")
    assert fused == ents
Пример #6
0
def test_none_strategy():
    """Check that `combine` with `strategy=None` returns a list equal to its input, overlaps included."""
    misc_ent = NamedEntity(start_char=64, end_char=71, tag="MISC", text="han.solo", score=0.92,
                           recognizer="SomeRecognizer")
    email_ent = NamedEntity(start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0,
                            recognizer="AnotherRecognizer")
    org_ent = NamedEntity(start_char=64, end_char=85, tag="ORG", text="*****@*****.**", score=0.83,
                          recognizer="SomeRecognizer")
    ents = [misc_ent, email_ent, org_ent]

    untouched = combine(ents, strategy=None)
    assert untouched == ents
Пример #7
0
def test_fusion_with_same_score_overlapping():
    """Check `fusion` on two entities with equal score (0.8) that start at the same character and token.

    The expected result contains only the longer LOC entity ("12345 Berlin").
    """
    # Keyword arguments common to every entity in this test.
    shared = dict(start_char=301, score=0.8, recognizer="AnotherRecognizer", start_tok=44)

    phone_ent = NamedEntity(end_char=306, tag="PHONE", text="12345", end_tok=45, **shared)
    loc_ent = NamedEntity(end_char=313, tag="LOC", text="12345 Berlin", end_tok=46, **shared)

    # The expected entity is built as a separate instance rather than reusing the input object.
    expected_loc = NamedEntity(end_char=313, tag="LOC", text="12345 Berlin", end_tok=46, **shared)

    fused = combine([phone_ent, loc_ent], strategy="fusion")
    assert fused == [expected_loc]