def predict(self, sample: InputSample) -> List[str]:
    """Predict PII tags for a sample using a stanza model.

    :param sample: InputSample with text
    :return: list of tags (one label per spaCy token)
    """
    doc = self.model(sample.full_text)

    # Stanza tokens might not be consistent with spaCy's tokens.
    # Use spaCy tokenization (not stanza's) to maintain consistency
    # with other models. Done up front so the no-entity branch below
    # also has tokens to measure against (previously it would fail or
    # return a wrong-length list when sample.tokens was empty).
    if not sample.tokens:
        sample.tokens = tokenize(sample.full_text)

    if doc.ents:
        tags, texts, start, end = zip(
            *[(s.label_, s.text, s.start_char, s.end_char) for s in doc.ents]
        )
        # Create tags (label per token) based on stanza spans and spaCy tokens.
        tags = span_to_tag(
            scheme=self.labeling_scheme,
            text=sample.full_text,
            starts=start,
            ends=end,
            tags=tags,
            tokens=sample.tokens,
        )
    else:
        # No entities detected: every token is outside ("O").
        tags = ["O"] * len(sample.tokens)

    if len(tags) != len(sample.tokens):
        print("mismatch between input tokens and new tokens")
    return tags
def predict(self, sample: InputSample) -> List[str]:
    """Predict PII tags for a sample using a Flair model.

    :param sample: InputSample with text
    :return: list of tags (one label per spaCy token)
    """
    sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
    self.model.predict(sentence)
    ents = sentence.get_spans("ner")

    # Flair tokens might not be consistent with spaCy's tokens (even when
    # using the spaCy tokenizer). Use spaCy tokenization to maintain
    # consistency with other models. Done up front so the no-entity branch
    # below also has tokens to measure against (previously it would fail or
    # return a wrong-length list when sample.tokens was empty).
    if not sample.tokens:
        sample.tokens = tokenize(sample.full_text)

    if ents:
        tags, texts, start, end = zip(
            *[(ent.tag, ent.text, ent.start_pos, ent.end_pos) for ent in ents]
        )
        # Flair's tag for PERSON is PER; normalize to PERSON.
        tags = [tag if tag != "PER" else "PERSON" for tag in tags]

        # Create tags (label per token) based on Flair spans and spaCy tokens.
        tags = span_to_tag(
            scheme="IO",
            text=sample.full_text,
            starts=start,
            ends=end,
            tags=tags,
            tokens=sample.tokens,
        )
    else:
        # No entities detected: every token is outside ("O").
        tags = ["O"] * len(sample.tokens)

    if len(tags) != len(sample.tokens):
        print("mismatch between input tokens and new tokens")
    return tags
def test_evaluate_sample_wrong_entities_to_keep_correct_statistics():
    """When the predicted entity type is not in entities_to_keep,
    every token is counted as a ('O', 'O') match."""
    predicted_tags = ["O", "O", "O", "U-ANIMAL"]
    mock_model = MockTokensModel(
        prediction=predicted_tags, entities_to_keep=['SPACESHIP']
    )

    sample = InputSample(
        full_text="I am the walrus",
        masked="I am the [ANIMAL]",
        spans=None,
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    result = mock_model.evaluate_sample(sample)
    assert result.results[("O", "O")] == 4
def test_evaluate_same_entity_correct_statistics():
    """One correct ANIMAL match plus one false positive yield
    2x ('O','O'), 1x ('ANIMAL','ANIMAL') and 1x ('O','ANIMAL')."""
    predicted_tags = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    mock_model = MockTokensModel(prediction=predicted_tags)
    evaluator = Evaluator(model=mock_model, entities_to_keep=["ANIMAL"])

    # NOTE(review): full_text says "dog" but tokens[1] is "am" — the
    # evaluation is tag-based so the assertions are unaffected; confirm
    # whether the fixture text should match the tokens.
    sample = InputSample(
        full_text="I dog the walrus",
        masked="I [ANIMAL] the [ANIMAL]",
        spans=None,
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    outcome = evaluator.evaluate_sample(sample, predicted_tags)
    assert outcome.results[("O", "O")] == 2
    assert outcome.results[("ANIMAL", "ANIMAL")] == 1
    assert outcome.results[("O", "ANIMAL")] == 1
def test_evaluate_multiple_tokens_correct_statistics():
    """A fully correct multi-token (BIL) ANIMAL span scores perfect
    precision and recall."""
    predicted_tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]
    mock_model = MockTokensModel(prediction=predicted_tags)
    evaluator = Evaluator(model=mock_model, entities_to_keep=["ANIMAL"])

    # NOTE(review): full_text spells "amaericanus" while tokens use
    # "americanus" — tag-based evaluation is unaffected; verify intent.
    sample = InputSample(
        "I am the walrus amaericanus magnifico", masked=None, spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    per_sample = evaluator.evaluate_sample(sample, predicted_tags)
    scores = evaluator.calculate_score([per_sample])

    assert scores.pii_precision == 1
    assert scores.pii_recall == 1
def test_evaluate_multiple_examples_correct_statistics():
    """Per sample: one true positive, one false positive and one false
    negative PERSON — 0.5 precision and 0.5 recall across four copies."""
    predicted_tags = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"]
    mock_model = MockTokensModel(prediction=predicted_tags)
    evaluator = Evaluator(model=mock_model, entities_to_keep=["PERSON"])

    sample = InputSample("My name is Raphael or David", masked=None, spans=None)
    sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    per_sample_results = evaluator.evaluate_all([sample] * 4)
    scores = evaluator.calculate_score(per_sample_results)

    assert scores.pii_precision == 0.5
    assert scores.pii_recall == 0.5
def test_evaluator_simple():
    """A prediction identical to the gold tags gives perfect
    precision and recall."""
    predicted_tags = ["O", "O", "O", "U-ANIMAL"]
    mock_model = MockTokensModel(
        prediction=predicted_tags, entities_to_keep=['ANIMAL']
    )

    sample = InputSample(
        full_text="I am the walrus",
        masked="I am the [ANIMAL]",
        spans=None,
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    per_sample = mock_model.evaluate_sample(sample)
    scores = mock_model.calculate_score([per_sample])

    assert scores.pii_precision == 1
    assert scores.pii_recall == 1
def test_evaluate_multiple_entities_to_keep_correct_statistics():
    """With several entities_to_keep, only ANIMAL appears in the data:
    one match, one false positive and two ('O','O') pairs."""
    predicted_tags = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    mock_model = MockTokensModel(
        prediction=predicted_tags,
        labeling_scheme='BIO',
        entities_to_keep=['ANIMAL', 'PLANT', 'SPACESHIP'],
    )

    sample = InputSample(
        full_text="I dog the walrus",
        masked="I [ANIMAL] the [ANIMAL]",
        spans=None,
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    outcome = mock_model.evaluate_sample(sample)
    assert outcome.results[("O", "O")] == 2
    assert outcome.results[("ANIMAL", "ANIMAL")] == 1
    assert outcome.results[("O", "ANIMAL")] == 1
def test_evaluate_multiple_tokens_no_match_match_correct_statistics():
    """Predicting only an out-of-scope SPACESHIP span while missing the
    ANIMAL span leaves precision undefined (NaN) and recall at zero."""
    predicted_tags = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"]
    mock_model = MockTokensModel(
        prediction=predicted_tags, entities_to_keep=['ANIMAL']
    )

    sample = InputSample(
        "I am the walrus amaericanus magnifico", masked=None, spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    per_sample = mock_model.evaluate_sample(sample)
    scores = mock_model.calculate_score([per_sample])

    assert np.isnan(scores.pii_precision)
    assert scores.pii_recall == 0
def test_evaluate_multiple_examples_ignore_entity_correct_statistics():
    """A PERSON mislabeled as TENNIS_PLAYER still counts as detected PII,
    so PII-level precision and recall stay at 1."""
    predicted_tags = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"]
    mock_model = MockTokensModel(
        prediction=predicted_tags,
        labeling_scheme='BILOU',
        entities_to_keep=['PERSON', 'TENNIS_PLAYER'],
    )

    sample = InputSample("My name is Raphael or David", masked=None, spans=None)
    sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    per_sample_results = mock_model.evaluate_all([sample] * 4)
    scores = mock_model.calculate_score(per_sample_results)

    assert scores.pii_precision == 1
    assert scores.pii_recall == 1
self.analyze_template = template return requested_fields = [] for entity in entities: for field in template['fields']: if entity == field['name']: requested_fields.append(field) new_template = {'fields': requested_fields} self.analyze_template = new_template if __name__ == "__main__": # Example: text = "My siblings are Dan and magen" bilou_tags = ['O', 'O', 'O', 'U-PERSON', 'O', 'U-PERSON'] presidio = PresidioAPIEvaluator(verbose=True, all_fields=True, compare_by_io=True) tokens = tokenize(text) s = InputSample(text, masked=None, spans=None) s.tokens = tokens s.tags = bilou_tags evaluated_sample = presidio.evaluate_sample(s) p, r, entity_recall, f, mistakes = presidio.calculate_score([evaluated_sample]) print("Precision = {}\n" "Recall = {}\n" "F_3 = {}\n" "Errors = {}".format(p, r, f, mistakes))