Exemplo n.º 1
0
 def visualize_ner_tags(self,
                        display_index=range(5),
                        save_display_html: bool = False,
                        save_all_html: bool = True,
                        **kwargs):
     """Display a sample of the tagged sentences as HTML and optionally save.

     Args:
         display_index: Iterable of indices into ``self.sentences`` that
             selects the sentences shown inline.
         save_display_html: When true, write the displayed sample to
             ``sentences_true_example.html`` under ``self.path``.
         save_all_html: When true, render every sentence in
             ``self.sentences`` and write the result to
             ``sentences_true_all.html`` under ``self.path``.
         **kwargs: Forwarded unchanged to ``render_ner_html``.
     """
     selected = [self.sentences[idx] for idx in display_index]
     sample_html = render_ner_html(selected, **kwargs)
     display(HTML(sample_html))

     if save_display_html:
         sample_path = self.path / 'sentences_true_example.html'
         sample_path.write_text(sample_html)

     if not save_all_html:
         return

     # The full export is rendered separately from the displayed sample.
     full_html = render_ner_html(self.sentences, **kwargs)
     full_path = self.path / 'sentences_true_all.html'
     full_path.write_text(full_html)
Exemplo n.º 2
0
def test_html_rendering():
    """Check that render_ner_html output matches the page built from templates.

    Three token ranges of a real ``Sentence`` are labelled (PER, MISC, LOC)
    and the rendered page is compared with the string assembled from
    ``HTML_PAGE``, ``PARAGRAPH`` and ``TAGGED_ENTITY``.
    """
    text = ("Boris Johnson has been elected new Conservative leader in "
            "a ballot of party members and will become the "
            "next UK prime minister. &")
    sentence = Sentence(text)

    # Label one token range at a time, printing what add_label returns.
    labelled_ranges = (
        (slice(0, 2), "PER"),
        (slice(6, 7), "MISC"),
        (slice(19, 20), "LOC"),
    )
    for token_range, tag in labelled_ranges:
        print(sentence[token_range].add_label("ner", tag))

    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }
    actual = render_ner_html([sentence], colors=colors)

    # Assemble the expected paragraph piece by piece.
    person = TAGGED_ENTITY.format(
        color="#F7FF53", entity="Boris Johnson", label="PER")
    party = TAGGED_ENTITY.format(
        color="#4647EB", entity="Conservative", label="MISC")
    country = TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC")
    paragraph_body = (
        person
        + " has been elected new "
        + party
        + " leader in a ballot of party members and will become the next "
        + country
        + " prime minister. &"
    )
    expected_res = HTML_PAGE.format(
        text=PARAGRAPH.format(sentence=paragraph_body),
        title="Flair",
    )

    assert expected_res == actual
Exemplo n.º 3
0
def test_html_rendering():
    """Check render_ner_html against the templates using a mocked sentence.

    ``get_spans`` and ``to_original_text`` on an empty ``Sentence`` are
    replaced by mocks returning three spans (PER, MISC, LOC) built with
    ``mock_ner_span`` and the raw text respectively.
    """
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &")
    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }

    # (label, start, end) triples handed to mock_ner_span.
    span_specs = (("PER", 0, 13), ("MISC", 35, 47), ("LOC", 109, 111))

    sent = Sentence()
    sent.get_spans = MagicMock(return_value=[
        mock_ner_span(text, tag, start, end) for tag, start, end in span_specs
    ])
    sent.to_original_text = MagicMock(return_value=text)

    actual = render_ner_html([sent], colors=colors)

    # Assemble the expected paragraph piece by piece.
    person = TAGGED_ENTITY.format(
        color="#F7FF53", entity="Boris Johnson", label="PER")
    party = TAGGED_ENTITY.format(
        color="#4647EB", entity="Conservative", label="MISC")
    country = TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC")
    paragraph_body = (
        person
        + " has been elected new "
        + party
        + " leader in a ballot of party members and will become the next "
        + country
        + " prime minister. &"
    )
    expected_res = HTML_PAGE.format(
        text=PARAGRAPH.format(sentence=paragraph_body),
        title="Flair",
    )

    assert expected_res == actual
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Tag every ``.txt`` file in a folder and write one HTML page per file.

    Args:
        data_folder: Directory scanned (non-recursively) for ``.txt`` files.
        output_folder: Directory receiving ``<case name>.html`` for each input.
        model_folder: Directory containing the ``best-model.pt`` tagger file.
    """
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".txt")
    ]
    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))

    for filename in tqdm(iterable=filenames,
                         unit=" txt",
                         desc="anonymize cases"):
        # Explicit UTF-8 so accented characters do not depend on the
        # platform's default locale encoding.
        with open(os.path.join(data_folder, filename), 'r',
                  encoding='utf-8') as input_f:
            lines = input_f.readlines()

        # The input handle is already closed here; prediction and rendering
        # only need the lines held in memory.
        sentences = tagger.predict(sentences=lines,
                                   mini_batch_size=32,
                                   verbose=False,
                                   use_tokenizer=tokenizer)

        # splitext strips only the final extension, so "a.b.txt" becomes
        # "a.b" rather than colliding with every other "a.*" file.
        case_name = os.path.splitext(filename)[0]
        page_html = render_ner_html(sentences,
                                    colors=colors,
                                    title=case_name)

        with open(os.path.join(output_folder, case_name + ".html"),
                  "w", encoding="utf-8") as output:
            output.write(page_html)
Exemplo n.º 5
0
    def predict(self,
                sentences: Union[str, Sentence, List[Sentence], List[str]],
                display_html: bool = True,
                html_file: str = None,
                display_str: bool = False,
                **kwargs):
        """Run the model on the given text and show or save the tagged result.

        Args:
            sentences: A single ``Sentence``, a raw string (passed through
                ``split_single``), or a list of either.
            display_html: When true, display the rendered HTML inline.
            html_file: Optional file name under ``self.path``; when given,
                the rendered HTML is written there.
            display_str: When true, print ``to_tagged_string()`` for each
                sentence.
            **kwargs: Forwarded unchanged to ``render_ner_html``.
        """
        # isinstance also accepts subclasses of Sentence / str, which the
        # exact type comparison rejected.
        if isinstance(sentences, Sentence):
            sentences = [sentences]
        elif isinstance(sentences, str):
            sentences = split_single(sentences)

        # Guard against empty input: indexing [0] on an empty list would
        # raise IndexError before the model is even called.
        if sentences and isinstance(sentences[0], str):
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

        self.model.predict(sentences)

        if display_html or html_file:
            # Render once and reuse the markup for both display and export.
            html = render_ner_html(sentences, **kwargs)
            if display_html:
                display(HTML(html))
            if html_file:
                (self.path / html_file).write_text(html)
        if display_str:
            for sentence in sentences:
                print(sentence.to_tagged_string())
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """Tag a sample of paragraphs from XML files and write ``sentence.html``.

    For every ``.xml`` file in ``data_folder`` that yields MORE than
    ``top_n`` paragraphs, the first ``top_n`` non-empty paragraphs are kept.
    Files with ``top_n`` paragraphs or fewer contribute nothing.

    Args:
        data_folder: Directory scanned (non-recursively) for ``.xml`` files.
        model_folder: Directory containing the ``best-model.pt`` tagger file.
        top_n: Number of leading paragraphs considered per file.

    Raises:
        Exception: If no sentence could be loaded at all.
    """
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".xml")
    ]
    sentences: List[Sentence] = []
    with tqdm(total=len(filenames), unit=" XML",
              desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            # Only files with more than top_n paragraphs are sampled.
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if not sentences:
        raise Exception(
            "No example loaded, causes: no cases in provided path or sample size is to high"
        )

    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    # predict() is called for its effect on `sentences`; the return value
    # is not used.
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    # Explicit UTF-8 so accented characters in the output do not depend on
    # the platform's default locale encoding.
    with open("sentence.html", "w", encoding="utf-8") as writer:
        writer.write(page_html)
Exemplo n.º 7
0
def predict_flair(model, text):
    """Tag ``text`` with ``model`` and return the highlighted HTML fragment.

    Args:
        model: Object exposing ``predict(sentence)``.
        text: Raw text to wrap in a ``Sentence`` and tag.

    Returns:
        The value of ``render_ner_html`` called with ``wrap_page=False``.
    """
    # Build the sentence from the `text` argument; the previous body ignored
    # it and read the unrelated name `manual_user_input` instead.
    manual_sentence = Sentence(text)
    model.predict(manual_sentence)
    return render_ner_html(manual_sentence, colors=colors, wrap_page=False)
Exemplo n.º 8
0
                    # not the first one, put empty
                    word.text = ""
                inside_parenthesis = True
            elif tag != "O" and not any([True for i in to_skip if i in tag]):
                inside_parenthesis = False
                if word.text.lower() not in replacement_dict:
                    replacement_dict[word.text.lower()] = pseudo[len(replacement_dict)]
                word.text = replacement_dict[word.text.lower()]
            else:
                inside_parenthesis = False

# Highlight colour (hex string) for each entity label; handed to
# render_ner_html through its `colors` argument.
colors = dict(
    ETABLISSEMENT="#35c2b2",
    ADDRESS="#FFAE62",
    ORGANIZATION="#FFB990",
    SITE="#ff8800",
    HOPITAL="#edddcb",
    MEDIA="#e966c4",
    MAIL="#1688cb",
    ETAT="#00c5ed",
    RESIDENCE="#94bce1",
    PERSONNE_DE_JUSTICE="#89B2C4",
    GROUPE="#9cae64",
    DATE="#F9E17D",
    NUMEROS="#F8485E",
    PERS="#FA7268",
    FONDS="#C3FF1F",
)

# Render `paragraphs` as an HTML fragment (wrap_page=False, so no surrounding
# page) and hand it to st.write. NOTE(review): `st` is presumably Streamlit,
# where unsafe_allow_html=True makes the markup render rather than being
# escaped - confirm, and make sure `paragraphs` never carries untrusted HTML.
st.write(render_ner_html(sentences=paragraphs, colors=colors, wrap_page=False), unsafe_allow_html=True)