Exemplo n.º 1
0
def test_token_indices():
    """Sentence.to_original_text() must reproduce the raw input string exactly."""
    german = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    # (raw text, tokenizer or None for the default) pairs to round-trip.
    cases = [
        (":    nation on", None),
        (":    nation on", SegtokTokenizer()),
        ("I love Berlin.", None),
        (german, None),
        (german, SegtokTokenizer()),
    ]
    for raw, tokenizer in cases:
        if tokenizer is None:
            sentence = Sentence(raw)
        else:
            sentence = Sentence(raw, use_tokenizer=tokenizer)
        assert raw == sentence.to_original_text()
Exemplo n.º 2
0
def test_html_rendering():
    """Rendered NER HTML wraps each span with its configured color/label."""
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &")
    # Stub out the sentence so no real tagging is needed.
    sent = Sentence()
    sent.get_spans = MagicMock(return_value=[
        mock_ner_span("PER", 0, 13),
        mock_ner_span("MISC", 35, 47),
        mock_ner_span("LOC", 109, 111),
    ])
    sent.to_original_text = MagicMock(return_value=text)

    settings = {"colors": {"LOC": "yellow"}, "labels": {"LOC": "location"}}
    actual = Visualizer.render_ner_html([sent], settings=settings)

    # Assemble the expected body from tagged entities and the plain text
    # between them; note the LOC span uses the overridden color and label.
    parts = [
        TAGGED_ENTITY.format(color="#F7FF53", entity="Boris Johnson", label="PER"),
        " has been elected new ",
        TAGGED_ENTITY.format(color="#4647EB", entity="Conservative", label="MISC"),
        " leader in a ballot of party members and will become the next ",
        TAGGED_ENTITY.format(color="yellow", entity="UK", label="location"),
        " prime minister. &",
    ]
    expected_res = HTML_PAGE.format(text="".join(parts))

    assert expected_res == actual
Exemplo n.º 3
0
def test_html_rendering():
    """render_ner_html applies the supplied color map to every tagged span."""
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &")
    # Stub the sentence: fixed spans, fixed original text.
    sent = Sentence()
    sent.get_spans = MagicMock(return_value=[
        mock_ner_span(text, "PER", 0, 13),
        mock_ner_span(text, "MISC", 35, 47),
        mock_ner_span(text, "LOC", 109, 111),
    ])
    sent.to_original_text = MagicMock(return_value=text)

    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }
    actual = render_ner_html([sent], colors=colors)

    # Expected body: tagged entities interleaved with the untouched text.
    parts = [
        TAGGED_ENTITY.format(color="#F7FF53", entity="Boris Johnson", label="PER"),
        " has been elected new ",
        TAGGED_ENTITY.format(color="#4647EB", entity="Conservative", label="MISC"),
        " leader in a ballot of party members and will become the next ",
        TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC"),
        " prime minister. &",
    ]
    expected_res = HTML_PAGE.format(
        text=PARAGRAPH.format(sentence="".join(parts)),
        title="Flair",
    )

    assert expected_res == actual
Exemplo n.º 4
0
def test_token_indices():
    """Round-trip every text through Sentence and back via to_original_text()."""
    german = ('Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei '
              '" in einer Weise aufgetreten , die alles andere als '
              'überzeugend war " .')
    # Each case: raw text plus whether the tokenizer is enabled.
    for raw, tokenize in [
        (':    nation on', False),
        (':    nation on', True),
        ('I love Berlin.', False),
        (german, False),
        (german, True),
    ]:
        if tokenize:
            sentence = Sentence(raw, use_tokenizer=True)
        else:
            sentence = Sentence(raw)
        assert raw == sentence.to_original_text()
Exemplo n.º 5
0
def split_to_spans(s: "Sentence"):
    """Split a tagged sentence into consecutive ``(text, tag)`` chunks.

    Chunks covered by an NER span carry that span's tag; the stretches of
    text between (and around) spans are emitted with tag ``None`` so that
    concatenating all chunk texts reproduces the original sentence text.

    :param s: a Sentence providing ``to_original_text()`` and
        ``get_spans('ner')`` (spans expose start_pos/end_pos/tag).
    :return: list of (chunk_text, tag_or_None) tuples.
    """
    orig = s.to_original_text()
    spans = []
    last_idx = 0
    for ent in s.get_spans('ner'):
        # Untagged text preceding this entity, if any.
        if last_idx != ent.start_pos:
            spans.append((orig[last_idx:ent.start_pos], None))
        spans.append((orig[ent.start_pos:ent.end_pos], ent.tag))
        last_idx = ent.end_pos
    # Bug fix: the original condition was `last_idx < len(orig) - 1`, which
    # silently dropped a trailing untagged chunk of exactly one character.
    if last_idx < len(orig):
        spans.append((orig[last_idx:], None))
    return spans
Exemplo n.º 6
0
def split_to_spans(s: "Sentence"):
    """Split a labeled sentence into consecutive ``(text, value)`` chunks.

    Chunks covered by an NER label carry that label's value; the stretches
    between (and around) labeled spans are emitted with value ``None``.

    :param s: a Sentence providing ``to_original_text()`` and
        ``get_labels('ner')`` (labels expose .value and .span with
        start_pos/end_pos/text).
    :return: list of (chunk_text, value_or_None) tuples.
    """
    orig = s.to_original_text()
    spans = []
    last_idx = 0
    for ent in s.get_labels("ner"):
        # Untagged text preceding this labeled span, if any.
        if last_idx != ent.span.start_pos:
            spans.append((orig[last_idx:ent.span.start_pos], None))
        spans.append((ent.span.text, ent.value))
        assert ent.span.end_pos is not None
        last_idx = ent.span.end_pos
    # Bug fix: the original condition was `last_idx < len(orig) - 1`, which
    # silently dropped a trailing untagged chunk of exactly one character.
    if last_idx < len(orig):
        spans.append((orig[last_idx:], None))
    return spans
Exemplo n.º 7
0
    def _label(self, sentence: Sentence):
        """
        Add a complex label to ``sentence`` for every regexp match of every
        registered mapping.

        :param sentence: sentence whose original text is scanned; matching
            token spans receive a SpanLabel.
        :raises Exception: if a match span does not align with token
            boundaries (i.e. it partially overlaps a token).
        """
        collection = RegexpTagger.TokenCollection(sentence)
        # Hoisted out of the loop: the original text is invariant across patterns.
        original_text = sentence.to_original_text()

        for label, pattern in self._regexp_mapping.items():
            for match in pattern.finditer(original_text):
                span: Tuple[int, int] = match.span()
                try:
                    token_span = collection.get_token_span(span)
                except ValueError as error:
                    # Chain the underlying alignment error so the root cause
                    # stays visible in the traceback.
                    raise Exception(
                        f"The match span {span} for label '{label}' is overlapping with a token!"
                    ) from error
                sentence.add_complex_label(label, SpanLabel(token_span, label))
Exemplo n.º 8
0
def get_reason_for_appearance(organisation: Span, sentence: Sentence):
    """ Extract the reason for the appearance of an 'ORG' NER tag in a sentence. """
    org_end = organisation.end_pos
    # Frame tags located strictly after the organisation mention.
    frames_after = [
        span for span in sentence.get_spans("frame") if span.start_pos > org_end
    ]
    # Past-tense verb POS tags (VBD) located strictly after the organisation.
    verbs_after = [
        span
        for span in sentence.get_spans("pos")
        if "VBD" in span.tag and span.start_pos > org_end
    ]
    # Nothing usable after the mention: no reason can be extracted.
    if not frames_after and not verbs_after:
        return None

    # Prefer frame tags; fall back to POS tags otherwise.
    anchor = frames_after[0] if frames_after else verbs_after[0]
    # The reason is everything from the anchor to the end of the sentence.
    return sentence.to_original_text()[anchor.start_pos:]
Exemplo n.º 9
0
    def transform(self, X, y=None, **kwargs):
        """
        Transform the 'text' column of X into a matrix of sentence embeddings.

        :param X: features - Dataframe with a 'text' column
        :param y: target vector - Series (unused; kept for API compatibility)
        :param kwargs: free parameters - dictionary (unused)
        :return: X: the transformed data - a numpy array of stacked
            per-sentence embedding vectors
        """

        X = X['text']

        # Whole-dataset cache keyed on the texts plus the embedder's config;
        # valid within this process only (str hashing is per-process salted).
        dataset_hash = hash(str(X) + str(self.embedder.__dict__))
        if dataset_hash in self.dataset_cache:
            return self.dataset_cache[dataset_hash]

        embeddings = []
        for first in trange(0, len(X), self.batch_size):
            # Embed one batch of sentences at a time to bound memory use.
            subset = X[first:first + self.batch_size]
            sentences = [Sentence(element) for element in subset]

            self.embedder.embed(sentences)
            for sentence in sentences:
                key = sentence.to_original_text()
                # Per-sentence vector cache avoids re-embedding repeated texts.
                if key in self.vector_cache:
                    vector = self.vector_cache[key]
                else:
                    vector = sentence.get_embedding().cpu().detach().numpy()
                    self.vector_cache[key] = vector
                embeddings.append(vector)

        embedding_dataset = numpy.vstack(embeddings)
        self.dataset_cache[dataset_hash] = embedding_dataset
        return embedding_dataset