Example #1
    def __init__(
        self,
        text: Union[str, List[str]] = None,
        use_tokenizer: Union[bool, Tokenizer] = True,
        language_code: str = None,
        start_position: int = None
    ):
        """
        Class to hold all metadata related to a text (tokens, predictions, language code, ...)
        :param text: original string (sentence), or a list of string tokens (words)
        :param use_tokenizer: a custom :class:`Tokenizer`, e.g. :class:`SegtokTokenizer` (uses segtok) or
            :class:`SpacyTokenizer` (uses the spaCy library, if available). Check the implementations of the
            abstract class :class:`Tokenizer` or implement your own subclass if you need one. A plain callable
            of type Callable[[str], List[Token]] is also accepted (deprecated) and is wrapped in a
            :class:`TokenizerWrapper`. If this parameter is set to True (deprecated, the default),
            :class:`SegtokTokenizer` is used; if set to False, :class:`SpaceTokenizer` is used.
        :param language_code: Language of the sentence
        :param start_position: Start char offset of the sentence in the superordinate document
        """
        super().__init__()

        self.tokens: List[Token] = []

        self._embeddings: Dict = {}

        self.language_code: str = language_code

        self.start_pos = start_position
        self.end_pos = (
            start_position + len(text) if start_position is not None else None
        )

        if isinstance(use_tokenizer, Tokenizer):
            tokenizer = use_tokenizer
        elif callable(use_tokenizer):
            from flair.tokenization import TokenizerWrapper
            tokenizer = TokenizerWrapper(use_tokenizer)
        elif isinstance(use_tokenizer, bool):
            from flair.tokenization import SegtokTokenizer, SpaceTokenizer
            tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
        else:
            raise AssertionError("Unexpected type of parameter 'use_tokenizer'. "
                                 "Parameter should be bool, Callable[[str], List[Token]] (deprecated), or Tokenizer.")

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:
            if isinstance(text, (list, tuple)):
                # text is already tokenized: add each item as a token
                for token in text:
                    self.add_token(self._restore_windows_1252_characters(token))
            else:
                # text is a raw string: repair mis-encoded characters, then tokenize
                text = self._restore_windows_1252_characters(text)
                for token in tokenizer.tokenize(text):
                    self.add_token(token)

        # log a warning if an empty string was passed (it results in an empty Sentence)
        if text == "":
            log.warning(
                "Warning: An empty Sentence was created! Are there empty strings in your dataset?"
            )

        self.tokenized = None
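
The constructor above accepts three forms of use_tokenizer: a Tokenizer instance, a plain callable (deprecated), or a bool (deprecated). The following is a minimal usage sketch of all three forms, assuming the import layout shown in the snippet above (Sentence and Token in flair.data, tokenizers in flair.tokenization); whole_text_tokenizer is a hypothetical helper that mirrors the custom_tokenizer used in Example #4 below.

from flair.data import Sentence, Token
from flair.tokenization import SpaceTokenizer

# Explicit Tokenizer instance: handled by the isinstance(use_tokenizer, Tokenizer) branch.
whitespace_sentence = Sentence("I love Berlin.", use_tokenizer=SpaceTokenizer())

# Plain callable returning List[Token] (deprecated): wrapped in a TokenizerWrapper by the callable branch.
# whole_text_tokenizer is a hypothetical helper, mirroring custom_tokenizer in Example #4 below.
def whole_text_tokenizer(text):
    return [Token(text, 0)]

single_token_sentence = Sentence("I love Berlin.", use_tokenizer=whole_text_tokenizer)

# Deprecated bool form: True selects SegtokTokenizer, False selects SpaceTokenizer.
deprecated_sentence = Sentence("I love Berlin.", use_tokenizer=False)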
Example #2
def test_newline_sentence_splitter():
    newline_splitter = NewlineSentenceSplitter()

    sentences = newline_splitter.split("I love Berlin\nMe too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3
    assert sentences[1].start_pos == 14
    assert len(sentences[1].tokens) == 2

    newline_splitter = NewlineSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
    sentences = newline_splitter.split("I love Berlin\nMe too")
    assert len(sentences) == 2
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 14
    assert len(sentences[1].tokens) == 1

    sentences = newline_splitter.split("I love Berlin Me too")
    assert len(sentences) == 1

    sentences = newline_splitter.split("I love Berlin\n\nMe too")
    assert len(sentences) == 2

    sentences = newline_splitter.split("I love Berlin\n \nMe too")
    assert len(sentences) == 2
Example #3
def test_tag_sentence_splitter():
    tag_splitter = TagSentenceSplitter(tag="#!")

    sentences = tag_splitter.split("I love Berlin#!Me too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 2

    tag_splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))
    sentences = tag_splitter.split("I love Berlin#!Me too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 1

    sentences = tag_splitter.split("I love Berlin Me too")
    assert len(sentences) == 1

    sentences = tag_splitter.split("I love Berlin#!#!Me too")
    assert len(sentences) == 2

    sentences = tag_splitter.split("I love Berl#! #!inMe too")
    assert len(sentences) == 2
Example #4
def test_create_sentence_with_custom_tokenizer():
    def custom_tokenizer(text: str) -> List[Token]:
        return [Token(text, 0)]

    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=TokenizerWrapper(custom_tokenizer))
    assert 1 == len(sentence.tokens)
    assert "I love Berlin." == sentence.tokens[0].text
Example #5
def test_split_text_spacy():
    spacy_splitter = SpacySentenceSplitter("en_core_sci_sm")

    sentences = spacy_splitter.split("This a sentence. "
                                     "And here is another one.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 4
    assert sentences[1].start_pos == 17
    assert len(sentences[1].tokens) == 6

    sentences = spacy_splitter.split(
        "VF inhibits something. ACE-dependent (GH+) issuses too.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 4
    assert sentences[1].start_pos == 23
    assert len(sentences[1].tokens) == 7

    spacy_splitter = SpacySentenceSplitter(
        "en_core_sci_sm", tokenizer=TokenizerWrapper(no_op_tokenizer))
    sentences = spacy_splitter.split("This a sentence. "
                                     "And here is another one.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 17
    assert len(sentences[1].tokens) == 1
Example #6
def test_no_sentence_splitter():
    no_splitter = NoSentenceSplitter()
    sentences = no_splitter.split("I love Berlin")
    assert len(sentences) == 1
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3

    no_splitter = NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer))
    sentences = no_splitter.split("I love Berlin")
    assert len(sentences) == 1
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
Example #7
def test_segtok_sentence_splitter():
    segtok_splitter = SegtokSentenceSplitter()
    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 4
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 6

    segtok_splitter = SegtokSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 1
Example #8
def test_conll_writer_whitespace_after():
    text = f"A sentence with cardio-dependent. {SENTENCE_TAG}Clark et al. reported that"
    dataset = InternalBioNerDataset(
        documents={"1": text},
        entities_per_document={"1": []},
    )

    assert_conll_writer_output(
        dataset, [
            "A O +",
            "sentence O +",
            "with O +",
            "cardio O -",
            "dependent. O +",
            "Clark O +",
            "et O +",
            "al. O +",
            "reported O +",
            "that O -",
        ],
        TagSentenceSplitter(tag=SENTENCE_TAG,
                            tokenizer=TokenizerWrapper(simple_tokenizer)))
Example #9
def test_create_sentence_with_custom_tokenizer():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=TokenizerWrapper(no_op_tokenizer))
    assert 1 == len(sentence.tokens)
    assert 0 == sentence.tokens[0].start_pos
    assert "I love Berlin." == sentence.tokens[0].text