Example #1
    def _tokenize(text: str, label: List[str]) -> List[str]:
        tokens = SpaceTokenizer().run_tokenize(text)
        token_texts = [token.text for token in tokens]

        assert len(token_texts) == len(label), \
            "Tokenization does not match with available labels"
        return token_texts
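
A minimal, hypothetical call of the helper above; the sentence and the label list are invented for illustration, and the example only relies on what the snippet itself uses (run_tokenize returning tokens with a .text attribute):

token_texts = _tokenize("Berlin is great", ["B-LOC", "O", "O"])
# token_texts == ["Berlin", "is", "great"]; the assert guarantees one label per whitespace-separated token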
Example #2
    def __init__(
        self,
        text: Union[str, List[str]] = None,
        use_tokenizer: Union[bool, Tokenizer] = True,
        language_code: str = None,
        start_position: int = None
    ):
        """
        Class to hold all meta related to a text (tokens, predictions, language code, ...)
        :param text: original string (sentence), or a list of string tokens (words)
        :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`)
            more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer`
            to use Spacy library if available). Check the implementations of abstract class Tokenizer or
            implement your own subclass (if you need it). If instead of providing a Tokenizer, this parameter
            is just set to True (deprecated), :class:`SegtokTokenizer` will be used.
        :param language_code: Language of the sentence
        :param start_position: Start char offset of the sentence in the superordinate document
        """
        super().__init__()

        self.tokens: List[Token] = []

        self._embeddings: Dict = {}

        self.language_code: str = language_code

        self.start_pos = start_position
        self.end_pos = (
            start_position + len(text) if start_position is not None else None
        )

        if isinstance(use_tokenizer, Tokenizer):
            tokenizer = use_tokenizer
        elif hasattr(use_tokenizer, "__call__"):
            from flair.tokenization import TokenizerWrapper
            tokenizer = TokenizerWrapper(use_tokenizer)
        elif type(use_tokenizer) == bool:
            from flair.tokenization import SegtokTokenizer, SpaceTokenizer
            tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
        else:
            raise AssertionError("Unexpected type of parameter 'use_tokenizer'. " +
                                 "Parameter should be bool, Callable[[str], List[Token]] (deprecated), Tokenizer")

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:
            if isinstance(text, (list, tuple)):
                for token in text:
                    self.add_token(self._restore_windows_1252_characters(token))
            else:
                text = self._restore_windows_1252_characters(text)
                for token in tokenizer.tokenize(text):
                    self.add_token(token)

        # log a warning if this Sentence was created from an empty string
        if text == "":
            log.warning(
                "Warning: An empty Sentence was created! Are there empty strings in your dataset?"
            )

        self.tokenized = None
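
The constructor above accepts a Tokenizer instance, a plain callable (deprecated), or a bool. A minimal sketch of these input styles, assuming a flair version in which Sentence and the tokenizers are importable as shown in the snippet; the sentences are invented:

from flair.data import Sentence
from flair.tokenization import SegtokTokenizer, SpaceTokenizer

# explicit Tokenizer instance
s1 = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())

# boolean shortcut: True selects SegtokTokenizer, False selects SpaceTokenizer
s2 = Sentence("I love Berlin.", use_tokenizer=True)

# pre-tokenized input: a list of token strings bypasses the tokenizer entirely
s3 = Sentence(["I", "love", "Berlin", "."])

print(len(s1.tokens), len(s2.tokens), len(s3.tokens))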
Example #3
def test_token_positions_when_creating_with_tokenizer():
    sentence = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())

    assert 0 == sentence.tokens[0].start_position
    assert 1 == sentence.tokens[0].end_position
    assert 2 == sentence.tokens[1].start_position
    assert 6 == sentence.tokens[1].end_position
    assert 7 == sentence.tokens[2].start_position
    assert 13 == sentence.tokens[2].end_position

    sentence = Sentence(" I love  Berlin.", use_tokenizer=SegtokTokenizer())

    assert 1 == sentence.tokens[0].start_position
    assert 2 == sentence.tokens[0].end_position
    assert 3 == sentence.tokens[1].start_position
    assert 7 == sentence.tokens[1].end_position
    assert 9 == sentence.tokens[2].start_position
    assert 15 == sentence.tokens[2].end_position
Example #4
    def __init__(
        self,
        texts: Union[str, List[str]],
        use_tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
    ):
        """
        Instantiate StringDataset.
        :param texts: a string, or a list of strings, that make up the StringDataset
        :param use_tokenizer: custom tokenizer to use (default is SpaceTokenizer; more advanced options are
            SegtokTokenizer to use segtok, or SpacyTokenizer to use spaCy library models if available).
            Check the code of subclasses of Tokenizer to implement your own (if you need it). If, instead
            of providing a tokenizer, this parameter is just set to True, SegtokTokenizer will be used.
        """
        # cast to list if necessary
        if type(texts) == Sentence:
            texts = [texts]
        self.texts = texts
        self.use_tokenizer = use_tokenizer
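
A small construction sketch for the class above, assuming it is flair's StringDataset and that the import paths below match the installed flair version; the texts are invented:

from flair.datasets import StringDataset
from flair.tokenization import SpaceTokenizer

# __init__ only stores the texts and the tokenizer; the rest of the class
# (not shown here) is expected to apply the tokenizer when items are accessed
dataset = StringDataset(["I love Berlin .", "Paris is nice ."], use_tokenizer=SpaceTokenizer())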
Example #5
def assert_conll_writer_output(
    dataset: InternalBioNerDataset,
    expected_output: List[str],
    sentence_splitter: SentenceSplitter = None,
):
    fd, outfile_path = tempfile.mkstemp()
    try:
        if sentence_splitter is None:
            sentence_splitter = NoSentenceSplitter(tokenizer=SpaceTokenizer())

        writer = CoNLLWriter(sentence_splitter=sentence_splitter)
        writer.write_to_conll(dataset, Path(outfile_path))
        with open(outfile_path) as f:
            contents = [line.strip() for line in f.readlines() if line.strip()]
    finally:
        os.close(fd)
        os.remove(outfile_path)

    assert contents == expected_output
Example #6
    def predict(self, sentence: str, model_path: str = ''):
        """
        Predict the preposition sense of a sentence

        :param sentence: sentence to predict
        :param model_path: path to the model
        :return: sense id of the predicted preposition
        """

        # (Try to) load classifier if none has yet been loaded
        if self.__classifier is None:
            self._load_classifier(model_path)
            if self.__classifier is None:
                raise ValueError(
                    'Unable to load a classifier. Prediction not possible')

        # Tokenize sentence with space tokenizer
        sentence = Sentence(sentence, SpaceTokenizer())
        self.__classifier.predict(sentences=sentence,
                                  mini_batch_size=self.__mini_batch_size,
                                  verbose=self.__verbose)

        # Return sense id (number only)
        return str(sentence.labels).split(" ")[0].split("__")[2]
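
A hypothetical call of the method above; predictor stands for an instance of the surrounding (unnamed) classifier wrapper, and the model path is invented:

sense_id = predictor.predict("He sat on the chair .", model_path="resources/prep-sense-model")
print(sense_id)  # numeric sense id parsed from the predicted label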