Example #1
def test_token_indices():
    text = ":    nation on"
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = ":    nation on"
    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
    assert text == sentence.to_original_text()

    text = "I love Berlin."
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
    assert text == sentence.to_original_text()
Example #2
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.", use_tokenizer=SegtokTokenizer())
    dev_sentence = Sentence("I'm a dev sentence.", use_tokenizer=SegtokTokenizer())
    test_sentence = Sentence("I will be only used for testing.", use_tokenizer=SegtokTokenizer())

    corpus: Corpus = Corpus([train_sentence], [dev_sentence], [test_sentence])

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
Example #3
def test_tagged_corpus_get_tag_statistic():
    train_sentence = Sentence("Zalando Research is located in Berlin .")
    train_sentence[0].add_tag("ner", "B-ORG")
    train_sentence[1].add_tag("ner", "E-ORG")
    train_sentence[5].add_tag("ner", "S-LOC")

    dev_sentence = Sentence(
        "Facebook, Inc. is a company, and Google is one as well.",
        use_tokenizer=SegtokTokenizer(),
    )
    dev_sentence[0].add_tag("ner", "B-ORG")
    dev_sentence[1].add_tag("ner", "I-ORG")
    dev_sentence[2].add_tag("ner", "E-ORG")
    dev_sentence[8].add_tag("ner", "S-ORG")

    test_sentence = Sentence("Nothing to do with companies.")

    tag_to_count_dict = Corpus._count_token_labels(
        [train_sentence, dev_sentence, test_sentence], "ner"
    )

    assert 1 == tag_to_count_dict["S-ORG"]
    assert 1 == tag_to_count_dict["S-LOC"]
    assert 2 == tag_to_count_dict["B-ORG"]
    assert 2 == tag_to_count_dict["E-ORG"]
    assert 1 == tag_to_count_dict["I-ORG"]
Example #4
def test_sentence_to_real_string(tasks_base_path):
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())
    assert "I love Berlin." == sentence.to_plain_string()

    corpus = flair.datasets.GERMEVAL_14(base_path=tasks_base_path)

    sentence = corpus.train[0]
    sentence.infer_space_after()
    assert (
            'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .'
            == sentence.to_tokenized_string()
    )
    assert (
            'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".'
            == sentence.to_plain_string()
    )

    sentence = corpus.train[1]
    sentence.infer_space_after()
    assert (
            "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf ."
            == sentence.to_tokenized_string()
    )
    assert (
            "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf."
            == sentence.to_plain_string()
    )
Example #5
    def __init__(
        self,
        text: Union[str, List[str]] = None,
        use_tokenizer: Union[bool, Tokenizer] = True,
        language_code: str = None,
        start_position: int = None
    ):
        """
        Class to hold all meta related to a text (tokens, predictions, language code, ...)
        :param text: original string (sentence), or a list of string tokens (words)
        :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`)
            more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer`
            to use Spacy library if available). Check the implementations of abstract class Tokenizer or
            implement your own subclass (if you need it). If instead of providing a Tokenizer, this parameter
            is just set to True (deprecated), :class:`SegtokTokenizer` will be used.
        :param language_code: Language of the sentence
        :param start_position: Start char offset of the sentence in the superordinate document
        """
        super().__init__()

        self.tokens: List[Token] = []

        self._embeddings: Dict = {}

        self.language_code: str = language_code

        self.start_pos = start_position
        self.end_pos = (
            start_position + len(text) if start_position is not None else None
        )

        if isinstance(use_tokenizer, Tokenizer):
            tokenizer = use_tokenizer
        elif hasattr(use_tokenizer, "__call__"):
            from flair.tokenization import TokenizerWrapper
            tokenizer = TokenizerWrapper(use_tokenizer)
        elif isinstance(use_tokenizer, bool):
            from flair.tokenization import SegtokTokenizer, SpaceTokenizer
            tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
        else:
            raise AssertionError("Unexpected type of parameter 'use_tokenizer'. " +
                                 "Parameter should be bool, Callable[[str], List[Token]] (deprecated), Tokenizer")

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:
            if isinstance(text, (list, tuple)):
                for token in text:
                    self.add_token(self._restore_windows_1252_characters(token))
            else:
                text = self._restore_windows_1252_characters(text)
                for token in tokenizer.tokenize(text):
                    self.add_token(token)

        # log a warning if an empty sentence was created
        if text == "":
            log.warning(
                "Warning: An empty Sentence was created! Are there empty strings in your dataset?"
            )

        self.tokenized = None
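
The constructor above accepts several forms of use_tokenizer as well as pre-tokenized input. Below is a minimal sketch of the documented call patterns; the import paths follow recent flair releases and may differ in older versions:

from flair.data import Sentence
from flair.tokenization import SegtokTokenizer, SpaceTokenizer

# default: use_tokenizer=True falls back to SegtokTokenizer (deprecated shortcut)
s1 = Sentence("I love Berlin.")

# explicit Tokenizer instance
s2 = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

# whitespace-only splitting (equivalent to use_tokenizer=False)
s3 = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())

# pre-tokenized input: a list of token strings, no tokenizer is applied
s4 = Sentence(["I", "love", "Berlin", "."])

print(len(s1.tokens), len(s2.tokens), len(s3.tokens), len(s4.tokens))  # 4 4 4 4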
Example #6
def test_sentence_get_item():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

    assert sentence.get_token(1) == sentence[0]
    assert sentence.get_token(3) == sentence[2]

    with pytest.raises(IndexError):
        token = sentence[4]
Example #7
def test_create_sentence_with_segtoktokenizer():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

    assert 4 == len(sentence.tokens)
    assert "I" == sentence.tokens[0].text
    assert "love" == sentence.tokens[1].text
    assert "Berlin" == sentence.tokens[2].text
    assert "." == sentence.tokens[3].text
Example #8
    def _predict(self, sentences, tagger):
        tokenizer = SegtokTokenizer()
        dataset = SentenceDataset(
            [Sentence(text, use_tokenizer=tokenizer) for text in sentences])
        tagger.predict(dataset,
                       mini_batch_size=self.mini_batch_size,
                       embedding_storage_mode=self.embedding_storage_mode,
                       verbose=self.verbose)
        return list(dataset)
Example #9
def correct_who_to_whom(text):

    doc = nlp(text)

    tokenized_text = []

    phrases = []

    token_number = 0

    for token in doc:

        tokenized_text.append(token.text_with_ws)

        token_number += 1

        # it is very difficult for the named entity recognizer to recognize 'Who'
        # in isolation - the motivating text was a repeated exclamation of
        # 'Who! Who!' in a 'The Grinch' fan fiction.
        if token.text.lower() in ['grinch', 'whoville', 'scooby', 'horton']:
            return

        if token.text.lower() == 'who':
            if token.dep_ in ['dobj', 'iobj', 'pobj']:

                # check for the hard-coded exceptions
                if not check_for_exceptions(doc, token):
                    should_be_whom = True

                    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
                    tagger.predict(sentence)

                    # make sure it is not part of a named entity
                    for entity in sentence.get_spans('ner'):
                        if token.idx >= entity.start_pos and token.idx <= entity.end_pos:
                            should_be_whom = False

                    if should_be_whom:

                        # detokenizes the corrected excerpt (e.g. removes added space
                        # between the last word in the sentence and punctuation, rejoins
                        # don and 't to form don't, etc., only if such joins were
                        # present in the original text)
                        tokenized_text[token.i] = whom_string(
                            token.text_with_ws, True)

    # prints the text with corrections made (corrections surrounded by asterisks)
    corrected_text = ''.join(tokenized_text)
    print(corrected_text)
Example #10
def test_token_position_in_sentence():
    sentence = Sentence("I love Berlin .")

    assert 0 == sentence.tokens[0].start_position
    assert 1 == sentence.tokens[0].end_position
    assert 2 == sentence.tokens[1].start_position
    assert 6 == sentence.tokens[1].end_position
    assert 7 == sentence.tokens[2].start_position
    assert 13 == sentence.tokens[2].end_position

    sentence = Sentence(" I love  Berlin.", use_tokenizer=SegtokTokenizer())

    assert 1 == sentence.tokens[0].start_position
    assert 2 == sentence.tokens[0].end_position
    assert 3 == sentence.tokens[1].start_position
    assert 7 == sentence.tokens[1].end_position
    assert 9 == sentence.tokens[2].start_position
    assert 15 == sentence.tokens[2].end_position
Example #11
def test_token_positions_when_creating_with_tokenizer():
    sentence = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())

    assert 0 == sentence.tokens[0].start_position
    assert 1 == sentence.tokens[0].end_position
    assert 2 == sentence.tokens[1].start_position
    assert 6 == sentence.tokens[1].end_position
    assert 7 == sentence.tokens[2].start_position
    assert 13 == sentence.tokens[2].end_position

    sentence = Sentence(" I love  Berlin.", use_tokenizer=SegtokTokenizer())

    assert 1 == sentence.tokens[0].start_position
    assert 2 == sentence.tokens[0].end_position
    assert 3 == sentence.tokens[1].start_position
    assert 7 == sentence.tokens[1].end_position
    assert 9 == sentence.tokens[2].start_position
    assert 15 == sentence.tokens[2].end_position
Example #12
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence("used in training. training is cool.", use_tokenizer=SegtokTokenizer())

    corpus: Corpus = Corpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()
Example #13
    def __init__(
            self,
            query: str,
            host: str,
            port: int,
            database: str,
            collection: str,
            text_field: str,
            categories_field: List[str] = None,
            max_tokens_per_doc: int = -1,
            max_chars_per_doc: int = -1,
            tokenizer: Tokenizer = SegtokTokenizer(),
            in_memory: bool = True,
    ):
        """
        Reads Mongo collections. Each collection should contain one document/text per item.

        Each item should have the following format:
        {
        'Beskrivning': 'Abrahamsby. Gård i Gottröra sn, Långhundra hd, Stockholms län, nära Långsjön.',
        'Län':'Stockholms län',
        'Härad': 'Långhundra',
        'Församling': 'Gottröra',
        'Plats': 'Abrahamsby'
        }

        :param query: Query, e.g. {'Län': 'Stockholms län'}
        :param host: Host, e.g. 'localhost',
        :param port: Port, e.g. 27017
        :param database: Database, e.g. 'rosenberg',
        :param collection: Collection, e.g. 'book',
        :param text_field: Text field, e.g. 'Beskrivning',
        :param categories_field: List of category fields, e.g ['Län', 'Härad', 'Tingslag', 'Församling', 'Plats'],
        :param max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is.
        :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
        :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
        :param tokenizer: Custom tokenizer to use (default SegtokTokenizer)
        :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
        :return: list of sentences
        """

        # first, check if pymongo is installed
        try:
            import pymongo
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "pymongo" is not installed!')
            log.warning(
                'To use MongoDataset, please first install with "pip install pymongo"'
            )
            log.warning("-" * 100)
            pass

        self.in_memory = in_memory
        self.tokenizer = tokenizer

        if self.in_memory:
            self.sentences = []
        else:
            self.indices = []

        self.total_sentence_count: int = 0
        self.max_chars_per_doc = max_chars_per_doc
        self.max_tokens_per_doc = max_tokens_per_doc

        self.__connection = pymongo.MongoClient(host, port)
        self.__cursor = self.__connection[database][collection]

        self.text = text_field
        self.categories = categories_field if categories_field is not None else []

        start = 0

        kwargs = lambda start: {"filter": query, "skip": start, "limit": 0}

        if self.in_memory:
            for document in self.__cursor.find(**kwargs(start)):
                sentence = self._parse_document_to_sentence(
                    document[self.text],
                    [
                        document[_] if _ in document else ""
                        for _ in self.categories
                    ],
                    tokenizer,
                )
                if sentence is not None and len(sentence.tokens) > 0:
                    self.sentences.append(sentence)
                    self.total_sentence_count += 1
        else:
            self.indices = self.__cursor.find().distinct("_id")
            self.total_sentence_count = self.__cursor.count_documents(query)
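
A minimal instantiation sketch using the parameter values shown in the docstring above; the class name MongoDataset is taken from the warning message, and the flair.datasets import path is an assumption that may vary between flair versions:

from flair.tokenization import SegtokTokenizer
from flair.datasets import MongoDataset  # assumed import path; adjust to your flair version

dataset = MongoDataset(
    query={"Län": "Stockholms län"},   # passed straight to pymongo's find()/count_documents()
    host="localhost",
    port=27017,
    database="rosenberg",
    collection="book",
    text_field="Beskrivning",
    categories_field=["Län", "Härad", "Församling", "Plats"],
    tokenizer=SegtokTokenizer(),
    in_memory=True,
)
print(dataset.total_sentence_count)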
Example #14
def test_sentence_to_plain_string():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

    assert "I love Berlin ." == sentence.to_tokenized_string()
Example #15
def segtok_tokenizer(text: str) -> List[Token]:
    # We don't want to create a SegtokTokenizer object each time this function is called,
    # so delegate the call directly to the static run_tokenize method
    from flair.tokenization import SegtokTokenizer
    return SegtokTokenizer.run_tokenize(text)
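
A short usage sketch for the helper above; per its signature it returns flair Token objects, whose .text attribute is used in the tests earlier on this page:

tokens = segtok_tokenizer("I love Berlin.")
print([token.text for token in tokens])  # ['I', 'love', 'Berlin', '.']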
Example #16
File: rgb.py Project: 96jonesa/rgb
def correct_who_to_whom(text):

    doc = nlp(text)

    phrases = []

    token_number = 0

    for token in doc:

        token_number += 1

        # it is very difficult for the named entity recognizer to recognize 'Who'
        # in isolation - the motivating text was a repeated exclamation of
        # 'Who! Who!' in a 'The Grinch' fan fiction.
        if token.text.lower() in ['grinch', 'whoville']:
            return

        if token.text.lower() == 'who':
            if token.dep_ in ['dobj', 'iobj', 'pobj']:

                # check for the hard-coded exceptions
                if not check_for_exceptions(doc, token):
                    should_be_whom = True

                    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
                    tagger.predict(sentence)

                    # make sure it is not part of a named entity
                    for entity in sentence.get_spans('ner'):
                        if token.idx >= entity.start_pos and token.idx <= entity.end_pos:
                            should_be_whom = False

                    if should_be_whom:

                        phrase_start, phrase_end, whom_first = i_span(doc, token)

                        if whom_first:
                            # detokenizes the corrected excerpt (e.g. removes added space
                            # between the last word in the sentence and punctuation, rejoins
                            # don and 't to form don't, etc., only if such joins were
                            # present in the original text)
                            phrase = whom_string(token.text) + (''.join([tkn.text_with_ws for tkn in doc[phrase_start:phrase_end + 1]]))[3:]

                            phrases.append(phrase)
                        else:
                            # detokenizes the corrected excerpt (e.g. removes added space
                            # between the last word in the sentence and punctuation, rejoins
                            # don and 't to form don't, etc., only if such joins were
                            # present in the original text)
                            phrase = ''.join([tkn.text_with_ws for tkn in doc[phrase_start:phrase_end]]) + whom_string(token.text)

                            phrases.append(phrase)

    # if any corrections were found, then print the original text and the corrections.
    if phrases:
        joined_phrases = '\n\n'.join(phrases)

        print('<<<  TEXT  >>>')
        print(text)
        print('<<<  CORRECTIONS  >>>')
        print(joined_phrases)
        print()
        print()