def test_token_indices():
    text = ": nation on"
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = ": nation on"
    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
    assert text == sentence.to_original_text()

    text = "I love Berlin."
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
    assert text == sentence.to_original_text()
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.", use_tokenizer=SegtokTokenizer())
    dev_sentence = Sentence("I'm a dev sentence.", use_tokenizer=SegtokTokenizer())
    test_sentence = Sentence("I will be only used for testing.", use_tokenizer=SegtokTokenizer())

    corpus: Corpus = Corpus([train_sentence], [dev_sentence], [test_sentence])

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
def test_tagged_corpus_get_tag_statistic():
    train_sentence = Sentence("Zalando Research is located in Berlin .")
    train_sentence[0].add_tag("ner", "B-ORG")
    train_sentence[1].add_tag("ner", "E-ORG")
    train_sentence[5].add_tag("ner", "S-LOC")

    dev_sentence = Sentence(
        "Facebook, Inc. is a company, and Google is one as well.",
        use_tokenizer=SegtokTokenizer(),
    )
    dev_sentence[0].add_tag("ner", "B-ORG")
    dev_sentence[1].add_tag("ner", "I-ORG")
    dev_sentence[2].add_tag("ner", "E-ORG")
    dev_sentence[8].add_tag("ner", "S-ORG")

    test_sentence = Sentence("Nothing to do with companies.")

    tag_to_count_dict = Corpus._count_token_labels(
        [train_sentence, dev_sentence, test_sentence], "ner"
    )

    assert 1 == tag_to_count_dict["S-ORG"]
    assert 1 == tag_to_count_dict["S-LOC"]
    assert 2 == tag_to_count_dict["B-ORG"]
    assert 2 == tag_to_count_dict["E-ORG"]
    assert 1 == tag_to_count_dict["I-ORG"]
def test_sentence_to_real_string(tasks_base_path):
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())
    assert "I love Berlin." == sentence.to_plain_string()

    corpus = flair.datasets.GERMEVAL_14(base_path=tasks_base_path)

    sentence = corpus.train[0]
    sentence.infer_space_after()
    assert (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .'
        == sentence.to_tokenized_string()
    )
    assert (
        'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".'
        == sentence.to_plain_string()
    )

    sentence = corpus.train[1]
    sentence.infer_space_after()
    assert (
        "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf ."
        == sentence.to_tokenized_string()
    )
    assert (
        "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf."
        == sentence.to_plain_string()
    )
def __init__(
    self,
    text: Union[str, List[str]] = None,
    use_tokenizer: Union[bool, Tokenizer] = True,
    language_code: str = None,
    start_position: int = None
):
    """
    Class to hold all metadata related to a text (tokens, predictions, language code, ...)
    :param text: original string (sentence), or a list of string tokens (words)
    :param use_tokenizer: a custom tokenizer (default is :class:`SegtokTokenizer`).
        More advanced options are :class:`SpacyTokenizer` to use the Spacy library
        (if available), or :class:`SpaceTokenizer` for plain whitespace splitting.
        Check the implementations of the abstract class Tokenizer or implement your
        own subclass if you need one. If instead of providing a Tokenizer this
        parameter is just set to True (deprecated), :class:`SegtokTokenizer` will be
        used; if set to False, :class:`SpaceTokenizer` will be used.
    :param language_code: Language of the sentence
    :param start_position: Start char offset of the sentence in the superordinate document
    """
    super().__init__()

    self.tokens: List[Token] = []
    self._embeddings: Dict = {}
    self.language_code: str = language_code

    self.start_pos = start_position
    self.end_pos = (
        start_position + len(text) if start_position is not None else None
    )

    # determine which tokenizer to use
    if isinstance(use_tokenizer, Tokenizer):
        tokenizer = use_tokenizer
    elif hasattr(use_tokenizer, "__call__"):
        from flair.tokenization import TokenizerWrapper
        tokenizer = TokenizerWrapper(use_tokenizer)
    elif type(use_tokenizer) == bool:
        from flair.tokenization import SegtokTokenizer, SpaceTokenizer
        tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
    else:
        raise AssertionError(
            "Unexpected type of parameter 'use_tokenizer'. "
            + "Parameter should be bool, Callable[[str], List[Token]] (deprecated), Tokenizer"
        )

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        if isinstance(text, (list, tuple)):
            [self.add_token(self._restore_windows_1252_characters(token)) for token in text]
        else:
            text = self._restore_windows_1252_characters(text)
            [self.add_token(token) for token in tokenizer.tokenize(text)]

    # log a warning if an empty string was passed
    if text == "":
        log.warning(
            "Warning: An empty Sentence was created! Are there empty strings in your dataset?"
        )

    self.tokenized = None
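# A minimal usage sketch (not part of the constructor above) illustrating the three
# accepted forms of 'use_tokenizer' that the dispatch logic handles; assumes flair is
# installed and that Sentence, SegtokTokenizer and SpaceTokenizer are importable as in
# the other snippets here.
from flair.data import Sentence
from flair.tokenization import SegtokTokenizer, SpaceTokenizer

s1 = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())  # explicit Tokenizer instance
s2 = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())  # plain whitespace splitting
s3 = Sentence("I love Berlin.", use_tokenizer=True)               # deprecated bool form, falls back to SegtokTokenizer
print([token.text for token in s1])  # ['I', 'love', 'Berlin', '.']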
def test_sentence_get_item():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

    assert sentence.get_token(1) == sentence[0]
    assert sentence.get_token(3) == sentence[2]

    with pytest.raises(IndexError):
        token = sentence[4]
def test_create_sentence_with_segtoktokenizer():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

    assert 4 == len(sentence.tokens)
    assert "I" == sentence.tokens[0].text
    assert "love" == sentence.tokens[1].text
    assert "Berlin" == sentence.tokens[2].text
    assert "." == sentence.tokens[3].text
def _predict(self, sentences, tagger):
    tokenizer = SegtokTokenizer()
    dataset = SentenceDataset([Sentence(text, tokenizer) for text in sentences])
    tagger.predict(dataset,
                   mini_batch_size=self.mini_batch_size,
                   embedding_storage_mode=self.embedding_storage_mode,
                   verbose=self.verbose)
    return [sentence for sentence in dataset]
def correct_who_to_whom(text):
    doc = nlp(text)
    tokenized_text = []
    phrases = []
    token_number = 0
    for token in doc:
        tokenized_text.append(token.text_with_ws)
        token_number += 1
        # it is very difficult for the named entity recognizer to recognize 'Who'
        # in isolation - the motivating text was the repeated exclamation of
        # 'Who! Who!' in a Grinch fan fiction.
        if token.text.lower() in ['grinch', 'whoville', 'scooby', 'horton']:
            return
        if token.text.lower() == 'who':
            if token.dep_ in ['dobj', 'iobj', 'pobj']:
                # check for the hard-coded exceptions
                if not check_for_exceptions(doc, token):
                    should_be_whom = True
                    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
                    tagger.predict(sentence)
                    # make sure it is not part of a named entity
                    for entity in sentence.get_spans('ner'):
                        if token.idx >= entity.start_pos and token.idx <= entity.end_pos:
                            should_be_whom = False
                    if should_be_whom:
                        # detokenizes the corrected excerpt (e.g. removes the added space
                        # between the last word in a sentence and punctuation, rejoins
                        # "don" and "'t" to form "don't", etc., only if such joins were
                        # present in the original text)
                        tokenized_text[token.i] = whom_string(
                            token.text_with_ws, True)
    # prints the text with corrections made (corrections surrounded by asterisks)
    corrected_text = ''.join([tkn for tkn in tokenized_text])
    print(corrected_text)
def test_token_position_in_sentence():
    sentence = Sentence("I love Berlin .")

    assert 0 == sentence.tokens[0].start_position
    assert 1 == sentence.tokens[0].end_position
    assert 2 == sentence.tokens[1].start_position
    assert 6 == sentence.tokens[1].end_position
    assert 7 == sentence.tokens[2].start_position
    assert 13 == sentence.tokens[2].end_position

    # note the leading space and the double space before "Berlin", which the
    # asserted character offsets below require
    sentence = Sentence(" I love  Berlin.", use_tokenizer=SegtokTokenizer())

    assert 1 == sentence.tokens[0].start_position
    assert 2 == sentence.tokens[0].end_position
    assert 3 == sentence.tokens[1].start_position
    assert 7 == sentence.tokens[1].end_position
    assert 9 == sentence.tokens[2].start_position
    assert 15 == sentence.tokens[2].end_position
def test_token_positions_when_creating_with_tokenizer():
    sentence = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())

    assert 0 == sentence.tokens[0].start_position
    assert 1 == sentence.tokens[0].end_position
    assert 2 == sentence.tokens[1].start_position
    assert 6 == sentence.tokens[1].end_position
    assert 7 == sentence.tokens[2].start_position
    assert 13 == sentence.tokens[2].end_position

    # note the leading space and the double space before "Berlin", which the
    # asserted character offsets below require
    sentence = Sentence(" I love  Berlin.", use_tokenizer=SegtokTokenizer())

    assert 1 == sentence.tokens[0].start_position
    assert 2 == sentence.tokens[0].end_position
    assert 3 == sentence.tokens[1].start_position
    assert 7 == sentence.tokens[1].end_position
    assert 9 == sentence.tokens[2].start_position
    assert 15 == sentence.tokens[2].end_position
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence("used in training. training is cool.", use_tokenizer=SegtokTokenizer())

    corpus: Corpus = Corpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()
def __init__(
    self,
    query: str,
    host: str,
    port: int,
    database: str,
    collection: str,
    text_field: str,
    categories_field: List[str] = None,
    max_tokens_per_doc: int = -1,
    max_chars_per_doc: int = -1,
    tokenizer: Tokenizer = SegtokTokenizer(),
    in_memory: bool = True,
):
    """
    Reads Mongo collections. Each collection should contain one document/text per item.

    Each item should have the following format:
    {
        'Beskrivning': 'Abrahamsby. Gård i Gottröra sn, Långhundra hd, Stockholms län, nära Långsjön.',
        'Län': 'Stockholms län',
        'Härad': 'Långhundra',
        'Församling': 'Gottröra',
        'Plats': 'Abrahamsby'
    }

    :param query: Query, e.g. {'Län': 'Stockholms län'}
    :param host: Host, e.g. 'localhost'
    :param port: Port, e.g. 27017
    :param database: Database, e.g. 'rosenberg'
    :param collection: Collection, e.g. 'book'
    :param text_field: Text field, e.g. 'Beskrivning'
    :param categories_field: List of category fields, e.g. ['Län', 'Härad', 'Tingslag', 'Församling', 'Plats']
    :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens; if set to -1, all documents are taken as is
    :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
    :param tokenizer: Custom tokenizer to use (default SegtokTokenizer)
    :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
    :return: list of sentences
    """
    # first, check if pymongo is installed
    try:
        import pymongo
    except ModuleNotFoundError:
        log.warning("-" * 100)
        log.warning('ATTENTION! The library "pymongo" is not installed!')
        log.warning(
            'To use MongoDataset, please first install with "pip install pymongo"'
        )
        log.warning("-" * 100)
        pass

    self.in_memory = in_memory
    self.tokenizer = tokenizer

    if self.in_memory:
        self.sentences = []
    else:
        self.indices = []

    self.total_sentence_count: int = 0
    self.max_chars_per_doc = max_chars_per_doc
    self.max_tokens_per_doc = max_tokens_per_doc

    self.__connection = pymongo.MongoClient(host, port)
    self.__cursor = self.__connection[database][collection]

    self.text = text_field
    self.categories = categories_field if categories_field is not None else []

    start = 0

    kwargs = lambda start: {"filter": query, "skip": start, "limit": 0}

    if self.in_memory:
        for document in self.__cursor.find(**kwargs(start)):
            sentence = self._parse_document_to_sentence(
                document[self.text],
                [
                    document[_] if _ in document else ""
                    for _ in self.categories
                ],
                tokenizer,
            )
            if sentence is not None and len(sentence.tokens) > 0:
                self.sentences.append(sentence)
                self.total_sentence_count += 1
    else:
        self.indices = self.__cursor.find().distinct("_id")
        self.total_sentence_count = self.__cursor.count_documents()
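# A hypothetical instantiation sketch for the constructor above (the warning text suggests
# it belongs to flair's MongoDataset); it assumes a MongoDB instance is reachable on
# localhost and that a database/collection with the fields from the docstring example exists.
dataset = MongoDataset(
    query={"Län": "Stockholms län"},
    host="localhost",
    port=27017,
    database="rosenberg",
    collection="book",
    text_field="Beskrivning",
    categories_field=["Län", "Härad", "Församling", "Plats"],
    tokenizer=SegtokTokenizer(),
    in_memory=True,
)
print(dataset.total_sentence_count)  # number of Sentences parsed into memory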
def test_sentence_to_plain_string():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

    assert "I love Berlin ." == sentence.to_tokenized_string()
def segtok_tokenizer(text: str) -> List[Token]:
    # We don't want to create a SegtokTokenizer object each time this function is called,
    # so delegate the call directly to the static run_tokenize method
    from flair.tokenization import SegtokTokenizer
    return SegtokTokenizer.run_tokenize(text)
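# A small usage sketch for the helper above; assumes flair is installed. The expected
# token split mirrors the SegtokTokenizer-based tests in the other snippets.
from typing import List
from flair.data import Token

tokens: List[Token] = segtok_tokenizer("I love Berlin.")
print([token.text for token in tokens])  # ['I', 'love', 'Berlin', '.']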
def correct_who_to_whom(text):
    doc = nlp(text)
    phrases = []
    token_number = 0
    for token in doc:
        token_number += 1
        # it is very difficult for the named entity recognizer to recognize 'Who'
        # in isolation - the motivating text was the repeated exclamation of
        # 'Who! Who!' in a Grinch fan fiction.
        if token.text.lower() in ['grinch', 'whoville']:
            return
        if token.text.lower() == 'who':
            if token.dep_ in ['dobj', 'iobj', 'pobj']:
                # check for the hard-coded exceptions
                if not check_for_exceptions(doc, token):
                    should_be_whom = True
                    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
                    tagger.predict(sentence)
                    # make sure it is not part of a named entity
                    for entity in sentence.get_spans('ner'):
                        if token.idx >= entity.start_pos and token.idx <= entity.end_pos:
                            should_be_whom = False
                    if should_be_whom:
                        phrase_start, phrase_end, whom_first = i_span(doc, token)
                        if whom_first:
                            # detokenizes the corrected excerpt (e.g. removes the added space
                            # between the last word in a sentence and punctuation, rejoins
                            # "don" and "'t" to form "don't", etc., only if such joins were
                            # present in the original text)
                            phrase = whom_string(token.text) + (
                                ''.join([tkn.text_with_ws for tkn in doc[phrase_start:phrase_end + 1]])
                            )[3:]
                            phrases.append(phrase)
                        else:
                            # same detokenization, with the corrected 'whom' appended at the end
                            phrase = ''.join(
                                [tkn.text_with_ws for tkn in doc[phrase_start:phrase_end]]
                            ) + whom_string(token.text)
                            phrases.append(phrase)
    # if any corrections were found, then print the original text and the corrections.
    if phrases:
        joined_phrases = '\n\n'.join(phrases)
        print('<<< TEXT >>>')
        print(text)
        print('<<< CORRECTIONS >>>')
        print(joined_phrases)
        print()
        print()