def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.", use_tokenizer=True)
    dev_sentence = Sentence("I'm a dev sentence.", use_tokenizer=True)
    test_sentence = Sentence("I will be only used for testing.", use_tokenizer=True)

    corpus: TaggedCorpus = TaggedCorpus([train_sentence], [dev_sentence], [test_sentence])

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence('sentence 1', labels=['class_1'])
    sentence_2 = Sentence('sentence 2', labels=['class_2'])
    sentence_3 = Sentence('sentence 3', labels=['class_1'])

    corpus: TaggedCorpus = TaggedCorpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert 2 == len(label_dict)
    assert '<unk>' not in label_dict.get_items()
    assert 'class_1' in label_dict.get_items()
    assert 'class_2' in label_dict.get_items()
def read_column_data(path_to_column_file: str,
                     column_name_map: Dict[int, str],
                     infer_whitespace_after: bool = True):
    """
    Reads a file in column format and produces a list of Sentence with token-level annotation as
    specified in the column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}"
    as column_name_map you specify that the first column is the text (lexical value) of the token,
    the second the PoS tag, the third the chunk and the fourth the NER tag.
    :param path_to_column_file: the path to the column file
    :param column_name_map: a map of column number to token annotation name
    :param infer_whitespace_after: if True, tries to infer the whitespace_after field for each Token
    :return: list of sentences
    """
    sentences: List[Sentence] = []

    lines: List[str] = open(path_to_column_file).read().strip().split('\n')

    # most data sets have the token text in the first column; if not, pass 'text' as column
    text_column: int = 0
    for column in column_name_map:
        if column_name_map[column] == 'text':
            text_column = column

    sentence: Sentence = Sentence()
    for line in lines:

        if line.startswith('#'):
            continue

        if line == '':
            if len(sentence) > 0:
                sentence._infer_space_after()
                sentences.append(sentence)
            sentence: Sentence = Sentence()

        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[text_column])
            for column in column_name_map:
                if len(fields) > column:
                    if column != text_column:
                        token.add_tag(column_name_map[column], fields[column])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentence._infer_space_after()
        sentences.append(sentence)

    return sentences
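# Usage sketch for read_column_data (illustrative, not part of the original code): the file path
# below is a hypothetical four-column, whitespace-separated file; the column_name_map mirrors the
# mapping described in the docstring above.
column_sentences = read_column_data('resources/tasks/example_column_file.txt',
                                    column_name_map={0: 'text', 1: 'pos', 2: 'np', 3: 'ner'})
print(column_sentences[0].to_tagged_string())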
def test_sentence_infer_tokenization():
    sentence: Sentence = Sentence()
    sentence.add_token(Token('xyz'))
    sentence.add_token(Token('"'))
    sentence.add_token(Token('abc'))
    sentence.add_token(Token('"'))
    sentence._infer_space_after()

    assert 'xyz " abc "' == sentence.to_tokenized_string()
    assert 'xyz "abc"' == sentence.to_plain_string()

    sentence: Sentence = Sentence('xyz " abc "')
    sentence._infer_space_after()

    assert 'xyz " abc "' == sentence.to_tokenized_string()
    assert 'xyz "abc"' == sentence.to_plain_string()
def test_training():
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus('resources/corpora/lorem_ipsum', dictionary, language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
    trainer.train('./results', sequence_length=10, mini_batch_size=10, max_epochs=5)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    print(sentence[1].embedding.size())

    # clean up results directory
    shutil.rmtree('./results')
def test_create_sentence_without_tokenizer():
    sentence: Sentence = Sentence('I love Berlin.')

    assert 3 == len(sentence.tokens)
    assert 'I' == sentence.tokens[0].text
    assert 'love' == sentence.tokens[1].text
    assert 'Berlin.' == sentence.tokens[2].text
def init_document_embeddings():
    text = 'I love Berlin. Berlin is a great place to live.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    return sentence, glove, charlm
def test_sentence_whitespace_tokenization():
    sentence: Sentence = Sentence('I love Berlin .')

    assert 4 == len(sentence.tokens)
    assert 'I' == sentence.get_token(1).text
    assert 'love' == sentence.get_token(2).text
    assert 'Berlin' == sentence.get_token(3).text
    assert '.' == sentence.get_token(4).text
def test_sentence_get_item():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert sentence.get_token(1) == sentence[0]
    assert sentence.get_token(3) == sentence[2]

    with pytest.raises(IndexError):
        token = sentence[4]
def test_create_sentence_with_tokenizer():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert 4 == len(sentence.tokens)
    assert 'I' == sentence.tokens[0].text
    assert 'love' == sentence.tokens[1].text
    assert 'Berlin' == sentence.tokens[2].text
    assert '.' == sentence.tokens[3].text
def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
    """
    Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation.
    :param path_to_conll_file: the path to the conll-u file
    :return: list of sentences
    """
    sentences: List[Sentence] = []

    lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
        read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:

        fields: List[str] = re.split(r"\s+", line)
        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence: Sentence = Sentence()

        elif line.startswith('#'):
            continue
        elif '.' in fields[0]:
            continue
        elif '-' in fields[0]:
            continue
        else:
            token = Token(fields[1], head_id=int(fields[6]))
            token.add_tag('lemma', str(fields[2]))
            token.add_tag('upos', str(fields[3]))
            token.add_tag('pos', str(fields[4]))
            token.add_tag('dependency', str(fields[7]))

            for morph in str(fields[5]).split('|'):
                if '=' not in morph:
                    continue
                token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

            if len(fields) > 10 and str(fields[10]) == 'Y':
                token.add_tag('frame', str(fields[11]))

            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
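# Usage sketch for read_conll_ud (illustrative, not part of the original code): the path points to a
# hypothetical CoNLL-U file; each returned Sentence carries the lemma, upos, pos, dependency and
# morphological tags set by the reader above.
ud_sentences = read_conll_ud('resources/tasks/example.conllu')
print(ud_sentences[0].to_tagged_string())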
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence('I love Berlin.', labels=['class_1'], use_tokenizer=True)
    dev_sentence = Sentence('The sun is shining.', labels=['class_2'], use_tokenizer=True)
    test_sentence = Sentence('Berlin is sunny.', labels=['class_1', 'class_2'], use_tokenizer=True)

    class_to_count_dict = TaggedCorpus._get_classes_to_count([train_sentence, dev_sentence, test_sentence])

    assert 'class_1' in class_to_count_dict
    assert 'class_2' in class_to_count_dict
    assert 2 == class_to_count_dict['class_1']
    assert 2 == class_to_count_dict['class_2']

    tokens_in_sentences = TaggedCorpus._get_tokens_per_sentence([train_sentence, dev_sentence, test_sentence])

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
def test_tagged_corpus_downsample():
    sentence = Sentence('I love Berlin.', labels=[Label('class_1')], use_tokenizer=True)

    corpus: TaggedCorpus = TaggedCorpus(
        [sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence], [], [])

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert 3 == len(corpus.train)
def load_and_apply_char_lm_embeddings(emb_type: str):
    text = 'I love Berlin.'
    sentence: Sentence = Sentence(text)

    embeddings: TokenEmbeddings = CharLMEmbeddings(emb_type)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) != 0

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0
def test_sentence_to_tagged_string():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)
    token3.add_tag('ner', 'LOC')

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert 'I love Berlin <LOC>' == sentence.to_tagged_string()
def test_get_head():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert token2 == token3.get_head()
    assert token1 == token2.get_head()
    assert token1.get_head() is None
def test_tag_sentence():
    # test tagging
    sentence = Sentence('I love Berlin')

    tagger = SequenceTagger.load('ner')
    tagger.predict(sentence)

    # test re-tagging
    tagger = SequenceTagger.load('pos')
    tagger.predict(sentence)
def test_document_mean_embeddings():
    text = 'I love Berlin. Berlin is a great place to live.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings([glove, charlm])
    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) != 0

    sentence.clear_embeddings()

    assert len(sentence.get_embedding()) == 0
def test_sentence_to_real_string():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert 'I love Berlin.' == sentence.to_plain_string()

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.GERMEVAL)

    sentence = corpus.train[0]
    assert 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .' == sentence.to_tokenized_string()
    assert 'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".' == sentence.to_plain_string()

    sentence = corpus.train[1]
    assert 'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf .' == sentence.to_tokenized_string()
    assert 'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf.' == sentence.to_plain_string()
def test_stacked_embeddings():
    text = 'I love Berlin.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    news: TokenEmbeddings = WordEmbeddings('en-news')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    embeddings: StackedEmbeddings = StackedEmbeddings([glove, news, charlm])
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) != 0

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence('used in training. training is cool.', use_tokenizer=True)

    corpus: TaggedCorpus = TaggedCorpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert '<unk>' in vocab.get_items()
    assert 'training' in vocab.get_items()
    assert '.' in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert '<unk>' in vocab.get_items()
    assert 'training' in vocab.get_items()
    assert '.' in vocab.get_items()
def test_text_classifier_multi_label():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings([glove_embedding])

    model = TextClassifier(document_embeddings, label_dict, True)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.name is not None
            assert 0.0 <= l.confidence <= 1.0
            assert type(l.confidence) is float

    # clean up results directory
    shutil.rmtree('./results')
def read_text_classification_file(path_to_file):
    """
    Reads a data file for text classification. The file should contain one document/text per line.
    The line should have the following format:
    __label__<class_name> <text>
    If you have a multi-class task, you can have as many labels as you want at the beginning of the line, e.g.,
    __label__<class_name_1> __label__<class_name_2> <text>
    :param path_to_file: the path to the data file
    :return: list of sentences
    """
    label_prefix = '__label__'
    sentences = []

    with open(path_to_file) as f:
        lines = f.readlines()

    for line in lines:
        words = line.split()

        labels = []
        l_len = 0

        for i in range(len(words)):
            if words[i].startswith(label_prefix):
                l_len += len(words[i]) + 1
                label = words[i].replace(label_prefix, "")
                labels.append(label)
            else:
                break

        text = line[l_len:].strip()

        if text and labels:
            sentences.append(Sentence(text, labels=labels, use_tokenizer=True))

    return sentences
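# Usage sketch for read_text_classification_file (illustrative, not part of the original code): the
# path is hypothetical; each line of the file is expected to look like
# "__label__POSITIVE I love Berlin ." as described in the docstring above.
classification_sentences = read_text_classification_file('resources/tasks/example_classification.txt')
print(classification_sentences[0].labels)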
from flairrelex.data import Sentence
from flairrelex.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger.load('ner')

sentence: Sentence = Sentence('George Washington went to Washington .')

tagger.predict(sentence)

print('Analysing %s' % sentence)
print('\nThe following NER tags are found: \n')
print(sentence.to_tagged_string())
def test_sentence_to_plain_string():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert 'I love Berlin .' == sentence.to_tokenized_string()