def test_train_charlm_changed_cache_load_use_tagger(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # make a temporary cache directory that we remove afterwards
    cache_dir = results_base_path / 'cache'
    os.makedirs(cache_dir, exist_ok=True)
    embeddings = CharLMEmbeddings('news-forward-fast', cache_directory=cache_dir)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2)

    # remove the cache directory
    shutil.rmtree(cache_dir)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MACRO_F1_SCORE, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = CharLMEmbeddings('news-forward-fast')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE, learning_rate=0.1,
                  mini_batch_size=2, max_epochs=2, test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
# Check if our dataset reader works
assert len(sentences_train) == number_train_tweets
assert len(sentences_dev) == number_dev_tweets
assert len(sentences_test) == number_test_tweets

corpus: TaggedCorpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

tag_type = 'pos'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharLMEmbeddings('german-forward'),
    CharLMEmbeddings('german-backward'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 24, 32])
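# The excerpt above defines the search space but stops before running the search.
# A minimal sketch of how it would typically be consumed, assuming the
# SequenceTaggerParamSelector from the same param_selection module (the base
# path, epoch budget and max_evals below are assumptions, not from the original):
from flair.hyperparameter.param_selection import SequenceTaggerParamSelector

param_selector = SequenceTaggerParamSelector(corpus,
                                             tag_type,
                                             'resources/results',  # hypothetical output path
                                             max_epochs=50)

# run the hyperopt search over the space defined above
param_selector.optimize(search_space, max_evals=100)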
""" from typing import List from flair.embeddings import StackedEmbeddings, CharLMEmbeddings, TokenEmbeddings from flair.models import SequenceTagger from flair.trainers import SequenceTaggerTrainer from flair.data import TaggedCorpus from models import FORWARD_LM, BACKWARD_LM, GLOVE from embeddings import KeyedWordEmbeddings from ne_groups import GROUPS from corpora import read_group embedding_types: List[TokenEmbeddings] = [ KeyedWordEmbeddings(GLOVE), CharLMEmbeddings(FORWARD_LM), CharLMEmbeddings(BACKWARD_LM) ] embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) for entities in GROUPS: corpus: TaggedCorpus = read_group(entities) tag_dictionary = corpus.make_tag_dictionary(tag_type='ner') tagger: SequenceTagger = SequenceTagger(hidden_size=512, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type='ner', use_crf=True) trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus,
tag_to_biloes='ner')
tag_type = 'ner'

tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(corpus)

word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.nl.vec', binary=False)
word_vectors.save('wiki.nl.vec.gensim')

custom_embedding = WordEmbeddings('wiki.nl.vec.gensim')
char_lm_forward = CharLMEmbeddings('lm-nl-large-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-nl-large-backward-v0.1.pt')

embedding_types: List[TokenEmbeddings] = [
    custom_embedding,
    char_lm_forward,
    char_lm_backward
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=512,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)
"universal-dependencies-1.2/UD_Bulgarian/bg-ud-test.conllu") corpus: TaggedCorpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test) tag_type = 'upos' tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.bg.vec', binary=False) word_vectors.save('wiki.bg.vec.gensim') custom_embedding = WordEmbeddings('custom', 'wiki.bg.vec.gensim') char_lm_forward = CharLMEmbeddings('lm-bg-small-forward-v0.1.pt') char_lm_backward = CharLMEmbeddings('lm-bg-small-backward-v0.1.pt') embedding_types: List[TokenEmbeddings] = [ custom_embedding, char_lm_forward, char_lm_backward ] embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) from flair.models import SequenceTagger tagger: SequenceTagger = SequenceTagger(hidden_size=512, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True)
def test_loading_not_existing_char_lm_embedding():
    with pytest.raises(ValueError):
        CharLMEmbeddings('other')
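# For contrast, a hedged sketch of arguments that are expected to load: either a
# known model identifier or a path to a trained language model checkpoint, as the
# other excerpts use (the local path below is hypothetical):
CharLMEmbeddings('news-forward-fast')                    # known identifier, downloaded on demand
CharLMEmbeddings('resources/taggers/my_lm/best-lm.pt')   # hypothetical local checkpoint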
# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TextEmbeddings] = [
    WordEmbeddings('glove'),
    # comment in this line to use character embeddings
    CharacterEmbeddings(),
    # comment in these lines to use contextual string embeddings
    CharLMEmbeddings('news-forward'),
    CharLMEmbeddings('news-backward')
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.tagging_model import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

if torch.cuda.is_available():
    tagger = tagger.cuda()
# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.CONLL_03).downsample(0.1)
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    CharLMEmbeddings('/cl/work/shusuke-t/flair_myLM/resources/taggers/language_model_sub/best-lm.pt'),
    CharLMEmbeddings('/cl/work/shusuke-t/flair_myLM/resources/taggers/language_model_sub_back/best-lm.pt'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# initialize trainer
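# The excerpt cuts off at the trainer step its last comment announces. A minimal
# sketch of how it would typically continue, assuming the ModelTrainer API used
# in the test excerpts above (output path and hyperparameters are assumptions):
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-ner',  # hypothetical output path
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)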