Example #1
def test_train_charlm_changed_cache_load_use_tagger(results_base_path,
                                                     tasks_base_path):

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION,
                                           base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # make a temporary cache directory that we remove afterwards
    cache_dir = results_base_path / 'cache'
    os.makedirs(cache_dir, exist_ok=True)
    embeddings = CharLMEmbeddings('news-forward-fast',
                                  cache_directory=cache_dir)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                           corpus,
                                                           test_mode=True)

    trainer.train(str(results_base_path),
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2)

    # remove the cache directory
    shutil.rmtree(cache_dir)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
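The point of this test is the `cache_directory` argument: `CharLMEmbeddings` can memoize computed embeddings on disk so repeated passes over the same data are cheaper, and a trained model must keep working after the cache is deleted. A minimal sketch of using such a cached embedding directly (the cache path is a placeholder):

from flair.data import Sentence
from flair.embeddings import CharLMEmbeddings

# compute contextual string embeddings, caching states under 'cache/'
embeddings = CharLMEmbeddings('news-forward-fast', cache_directory='cache/')

sentence = Sentence('I love Berlin')
embeddings.embed(sentence)  # first pass computes and caches

for token in sentence:
    print(token.text, token.embedding.shape)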
Example #2
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):

    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB,
                                            base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=2,
                  test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file(results_base_path /
                                                 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
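After `predict` runs, the labels are attached to the `Sentence` objects themselves; as the assertions above suggest, each label carries a string value and a confidence score in [0, 1]. A short usage sketch:

sentence = Sentence('I love Berlin')
loaded_model.predict(sentence)

for label in sentence.labels:
    print(label.value, label.score)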
Example #3
def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):

    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = CharLMEmbeddings('news-forward-fast')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
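For a tagger, `predict` writes the tags onto the tokens; `to_tagged_string` renders them inline, which is handy for eyeballing results (a sketch, not part of the original test):

sentence = Sentence('I love Berlin')
loaded_model.predict(sentence)

# prints the sentence with predicted NER tags inline, e.g. 'I love Berlin <S-LOC>'
print(sentence.to_tagged_string())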
Example #4
# Check that our dataset reader works
assert len(sentences_train) == number_train_tweets
assert len(sentences_dev) == number_dev_tweets
assert len(sentences_test) == number_test_tweets

corpus: TaggedCorpus = TaggedCorpus(sentences_train, sentences_dev,
                                    sentences_test)

tag_type = 'pos'

tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharLMEmbeddings('german-forward'),
    CharLMEmbeddings('german-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 24, 32])
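The snippet stops after defining the search space. In flair, such a search space is consumed by a parameter selector that samples configurations and trains a tagger for each one. A sketch, assuming flair's hyperparameter API (`SequenceTaggerParamSelector`; the result path is a placeholder):

from flair.hyperparameter.param_selection import SequenceTaggerParamSelector

# trains `training_runs` models per sampled configuration and logs the results
param_selector = SequenceTaggerParamSelector(corpus,
                                             tag_type,
                                             'resources/results',
                                             max_epochs=50,
                                             training_runs=3)
param_selector.optimize(search_space, max_evals=100)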
Example #5
"""

from typing import List
from flair.embeddings import StackedEmbeddings, CharLMEmbeddings, TokenEmbeddings
from flair.models import SequenceTagger
from flair.trainers import SequenceTaggerTrainer
from flair.data import TaggedCorpus

from models import FORWARD_LM, BACKWARD_LM, GLOVE
from embeddings import KeyedWordEmbeddings
from ne_groups import GROUPS
from corpora import read_group

embedding_types: List[TokenEmbeddings] = [
    KeyedWordEmbeddings(GLOVE),
    CharLMEmbeddings(FORWARD_LM),
    CharLMEmbeddings(BACKWARD_LM)
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

for entities in GROUPS:
    corpus: TaggedCorpus = read_group(entities)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    tagger: SequenceTagger = SequenceTagger(hidden_size=512,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=True)
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus)
    # the snippet is cut off here; a training call such as
    # trainer.train('resources/taggers/ner-group', max_epochs=150) would follow
Example #6
# the head of this snippet is truncated: it ends a corpus-loading call that
# converts the NER tags to the BILOES scheme; a hypothetical reconstruction:
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    'path/to/dutch-ner-data',      # hypothetical data folder
    {0: 'text', 1: 'ner'},         # hypothetical column format
    tag_to_biloes='ner')

tag_type = 'ner'

tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

print(corpus)

word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.nl.vec', binary=False)
word_vectors.save('wiki.nl.vec.gensim')

custom_embedding = WordEmbeddings('wiki.nl.vec.gensim')

char_lm_forward = CharLMEmbeddings('lm-nl-large-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-nl-large-backward-v0.1.pt')

embedding_types: List[TokenEmbeddings] = [
    custom_embedding,
    char_lm_forward,
    char_lm_backward
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=512,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)
Example #7
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-test.conllu")

corpus: TaggedCorpus = TaggedCorpus(sentences_train, sentences_dev,
                                    sentences_test)

tag_type = 'upos'

tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.bg.vec',
                                                               binary=False)
word_vectors.save('wiki.bg.vec.gensim')

custom_embedding = WordEmbeddings('custom', 'wiki.bg.vec.gensim')

char_lm_forward = CharLMEmbeddings('lm-bg-small-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-bg-small-backward-v0.1.pt')

embedding_types: List[TokenEmbeddings] = [
    custom_embedding, char_lm_forward, char_lm_backward
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=512,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)
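Example #7 ends after constructing the tagger; training it would follow the same pattern as the other snippets on this page (a sketch using the old `SequenceTaggerTrainer` API, with a placeholder output path):

from flair.trainers import SequenceTaggerTrainer

trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus)
trainer.train('resources/taggers/upos-bulgarian',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)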
Example #8
def test_loading_not_existing_char_lm_embedding():
    with pytest.raises(ValueError):
        CharLMEmbeddings('other')
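`CharLMEmbeddings` accepts either a known model identifier or a path to a trained language-model checkpoint; any other string raises `ValueError`, which is exactly what this test pins down. For contrast (the checkpoint path is hypothetical):

CharLMEmbeddings('news-forward-fast')                            # known identifier, resolves to a pretrained model
CharLMEmbeddings('resources/taggers/language_model/best-lm.pt')  # local checkpoint path
CharLMEmbeddings('other')                                        # neither: raises ValueError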
Example #9
File: train.py Project: younengma/flair
# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TextEmbeddings] = [
    WordEmbeddings('glove'),

    # comment out this line to drop character embeddings
    CharacterEmbeddings(),

    # comment out these lines to drop contextual string embeddings
    CharLMEmbeddings('news-forward'),
    CharLMEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.tagging_model import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)
if torch.cuda.is_available():
    tagger = tagger.cuda()
Example #10
# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.CONLL_03).downsample(0.1)
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings

embedding_types: List[TokenEmbeddings] = [
    CharLMEmbeddings('/cl/work/shusuke-t/flair_myLM/resources/taggers/language_model_sub/best-lm.pt'),
    CharLMEmbeddings('/cl/work/shusuke-t/flair_myLM/resources/taggers/language_model_sub_back/best-lm.pt'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# initialize trainer (the snippet is cut off here; with this flair API the
# next step would be)
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)