def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Train a tagger with checkpointing enabled, then resume training from the checkpoint."""
    # Build a multi-corpus from a column-formatted fashion corpus plus GermEval.
    fashion_corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"},
    )
    germeval_corpus = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    combined = MultiCorpus([fashion_corpus, germeval_corpus])

    ner_dictionary = combined.make_tag_dictionary("ner")
    turian_embeddings = WordEmbeddings("turian")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=ner_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # First training run writes checkpoint.pt into the results directory.
    ModelTrainer(tagger, combined).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # Reload the serialized checkpoint and continue training from it.
    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    resumed_trainer = ModelTrainer.load_from_checkpoint(checkpoint, combined)
    resumed_trainer.train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # clean up results directory
    shutil.rmtree(results_base_path)
# Exemplo n.º 2
# 0
def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
    """Verify that sequence-tagger training can be checkpointed and resumed."""
    # Assemble the training data: fashion column corpus + GermEval.
    column_map = {0: 'text', 2: 'ner'}
    corpora = [
        flair.datasets.ColumnCorpus(data_folder=tasks_base_path / 'fashion',
                                    column_format=column_map),
        flair.datasets.GERMEVAL(base_path=tasks_base_path),
    ]
    corpus = MultiCorpus(corpora)

    tag_dictionary = corpus.make_tag_dictionary('ner')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=WordEmbeddings('turian'),
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False)

    # Initial training pass, persisting a checkpoint to the results folder.
    ModelTrainer(tagger, corpus).train(results_base_path,
                                       max_epochs=2,
                                       shuffle=False,
                                       checkpoint=True)

    # Resume from the serialized checkpoint and train a second time.
    state = SequenceTagger.load_checkpoint(results_base_path / 'checkpoint.pt')
    resumed = ModelTrainer.load_from_checkpoint(state, corpus)
    resumed.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # Remove artifacts produced by the test run.
    shutil.rmtree(results_base_path)
# Exemplo n.º 3
# 0
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    """Train a tagger on a multi-corpus, reload it from disk, and run predictions."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / 'fashion',
        column_format={0: 'text', 2: 'ner'},
    )
    germeval = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([fashion, germeval])

    dictionary = corpus.make_tag_dictionary('ner')
    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings('turian'),
        tag_dictionary=dictionary,
        tag_type='ner',
        use_crf=False,
    )

    # Short training run that writes 'final-model.pt' into the results folder.
    ModelTrainer(tagger, corpus).train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    # The reloaded model must handle single, batched, and whitespace-only input.
    reloaded = SequenceTagger.load(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    reloaded.predict(sentence)
    reloaded.predict([sentence, sentence_empty])
    reloaded.predict([sentence_empty])

    # Remove training artifacts.
    shutil.rmtree(results_base_path)
# Exemplo n.º 4
# 0
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    """End-to-end check: train on a MultiCorpus, persist, reload, and predict."""
    # Data: column-format fashion corpus combined with GermEval.
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"},
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])

    tag_dictionary = corpus.make_tag_dictionary("ner")
    embeddings = WordEmbeddings("turian")

    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type="ner",
                            use_crf=False)

    # initialize trainer and run a short training pass
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  shuffle=False)

    # Reload the persisted final model and make sure prediction works on
    # regular sentences, mixed batches, and whitespace-only sentences alike.
    loaded_model = SequenceTagger.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
# Exemplo n.º 5
# 0
    print(" - Corpus Train Size: ", len(corpus.train), flush=True)
    print(" - Corpus Dev Size: ", len(corpus.dev), flush=True)
    print(" - Corpus Test Size: ", len(corpus.test), flush=True)
    print('-' * 150, flush=True)

# initialize embeddings
# Stack static word vectors with forward/backward Flair character LMs so the
# tagger sees both context-free and contextual token representations.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('extvec'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# Merge every POS corpus into one multi-corpus for joint training.
# NOTE(review): ALL_POS_ALL_CORPUS is defined outside this chunk — presumably a
# mapping from domain name to corpus; verify against the surrounding file.
multi_all_corpus = MultiCorpus(ALL_POS_ALL_CORPUS.values())
# 3. make the tag dictionary from the corpus
tag_dictionary = multi_all_corpus.make_tag_dictionary(tag_type=tag_type)
print('-' * 50,
      '\nTag_dictionary size: ',
      len(tag_dictionary),
      '\n',
      '-' * 50,
      flush=True)
# initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

# One output directory per target domain; the loop body continues beyond this
# chunk, so only the directory setup is visible here.
for target_domain in ALL_POS_DOMAINS:

    train_output_dir = 'resources/taggers/example-pos/' + target_domain
# Exemplo n.º 6
# 0
# Turkish de/da/te/ta clitic NER setup: combine two column-format corpora
# (full dataset and an 84-token-max variant) and train a BiLSTM-CRF tagger.
from flair.data import TaggedCorpus, MultiCorpus
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
from flair.data import Dictionary
import flair, torch
# Force CPU execution regardless of CUDA availability.
flair.device = torch.device('cpu') 

# Column 0 holds the token text, column 1 the NER label.
columns = {0: 'text', 1: 'ner'}
data_folder = '../'
corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.train.txt", test_file="de-da-te-ta.10E-4percent.conll.test.txt", dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")
corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt", test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt", dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")
corpus = MultiCorpus([corpus1, corpus2])
tag_type = 'ner'
# Build the label dictionary from the combined corpora.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
#tag_dictionary: Dictionary = Dictionary.load('../vocab/m.model')

# Pretrained vectors loaded from local gensim files.
glove_embedding = WordEmbeddings('../../glove/GLOVE/GloVe/vectors.gensim')
word2vec_embedding = WordEmbeddings('../../huawei_w2v/vector.gensim')

#bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
# Stack the Turkish fastText vectors with GloVe and word2vec embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('tr'), glove_embedding, word2vec_embedding]
#embedding_types: List[TokenEmbeddings] = [custom_embedding]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

# Two-layer BiLSTM with a CRF decoding layer on top.
tagger: SequenceTagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True, use_rnn=True, rnn_layers=2)

from flair.trainers import ModelTrainer
    FlairEmbeddings("pubmed-backward"),

]

# Stack the embedding types declared above into a single embedding layer.
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. Initialize corpus
# We also train on the test portions of the corpora, because we evaluate on held-out corpora
from flair.data import MultiCorpus
from torch.utils.data import ConcatDataset
# NOTE(review): GENE_CORPORA is defined outside this chunk — presumably an
# iterable of corpora; verify against the surrounding file.
corpus = MultiCorpus(GENE_CORPORA)
# NOTE(review): folding the test split into training data reaches into the
# private _train/_test attributes — confirm this survives flair upgrades.
corpus._train = ConcatDataset([corpus._train, corpus._test])

# 4. Initialize sequence tagger
from flair.models import SequenceTagger
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

# BiLSTM-CRF tagger with locked (variational) dropout.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    use_crf=True,
    locked_dropout=0.5
)

# 5. train the model
from flair.trainers import ModelTrainer
trainer = ModelTrainer(tagger, corpus)

trainer.train(