def test_sequence_classifier_trainer():
    """Smoke-test constructing a SequenceClassifierTrainer on the TREC-6 corpus.

    Builds BERT-based document embeddings and verifies the trainer can be
    wired together from corpus + encoder + column map (no training is run).
    """
    corpus = TREC_6()

    # EasyDocumentEmbeddings can take a variable number of embeddings to make
    # "Stacked Embeddings". Custom Transformers LM models can be used by
    # specifying the path to the language model instead of a model name.
    doc_embeddings = EasyDocumentEmbeddings("bert-base-cased", methods=["rnn"])

    # Instantiate the trainer by loading in the data, the data column map,
    # and the embeddings as an encoder.
    trainer = SequenceClassifierTrainer(corpus=corpus,
                                        encoder=doc_embeddings,
                                        column_name_map={
                                            0: "text",
                                            1: "label"
                                        })

    # The original ended with a bare `trainer` expression (a no-op, likely a
    # scrape truncation of `trainer.train(...)`); assert construction instead.
    assert trainer is not None
# Example no. 2 (scraped-snippet separator; was raw text that broke Python syntax)
    def fine_tune(self):
        """Fine-tune the held document embedding as a TREC-6 text classifier.

        Only transformer document embeddings are supported; any other
        embedding type raises ``UserWarning`` (original behavior preserved).
        """
        # Guard clause: bail out early for unsupported embedding types.
        if not isinstance(self.document_embedding,
                          TransformerDocumentEmbeddings):
            raise UserWarning(
                "No fine tuning for this embedding type implemented")

        corpus = TREC_6()
        label_dict = corpus.make_label_dictionary()
        classifier = TextClassifier(self.document_embedding,
                                    label_dictionary=label_dict)
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

        # Kick off training with fine-tuning-friendly settings.
        trainer.train(
            'resources/taggers/trec',
            learning_rate=3e-5,       # very small learning rate for fine-tuning
            mini_batch_size=16,
            mini_batch_chunk_size=4,  # set this if the transformer is too much for your machine
            max_epochs=5,             # terminate after 5 epochs
        )
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.data import Sentence

# Build a TREC-6 question classifier: corpus -> label dictionary -> embeddings -> model.

# 1. load the corpus
corpus: Corpus = TREC_6()

# 2. derive the label dictionary from it
label_dict = corpus.make_label_dictionary()

# 3. word-level embeddings feeding the document encoder
# (comment in FlairEmbeddings('news-forward') / FlairEmbeddings('news-backward')
#  for state-of-the-art results)
word_embeddings = [WordEmbeddings('glove')]

# 4. document embedding over the word embeddings — an RNN, GRU by default
# (choose another cell via the rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. the text classifier itself
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
from FlairParamOptimizer import search_strategies, search_spaces, orchestrator
import FlairParamOptimizer.parameter_listings.parameters_for_user_input as param
from flair.datasets import TREC_6
from torch.optim import SGD, Adam

# 1.) corpus used for the optimization runs
corpus = TREC_6()

# 2.) search space and grid-search strategy
search_space = search_spaces.TextClassifierSearchSpace()
search_strategy = search_strategies.GridSearch()

# 3.) budget, evaluation metric and optimization objective for this task
search_space.add_budget(param.Budget.TIME_IN_H, 24)
search_space.add_evaluation_metric(param.EvaluationMetric.MICRO_F1_SCORE)
search_space.add_optimization_value(param.OptimizationValue.DEV_SCORE)
search_space.add_max_epochs_per_training_run(15)

# Hyper-parameters to optimize over: trainer and optimizer settings, plus the
# document-RNN embedding size. Registered in the same order as before.
_parameter_options = [
    (param.ModelTrainer.LEARNING_RATE, [0.01, 0.05, 0.1]),
    (param.ModelTrainer.MINI_BATCH_SIZE, [16, 32, 64]),
    (param.ModelTrainer.ANNEAL_FACTOR, [0.25, 0.5]),
    (param.ModelTrainer.OPTIMIZER, [SGD, Adam]),
    (param.Optimizer.WEIGHT_DECAY, [1e-2, 0]),
    (param.DocumentRNNEmbeddings.HIDDEN_SIZE, [128, 256, 512]),
]
for _parameter, _options in _parameter_options:
    search_space.add_parameter(_parameter, options=_options)
from flair.data import MultiCorpus, Sentence
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings, TransformerDocumentEmbeddings
from flair.models import TARSClassifier
from flair.trainers import ModelTrainer



# 1. TARS benefits from natural-language label names, so remap TREC's cryptic tags.
label_name_map = {
    'ENTY': 'question about entity',
    'DESC': 'question about description',
    'ABBR': 'question about abbreviation',
    'HUM': 'question about person',
    'NUM': 'question about number',
    'LOC': 'question about location',
}

# Load the corpus with remapped labels, then downsample to 10% for a quick run.
corpus = TREC_6(label_name_map=label_name_map)
print(corpus)
corpus = corpus.downsample(0.1)
print(corpus)

# Build the label dictionary and a TARS few-shot classifier for this task.
label_dictionary = corpus.make_label_dictionary()
print(label_dictionary)
tagger = TARSClassifier(
    label_dictionary=label_dictionary,
    label_type="label",
    task_name="TEST_CLASS",
)

trainer = ModelTrainer(tagger, corpus)
trainer.train(
    base_path='resources/taggers/tars',
    learning_rate=0.01,
    mini_batch_size=16,
    mini_batch_chunk_size=4,