Пример #1
0
    def load_from_checkpoint(checkpoint_file: Path,
                             model_type: str,
                             corpus: Corpus,
                             optimizer: Optimizer = SGD):
        """Build a ModelTrainer from a saved training checkpoint.

        :param checkpoint_file: checkpoint location, handed to the model
            class' ``load_checkpoint``.
        :param model_type: either ``'SequenceTagger'`` or ``'TextClassifier'``;
            selects which class reads the checkpoint.
        :param corpus: corpus passed through to the new ModelTrainer.
        :param optimizer: optimizer passed through to the new ModelTrainer
            (defaults to SGD).
        :return: a ModelTrainer initialised with the model, epoch, loss and the
            optimizer/scheduler states stored in the checkpoint.
        :raises ValueError: if ``model_type`` is not one of the two supported names.
        """
        # Pick the loader first; both model kinds are turned into a trainer
        # in exactly the same way afterwards.
        if model_type == 'SequenceTagger':
            state = SequenceTagger.load_checkpoint(checkpoint_file)
        elif model_type == 'TextClassifier':
            state = TextClassifier.load_checkpoint(checkpoint_file)
        else:
            raise ValueError(
                'Incorrect model type! Use one of the following: "SequenceTagger", "TextClassifier".'
            )

        return ModelTrainer(
            state['model'],
            corpus,
            optimizer,
            epoch=state['epoch'],
            loss=state['loss'],
            optimizer_state=state['optimizer_state_dict'],
            scheduler_state=state['scheduler_state_dict'])
Пример #2
0
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    """Train a text classifier with checkpointing, then resume from the checkpoint.

    The test passes when both training runs complete without raising; the
    results directory is removed afterwards.
    """
    # Both training runs use exactly the same settings.
    train_options = dict(max_epochs=2, shuffle=False, checkpoint=True)

    imdb_corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    labels = imdb_corpus.make_label_dictionary()

    char_lm = FlairEmbeddings("news-forward-fast", use_cache=False)
    doc_rnn = DocumentRNNEmbeddings([char_lm], 128, 1, False)
    classifier = TextClassifier(doc_rnn, labels, False)

    # First run: writes checkpoint.pt into the results directory.
    first_trainer = ModelTrainer(classifier, imdb_corpus)
    first_trainer.train(results_base_path, **train_options)

    # Second run: rebuild the trainer from the stored checkpoint and continue.
    checkpoint_path = results_base_path / "checkpoint.pt"
    saved_state = TextClassifier.load_checkpoint(checkpoint_path)
    resumed_trainer = ModelTrainer.load_from_checkpoint(saved_state,
                                                        imdb_corpus)
    resumed_trainer.train(results_base_path, **train_options)

    # clean up results directory
    shutil.rmtree(results_base_path)
Пример #3
0
    def load_from_checkpoint(
        checkpoint_file: Path,
        model_type: str,
        corpus: Corpus,
        optimizer: Optimizer = SGD,
    ):
        """Build a ModelTrainer from a saved training checkpoint.

        :param checkpoint_file: checkpoint location, handed to the model
            class' ``load_checkpoint``.
        :param model_type: either ``"SequenceTagger"`` or ``"TextClassifier"``;
            selects which class reads the checkpoint.
        :param corpus: corpus passed through to the new ModelTrainer.
        :param optimizer: optimizer passed through to the new ModelTrainer
            (defaults to SGD).
        :return: a ModelTrainer initialised with the model, epoch, loss and the
            optimizer/scheduler states stored in the checkpoint.
        :raises ValueError: if ``model_type`` is not one of the two supported names.
        """
        # Pick the loader first; both model kinds are turned into a trainer
        # in exactly the same way afterwards.
        if model_type == "SequenceTagger":
            state = SequenceTagger.load_checkpoint(checkpoint_file)
        elif model_type == "TextClassifier":
            state = TextClassifier.load_checkpoint(checkpoint_file)
        else:
            raise ValueError(
                'Incorrect model type! Use one of the following: "SequenceTagger", "TextClassifier".'
            )

        return ModelTrainer(
            state["model"],
            corpus,
            optimizer,
            epoch=state["epoch"],
            loss=state["loss"],
            optimizer_state=state["optimizer_state_dict"],
            scheduler_state=state["scheduler_state_dict"],
        )
Пример #4
0
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    """Train a text classifier with checkpointing, then resume from the checkpoint.

    The test passes when both training runs complete without raising; the
    results directory is removed afterwards.
    """
    # Both training runs use exactly the same settings.
    train_options = dict(max_epochs=2, shuffle=False, checkpoint=True)

    imdb_corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    labels = imdb_corpus.make_label_dictionary()

    char_lm = FlairEmbeddings('news-forward-fast')
    doc_rnn = DocumentRNNEmbeddings([char_lm], 128, 1, False)
    classifier = TextClassifier(doc_rnn, labels, False)

    # First run: writes checkpoint.pt into the results directory.
    first_trainer = ModelTrainer(classifier, imdb_corpus)
    first_trainer.train(results_base_path, **train_options)

    # Second run: rebuild the trainer from the stored checkpoint and continue.
    checkpoint_path = results_base_path / 'checkpoint.pt'
    saved_state = TextClassifier.load_checkpoint(checkpoint_path)
    resumed_trainer = ModelTrainer.load_from_checkpoint(saved_state,
                                                        imdb_corpus)
    resumed_trainer.train(results_base_path, **train_options)

    # Remove everything the two training runs wrote.
    shutil.rmtree(results_base_path)
Пример #5
0
from __future__ import absolute_import
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train a sentiment classifier using the Flair NLP library.

    Reference tutorial:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, GloVe, BERT or ELMo embeddings can be
    stacked along with the Flair embeddings.

    Args:
        file_path: Folder passed to ``ClassificationCorpus`` as the dataset
            location; also used as the training ``base_path`` and as the
            folder from which ``loss.tsv`` and ``weights.txt`` are plotted.
        filenames: ``(train, dev, test)`` file names for the corpus.
        checkpoint: Path of a checkpoint file to resume from. Any falsy value
            (e.g. an empty string) starts training from scratch.
        stack: ``"glove"``, ``"elmo"`` or ``"bert"`` to stack that embedding
            in front of the Flair embeddings; any other value stacks nothing.
        n_epochs: Passed to the trainer as ``max_epochs``.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.visual.training_curves import Plotter

    # Optional extra embedding; imported lazily so that unused backends do not
    # have to be importable.
    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-cased')
    else:
        stacked_embedding = None

    # Define and load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with the optional embedding (kept first).
    # An explicit ``is not None`` check avoids depending on the truthiness of
    # embedding objects.
    word_embeddings = [
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    if stacked_embedding is not None:
        word_embeddings.insert(0, stacked_embedding)

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    if checkpoint:
        # If checkpoint file is defined, resume training
        saved_state = classifier.load_checkpoint(Path(checkpoint))
        model_trainer = ModelTrainer.load_from_checkpoint(saved_state, corpus)
    else:
        model_trainer = ModelTrainer(classifier, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    model_trainer.train(
        base_path=file_path,
        max_epochs=n_epochs,
        checkpoint=True)

    # Plot curves and store weights and losses.
    # ``Path + str`` raises TypeError, so join with ``/``; wrapping in Path()
    # keeps callers that pass a plain string working too.
    output_dir = Path(file_path)
    plotter = Plotter()
    plotter.plot_training_curves(output_dir / 'loss.tsv')
    plotter.plot_weights(output_dir / 'weights.txt')
Пример #7
0
from pathlib import Path