예제 #1
0
def test_train_load_use_tagger(results_base_path, tasks_base_path):
    """Train a small CRF-free NER tagger on the fashion column corpus,
    reload the saved final model, and predict on normal, batched and
    whitespace-only input."""
    fashion_corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"},
    )
    ner_dictionary = fashion_corpus.make_tag_dictionary("ner")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=ner_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # train for two quick epochs without shuffling
    ModelTrainer(tagger, fashion_corpus).train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    reloaded = SequenceTagger.load(results_base_path / "final-model.pt")

    plain = Sentence("I love Berlin")
    blank = Sentence("       ")
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #2
0
def train(data_folder, model_output_folder):
    """Train a CoNLL-03 NER tagger with stacked GloVe + Flair embeddings,
    then plot the training curves and weights into the output folder."""
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(
        NLPTask.CONLL_03, base_path=data_folder)

    # the label type we want to predict
    tag_type = 'ner'

    # tag dictionary derived from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # classic word vectors stacked with forward/backward contextual LMs
    embeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    # sequence tagger on top of the embedding stack
    from flair.models import SequenceTagger
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type)

    # run training
    from flair.trainers import ModelTrainer
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(model_output_folder, mini_batch_size=256, max_epochs=150)

    # plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(model_output_folder + '/loss.tsv')
    plotter.plot_weights(model_output_folder + '/weights.txt')
예제 #3
0
def get_corpus_and_tagger():
    """Build the column corpus from local IOB files plus an untrained
    BiLSTM-CRF NER tagger with GloVe + Flair embeddings.

    NOTE(review): test_file points at the *train* file
    ('train_IOB_Format_file.txt') — looks like a copy/paste slip; confirm
    whether a dedicated test file was intended.
    """
    columns = {0: 'text', 1: 'ner'}
    data_folder = 'data/'

    # init a corpus using column format, data folder and the file names
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train_IOB_Format_file.txt',
                                  test_file='train_IOB_Format_file.txt',
                                  dev_file="dev_IOB_format_file.txt")

    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # GloVe vectors stacked with forward/backward Flair LM embeddings
    # (CharacterEmbeddings() may be added to this list if desired)
    stacked = StackedEmbeddings(embeddings=[
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    # initialize sequence tagger
    from flair.models import SequenceTagger
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=stacked,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    return corpus, tagger
예제 #4
0
def test_train_optimizer(results_base_path, tasks_base_path):
    """Train a NER tagger with the Adam optimizer class wired into the
    trainer, then reload the saved model and run predictions."""
    fashion = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                             base_path=tasks_base_path)
    dictionary = fashion.make_tag_dictionary("ner")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # trainer configured with Adam instead of the default optimizer
    trainer = ModelTrainer(tagger, fashion, optimizer=Adam)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    reloaded = SequenceTagger.load(results_base_path / "final-model.pt")

    plain = Sentence("I love Berlin")
    blank = Sentence("       ")
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #5
0
def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
    """Train with Adam passed as a train() keyword, reload the final
    model, and verify prediction on regular and whitespace-only input."""
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 3: "ner"},
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
        optimizer=Adam,
    )

    # release the training objects before reloading from disk
    del trainer, tagger, tag_dictionary, corpus
    reloaded = SequenceTagger.load(results_base_path / "final-model.pt")

    plain = Sentence("I love Berlin")
    blank = Sentence("       ")
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    del reloaded
예제 #6
0
def train_cat_tagger(output_folder='/home/danielly/workspace'):
    """Train a Catalan POS tagger on the UD_CATALAN corpus.

    Generalization: the previously hard-coded output path is now a
    parameter. The default keeps the original value, so existing callers
    (``train_cat_tagger()``) behave exactly as before.

    :param output_folder: directory where training artifacts and the
        final model are written
    """
    corpus = flair.datasets.UD_CATALAN()
    tag_type = 'pos'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # single Catalan word-embedding wrapped in a stack
    embedding_types = [WordEmbeddings('ca')]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(output_folder,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
예제 #7
0
def get_model(corpus: flair.data.Corpus,
              corpus_name: str,
              pooled_contextual_embeddings: bool,
              contextual_forward_path: str = None,
              contextual_backward_path: str = None):
    """Build an untrained 256-unit NER SequenceTagger whose embedding
    stack is produced by get_embeddings() for the given corpus."""
    dictionary = corpus.make_tag_dictionary(tag_type='ner')

    # embedding stack assembled by the project-level helper
    stack = StackedEmbeddings(embeddings=get_embeddings(
        corpus_name=corpus_name,
        pooled=pooled_contextual_embeddings,
        contextual_forward_path=contextual_forward_path,
        contextual_backward_path=contextual_backward_path))

    return SequenceTagger(hidden_size=256,
                          embeddings=stack,
                          tag_dictionary=dictionary,
                          tag_type='ner')
예제 #8
0
def main(
    data_folder: str,
    model_folder: str,
    dev_size: float,
    nb_epochs: int,
    nb_segment: Optional[int],
    segment: Optional[int],
) -> None:
    """Train a French NER SequenceTagger on a spaCy-tokenized corpus.

    :param data_folder: folder containing the raw training data
    :param model_folder: output folder for the trained model
    :param dev_size: fraction of the data held out as the dev set
    :param nb_segment: total number of data segments (optional)
    :param segment: index of the segment to train on (optional)
    """
    # blank French pipeline used only for tokenization via a custom tokenizer
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_dev_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size, nb_segment=nb_segment, segment=segment
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)

    # French word vectors stacked with forward/backward Flair LM embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings("fr"),
        FlairEmbeddings("fr-forward"),
        FlairEmbeddings("fr-backward"),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256, embeddings=embeddings, use_crf=True, tag_dictionary=tag_dictionary, tag_type="ner"
    )

    trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=False)

    # TODO optimize LR https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
    # embeddings are kept on CPU between epochs to limit GPU memory use
    trainer.train(
        model_folder,
        max_epochs=nb_epochs,
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode="cpu",
        checkpoint=False,
    )
예제 #9
0
파일: ner_flair.py 프로젝트: jt17383/flair
def run(args):
    """Train a WNUT-17 NER tagger (crawl + twitter + Flair embeddings)
    with tensorboard logging, then upload the results."""
    # 1. get the corpus
    corpus: Corpus = WNUT_17()

    # 2. the label type we predict
    tag_type = 'ner'

    # 3. tag dictionary built from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. stack of word- and character-level embeddings
    stacked = StackedEmbeddings(embeddings=[
        WordEmbeddings('crawl'),
        WordEmbeddings('twitter'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    # 5. sequence tagger on top of the stack
    from flair.models import SequenceTagger
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=stacked,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type)

    # 6. train, folding the dev split into training data
    from flair.trainers import ModelTrainer
    trainer = ModelTrainer(tagger, corpus, use_tensorboard=True)
    trainer.train(args.job_dir,
                  train_with_dev=True,
                  max_epochs=args.epochs)

    upload_results(args)
예제 #10
0
def trainModel(serial_no):
    """Train a GloVe-based BiLSTM-CRF NER model on dummy dataset number
    *serial_no* and save it under ``dummy-model/dummy-model-<serial_no>``.

    :param serial_no: id selecting the ``dummy-data/dummy-data-<serial_no>``
        folder (converted to str, so ints and strings both work)
    """
    # define columns of the CoNLL-style data files
    columns = {0: 'text', 1: 'ner'}
    # f-string instead of '+ str(...)' concatenation; same resulting path
    data_folder = f'dummy-data/dummy-data-{serial_no}/'

    # initializing the corpus
    from flair.datasets import ColumnCorpus
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train.txt',
                                  test_file='test.txt',
                                  dev_file='dev.txt')

    # tag to predict
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # use GloVe word embeddings (wrapped in a stack for easy extension)
    from flair.embeddings import WordEmbeddings, StackedEmbeddings
    from typing import List
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # initialize sequence tagger (bi-LSTM, CRF)
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)
    print(tagger)

    # train model
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(f'dummy-model/dummy-model-{serial_no}',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
예제 #11
0
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    """Train a NER tagger on the combined FASHION + GERMEVAL corpora,
    reload the final model, and run predictions."""
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    dictionary = corpus.make_tag_dictionary("ner")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # short deterministic training run
    ModelTrainer(tagger, corpus).train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    reloaded = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )

    plain = Sentence("I love Berlin")
    blank = Sentence("       ")
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #12
0
파일: tagger.py 프로젝트: sadhudgp91/catma
    def train(self, trainfile, devfile, testfile, resfolder, embtype="bert", chunk_len=100, batch_len=8):
        """Train a new 'cat' sequence-tagging model with the settings used
        in project Redewiedergabe.

        *** Not accessible from rwtagger_script and not documented in
        detail. Use at your own risk. ;-) ***

        :param trainfile: path to the training file
        :param devfile: path to the development file
        :param testfile: path to the test file
        :param resfolder: output folder for model, checkpoints and plots
        :param embtype: embedding identifier understood by _get_embeddings
        :param chunk_len: chunk length passed to create_corpus
        :param batch_len: mini-batch size used during training
        :return: None
        """
        emb_name, embeddings = self._get_embeddings(embtype)

        corpus: Corpus = self.create_corpus(trainfile, devfile, testfile, chunk_len)
        tag_dictionary = corpus.make_tag_dictionary(tag_type="cat")

        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(...)` guard
        os.makedirs(resfolder, exist_ok=True)

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type="cat",
                                                use_crf=True,
                                                rnn_layers=2
                                                )
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(resfolder,
                      learning_rate=0.1,
                      mini_batch_size=batch_len,
                      max_epochs=150,
                      checkpoint=True)
        # plot training curves
        plotter = Plotter()
        plotter.plot_training_curves(os.path.join(resfolder, 'loss.tsv'))
        plotter.plot_weights(os.path.join(resfolder, 'weights.txt'))
예제 #13
0
def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
    """Train a tagger on a MultiCorpus with checkpointing, then resume
    training from the saved checkpoint for two more epochs."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"},
    )
    germeval = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([fashion, germeval])

    tag_dictionary = corpus.make_tag_dictionary("ner")

    model = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # first training leg, writing a checkpoint
    ModelTrainer(model, corpus).train(results_base_path,
                                      max_epochs=2,
                                      shuffle=False,
                                      checkpoint=True)

    # resume from the checkpoint and train again
    checkpoint = SequenceTagger.load_checkpoint(results_base_path /
                                                "checkpoint.pt")
    resumed = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    resumed.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #14
0
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    """End-to-end check on 5% of UD_ENGLISH: train a POS tagger, reload
    the final model, and predict on normal and empty sentences."""
    corpus = flair.datasets.UD_ENGLISH().downsample(0.05)
    dictionary = corpus.make_tag_dictionary('pos')

    tagger = SequenceTagger(hidden_size=64,
                            embeddings=WordEmbeddings('turian'),
                            tag_dictionary=dictionary,
                            tag_type='pos',
                            use_crf=False)

    ModelTrainer(tagger, corpus).train(results_base_path,
                                       learning_rate=0.1,
                                       mini_batch_size=32,
                                       max_epochs=2,
                                       shuffle=False)

    reloaded = SequenceTagger.load((results_base_path / 'final-model.pt'))
    plain = Sentence('I love Berlin')
    blank = Sentence('       ')
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])
    shutil.rmtree(results_base_path)
예제 #15
0
def test_train_charlm_nochache_load_use_tagger(results_base_path,
                                               tasks_base_path):
    """Train with cache-less FlairEmbeddings, reload the saved model,
    and run predictions on regular and whitespace-only sentences."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    dictionary = corpus.make_tag_dictionary("ner")

    # character-LM embeddings with the on-disk cache disabled
    lm_embeddings = FlairEmbeddings("news-forward-fast", use_cache=False)

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=lm_embeddings,
        tag_dictionary=dictionary,
        tag_type="ner",
        use_crf=False,
    )

    ModelTrainer(tagger, corpus).train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    reloaded = SequenceTagger.load(results_base_path / "final-model.pt")

    plain = Sentence("I love Berlin")
    blank = Sentence("       ")
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #16
0
    def train(self, training_dir=None):
        """Train a NER SequenceTagger on column-format data under
        ``<training_dir>/data`` and keep the trained tagger on ``self.model``.

        :param training_dir: working directory holding the ``data`` folder
            with sent_train/sent_test/sent_dev files; defaults to
            ``script_dir + "flair" + os.sep`` (assumes ``script_dir`` ends
            with a path separator — TODO confirm)
        """
        from flair.trainers import ModelTrainer

        if training_dir is None:
            training_dir = script_dir + "flair" + os.sep

        # define columns
        columns = {0: "text", 1: "ner"}

        # this is the folder in which train, test and dev files reside
        data_folder = training_dir + "data"

        # init a corpus using column format, data folder and the names of the train, dev and test files
        # note that training data should be unescaped, i.e. plain tokens like "&", not HTML entities
        corpus: Corpus = ColumnCorpus(
            data_folder,
            columns,
            train_file="sent_train.txt",
            test_file="sent_test.txt",
            dev_file="sent_dev.txt",
        )

        print(corpus)

        tag_type = "ner"
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # initialize transformer word embeddings from the alephbert checkpoint
        embeddings: TransformerWordEmbeddings = TransformerWordEmbeddings('onlplab/alephbert-base')

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=128, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True,
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(training_dir, learning_rate=0.1, mini_batch_size=32, max_epochs=50)
        self.model = tagger
예제 #17
0
def test_find_learning_rate(results_base_path, tasks_base_path):
    """Run the learning-rate finder for a few iterations with SGD."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    dictionary = corpus.make_tag_dictionary('ner')

    tagger = SequenceTagger(hidden_size=64,
                            embeddings=WordEmbeddings('turian'),
                            tag_dictionary=dictionary,
                            tag_type='ner',
                            use_crf=False)

    # the LR range test uses plain SGD
    trainer = ModelTrainer(tagger, corpus, optimizer=SGD)
    trainer.find_learning_rate(results_base_path, iterations=5)

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #18
0
    def __init__(self, args, model_name, load_model=False):
        """Set up the Universal Dependencies corpus for this model and,
        unless an existing model is being loaded, build a fresh upos
        SequenceTagger.

        :param args: forwarded to the parent constructor
        :param model_name: forwarded to the parent constructor
        :param load_model: when True, skip building a new tagger and only
            refresh the tag dictionary on the already-loaded model
        """
        super().__init__(args, model_name, load_model)
        # language codes excluded from embedding support (per the name;
        # usage is outside this method)
        self.embeds_unsupported_langs = ("am", "vi")
        (data_folder, train_file, test_file,
         dev_file) = self.format_data("train")
        self.corpus = UniversalDependenciesCorpus(data_folder,
                                                  train_file,
                                                  test_file,
                                                  dev_file,
                                                  split_multiwords=False)
        dictionary = self.corpus.make_tag_dictionary("upos")
        if not load_model:
            embeddings = self.get_embeddings()

            self.model = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=dictionary,
                                        tag_type="upos",
                                        rnn_layers=2,
                                        use_crf=True)
        else:
            # NOTE(review): assumes super().__init__ already populated
            # self.model when load_model is True — confirm in parent class
            self.model.tag_dictionary = dictionary
예제 #19
0
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    """Train a POS tagger on 5% of UD_ENGLISH (via NLPTaskDataFetcher),
    reload it with load_from_file, and check prediction."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH).downsample(0.05)
    dictionary = corpus.make_tag_dictionary("pos")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=dictionary,
        tag_type="pos",
        use_crf=False,
    )

    # quick deterministic training run
    ModelTrainer(tagger, corpus).train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=2,
        test_mode=True,
    )

    reloaded = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )

    plain = Sentence("I love Berlin")
    blank = Sentence("       ")
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #20
0
def test_train_optimizer_arguments(results_base_path, tasks_base_path):
    """Train with AdamW plus an extra optimizer kwarg (weight_decay) to
    check that optimizer arguments are forwarded by train()."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    dictionary = corpus.make_tag_dictionary('ner')

    tagger = SequenceTagger(hidden_size=64,
                            embeddings=WordEmbeddings('glove'),
                            tag_dictionary=dictionary,
                            tag_type='ner',
                            use_crf=False)

    # trainer wired to AdamW; weight_decay is forwarded to the optimizer
    trainer = ModelTrainer(tagger, corpus, optimizer=AdamW)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True,
                  weight_decay=1e-3)

    reloaded = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    plain = Sentence('I love Berlin')
    blank = Sentence('       ')
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #21
0
def test_find_learning_rate(results_base_path, tasks_base_path):
    """Run the learning-rate finder (SGD, 5 iterations) on the fashion
    column corpus using the shared turian embeddings."""
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 3: "ner"},
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(tagger, corpus)
    trainer.find_learning_rate(results_base_path, optimizer=SGD, iterations=5)

    # release everything created by this test
    del trainer, tagger, tag_dictionary, corpus
예제 #22
0
def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
    """Train with checkpointing on a multi-corpus, then resume training
    from the checkpoint file and train again.

    Modernized: dropped the obsolete Python-2 ``u''`` string prefixes —
    all literals are unicode in Python 3, so values are unchanged.
    """
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('glove')
    model = SequenceTagger(hidden_size=64,
                           embeddings=embeddings,
                           tag_dictionary=tag_dictionary,
                           tag_type='ner',
                           use_crf=False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)
    # resume from the serialized checkpoint and train two more epochs
    trainer = ModelTrainer.load_from_checkpoint(
        (results_base_path / 'checkpoint.pt'), 'SequenceTagger', corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)
    shutil.rmtree(results_base_path)
예제 #23
0
def test_train_charlm_nochache_load_use_tagger(results_base_path,
                                               tasks_base_path):
    """Legacy-API variant: train with cache-less CharLMEmbeddings through
    SequenceTaggerTrainer, reload via load_from_file, and predict."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION,
                                           base_path=tasks_base_path)
    dictionary = corpus.make_tag_dictionary('ner')

    # character-LM embeddings with caching disabled
    lm_embeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=lm_embeddings,
                            tag_dictionary=dictionary,
                            tag_type='ner',
                            use_crf=False)

    # legacy trainer class used by this API version
    trainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
    trainer.train(str(results_base_path),
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2)

    reloaded = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    plain = Sentence('I love Berlin')
    blank = Sentence('       ')
    reloaded.predict(plain)
    reloaded.predict([plain, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #24
0
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    """Train a POS tagger on 5% of UD_ENGLISH using the module-level
    turian_embeddings fixture, reload the final model, and predict."""
    corpus = flair.datasets.UD_ENGLISH().downsample(0.05)
    tag_dictionary = corpus.make_tag_dictionary("pos")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="pos",
        use_crf=False,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=2,
        shuffle=False,
    )

    # release training objects before loading the saved model from disk
    del trainer, tagger, tag_dictionary, corpus
    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    # prediction must handle single, batched and whitespace-only input
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
예제 #25
0
def test_train_resume_tagger(results_base_path, tasks_base_path):
    """Train for two epochs with checkpointing, then resume the trainer
    from the checkpoint model and continue until epoch 4."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 3: "ner"},
    )
    germeval = flair.datasets.NER_GERMAN_GERMEVAL(
        base_path=tasks_base_path).downsample(0.1)
    corpus = MultiCorpus([fashion, germeval])

    tag_dictionary = corpus.make_label_dictionary("ner")

    model = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # train model for 2 epochs
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    del model

    # load the checkpoint model and train until epoch 4
    checkpoint_model = SequenceTagger.load(results_base_path / "checkpoint.pt")
    trainer.resume(model=checkpoint_model, max_epochs=4)

    # release the trainer
    del trainer
def test_train_charlm_changed_chache_load_use_tagger(results_base_path, tasks_base_path):
    """Train a tagger whose FlairEmbeddings use a custom cache directory, then
    delete that cache and verify the saved model still loads and predicts —
    i.e. the final model must not depend on the embedding cache."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # temporary embedding cache; removed again before the model is reloaded
    tmp_cache = results_base_path / 'cache'
    os.makedirs(tmp_cache, exist_ok=True)
    embeddings = FlairEmbeddings('news-forward-fast', cache_directory=tmp_cache)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type='ner',
        use_crf=False,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MACRO_ACCURACY,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    # drop the cache first: loading below must succeed without it
    shutil.rmtree(tmp_cache)

    reloaded: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')

    filled_sentence = Sentence('I love Berlin')
    blank_sentence = Sentence('       ')

    reloaded.predict(filled_sentence)
    reloaded.predict([filled_sentence, blank_sentence])
    reloaded.predict([blank_sentence])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #27
0
def train_rus_tagger():
    """Train a BiLSTM-CRF tagger for Russian universal POS tags on the UD corpus."""
    corpus = flair.datasets.UD_RUSSIAN()
    tag_type = 'upos'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # a single FastText embedding, wrapped in a stack for uniform handling
    stacked_embeddings = StackedEmbeddings(embeddings=[
        FastTextEmbeddings(
            '/home/danielly/workspace/trained_pos_models/ru/ru.bin'),
    ])

    pos_tagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    ModelTrainer(pos_tagger, corpus).train(
        '/home/danielly/workspace',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150,
    )
예제 #28
0
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    """Train a small NER tagger on a multi-corpus, reload it from disk, and
    smoke-test prediction on a normal sentence, an empty sentence, and a
    mixed batch of both.

    Fix: dropped the Python-2-era ``u''`` string prefixes (redundant on
    Python 3 — runtime string values are unchanged) and added documentation
    and blank-line structure for consistency with the other tests here.

    :param results_base_path: directory where the trained model is written
    :param tasks_base_path: directory containing the test corpora
    """
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False)

    trainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    loaded_model = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
예제 #29
0
# NOTE(review): this section is a flat training script, not a function — it
# executes at module scope and relies on `corpus` and `tag_type` being defined
# earlier in the file; confirm against the preceding section.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings: byte-pair embeddings only (GloVe left disabled)
embedding_types: List[TokenEmbeddings] = [
    #WordEmbeddings('glove'),
    BytePairEmbeddings('en')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger: BiLSTM-CRF over the stacked embeddings
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training; model and logs are written to resources/taggers/bpe
trainer.train('resources/taggers/bpe',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=100)
예제 #30
0
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size,
                                  skf_split_no):
    """Train a POS sequence-labeling model on one cross-validation split.

    The model predicts a part-of-speech tag per token from stacked embeddings
    combining:

    - forward and backward Polish Flair embeddings (contextual string
      embeddings, modelling words as character sequences in context), and
    - two One Hot embeddings built from the corpus vocabulary: one over the
      ``is_separator`` column (was the token preceded by a separator) and one
      over the ``proposed_tags`` column (candidate tags joined with ``;``),
      the latter sized as ceil((vocab_size + 1) ** 0.25).

    Training runs on the column corpus files ``train_<skf_split_no>`` /
    ``test_<skf_split_no>`` and writes the model, logs, and a weight-trace
    plot under ``resources/taggers/example-pos/it-<skf_split_no>``.

    :param data_folder: folder holding the column-corpus split files used to
        initialize the ColumnCorpus
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: which stratified 10-fold CV split (1..10) to train on
    """
    # 1. build the corpus from the column files of this split
    column_map = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    corpus: Corpus = ColumnCorpus(data_folder,
                                  column_map,
                                  train_file=f'train_{skf_split_no}',
                                  test_file=f'test_{skf_split_no}',
                                  dev_file=None)
    log.info(corpus)

    # 2. + 3. target tag type and its dictionary
    tag_type = 'pos'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)

    # 4. stack the contextual and one-hot embeddings
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        FlairEmbeddings('pl-backward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3),
    ])

    # 5. single-layer RNN tagger without CRF
    pos_tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=stacked,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=False,
                                                rnn_layers=1)

    # 6. + 7. train until stopped externally (max_epochs=sys.maxsize)
    trainer: ModelTrainer = ModelTrainer(pos_tagger, corpus)
    trainer.train(
        use_scratch_dir_if_available('resources/taggers/example-pos/it-' +
                                     str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)

    # 8. plot weight traces (optional)
    Plotter().plot_weights(
        use_scratch_dir_if_available('resources/taggers/example-pos/it-' +
                                     str(skf_split_no) + '/weights.txt'))