from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def fine_tune(base_model, corpus_dir, output_dir):

    # print stats
    print(f'Fine tuning base model: {base_model}')
    print(f'Corpus dir: {corpus_dir}')
    print(f'Output dir: {output_dir}')

    # instantiate an existing LM, such as one from the FlairEmbeddings
    language_model = FlairEmbeddings(base_model).lm

    # are you fine-tuning a forward or backward LM?
    is_forward_lm = language_model.is_forward_lm

    # get the dictionary from the existing language model
    dictionary: Dictionary = language_model.dictionary

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(output_dir,
                  sequence_length=100,
                  mini_batch_size=100,
                  learning_rate=20,
                  patience=10,
                  checkpoint=True)
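A minimal usage sketch for the helper above; the base model name and directory paths are placeholders, not part of the original example:

from flair.embeddings import FlairEmbeddings

# hypothetical invocation: fine-tune the stock 'news-forward' LM on a local corpus
fine_tune(base_model='news-forward',
          corpus_dir='resources/my_corpus',
          output_dir='resources/fine_tuned_lm')

# the fine-tuned weights can then be loaded like any other Flair LM
tuned_embeddings = FlairEmbeddings('resources/fine_tuned_lm/best-lm.pt')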
Example #2
def train_LM(file_path, model_path, is_forward_lm=True):
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    dictionary = Dictionary.load_from_file(file_path + 'mappings')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(file_path,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(dictionary,
                                   is_forward_lm,
                                   hidden_size=128,
                                   nlayers=1)

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(model_path,
                  sequence_length=100,
                  mini_batch_size=32,
                  max_epochs=10)
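train_LM() above expects a pickled character dictionary stored as 'mappings' inside file_path. A minimal sketch of how such a dictionary could be built and saved with Flair's Dictionary class, assuming a 'corpus/' layout with a train/train.txt split (the paths are placeholders):

from flair.data import Dictionary

# build a character dictionary from the training split and pickle it as 'mappings'
char_dictionary = Dictionary(add_unk=True)
with open('corpus/train/train.txt', 'r', encoding='utf-8') as f:
    for line in f:
        for character in line:
            char_dictionary.add_item(character)

char_dictionary.save('corpus/mappings')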
Example #3
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2,
                  checkpoint=True)
    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    shutil.rmtree(results_base_path)
Example #4
def test_training():
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus('resources/corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(
        language_model, corpus)
    trainer.train('./results',
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=5)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    print(sentence[1].embedding.size())

    # clean up results directory
    shutil.rmtree('./results', ignore_errors=True)
Example #5
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #6
    def ft(self):
        # fine-tune the document embedding if it is a Flair language model;
        # `corpus` is expected to be defined in the enclosing scope
        if isinstance(self.document_embedding, LanguageModel):
            trainer = LanguageModelTrainer(self.document_embedding, corpus)
            trainer.train('resources/taggers/language_model',
                          sequence_length=100,
                          mini_batch_size=100,
                          learning_rate=20,
                          patience=10,
                          checkpoint=True)
Example #7
def train_elmo(args):

    if args.finetune and args.checkpoint_path == '':
        print("finetune")
        from flair.embeddings import FlairEmbeddings
        language_model = FlairEmbeddings('he-forward').lm
        corpus: TextCorpus = TextCorpus(args.corpus_path,
                                        language_model.dictionary,
                                        language_model.is_forward_lm,
                                        character_level=True)
        trainer = LanguageModelTrainer(language_model, corpus)

    elif args.checkpoint_path == '' and not args.finetune:

        # training from scratch
        print('Training from scratch')

        # download the data if the corpus is not present locally
        if not os.path.exists(args.corpus_path):
            print('Corpus path:', args.corpus_path)
            download_corpus(args)

        language_model, corpus = create_corpus(args)
        trainer = LanguageModelTrainer(language_model, corpus)

    else:
        print("Training from checpoint")

        from pathlib import Path
        checkpoint = Path(args.checkpoint_path)
        if args.finetune:
            load_dict_from_lm = True
        else:
            load_dict_from_lm = False

        trainer = LanguageModelTrainer.load_from_checkpoint(
            checkpoint,
            create_corpus(args, load_dict_from_lm, return_back='corpus'))

    trainer.train(args.save_model,
                  sequence_length=args.seq_length,
                  mini_batch_size=args.mini_batch,
                  max_epochs=args.epochs,
                  checkpoint=args.checkpoint)
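train_elmo() reads several attributes from args; a hypothetical argparse definition that would satisfy it could look like the sketch below (flag names and defaults are assumptions inferred from the function body, not part of the original script):

import argparse

def parse_args():
    # hypothetical CLI matching the attributes train_elmo() accesses
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus_path', default='corpus/')
    parser.add_argument('--save_model', default='resources/lm')
    parser.add_argument('--checkpoint_path', default='')
    parser.add_argument('--finetune', action='store_true')
    parser.add_argument('--checkpoint', action='store_true')
    parser.add_argument('--seq_length', type=int, default=250)
    parser.add_argument('--mini_batch', type=int, default=100)
    parser.add_argument('--epochs', type=int, default=10)
    return parser.parse_args()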
Example #8
    def train(self) -> None:
        trainer = LanguageModelTrainer(self.lm, self.corpus)
        trainer.train(self.save_dir,
                      sequence_length=self.sequence_length,
                      mini_batch_size=self.mini_batch_size,
                      learning_rate=self.learning_rate,
                      patience=self.patience,
                      checkpoint=self.checkpoint,
                      write_weights=True,
                      use_tensorboard=True)
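The train() method above belongs to a wrapper object that is not shown. A minimal sketch of what such a wrapper might hold, with attribute names inferred from the method body (the class name and default hyperparameters are assumptions):

from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import TextCorpus

class LMFineTuner:
    # hypothetical container exposing the attributes train() reads
    def __init__(self, base_model, corpus_dir, save_dir):
        self.lm = FlairEmbeddings(base_model).lm
        self.corpus = TextCorpus(corpus_dir,
                                 self.lm.dictionary,
                                 self.lm.is_forward_lm,
                                 character_level=True)
        self.save_dir = save_dir
        self.sequence_length = 100
        self.mini_batch_size = 100
        self.learning_rate = 20
        self.patience = 10
        self.checkpoint = True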
Example #9
    def trainLanguage(self, corpusPath):
        self.corpus = TextCorpus(Path(corpusPath),
                                 self.dictionary,
                                 self.is_forward_lm,
                                 character_level=True)

        self.language_model = LanguageModel(self.dictionary,
                                            self.is_forward_lm,
                                            hidden_size=128,
                                            nlayers=10)

        self.trainer = LanguageModelTrainer(self.language_model, self.corpus)

        self.trainer.train('resources/taggers/language_model',
                           sequence_length=10,
                           mini_batch_size=10,
                           max_epochs=10)
Example #10
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load("chars")

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(
        resources_path / "corpora/lorem_ipsum",
        dictionary,
        language_model.is_forward_lm,
        character_level=True,
    )

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model,
                                                         corpus,
                                                         test_mode=True)
    trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
        checkpoint=True,
    )
    del trainer, language_model

    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer
Example #11
    def retrain_flair(cls,
                      corpus_path: str,
                      model_path_dest: str,
                      flair_algorithm: str = 'de-forward',
                      epochs: int = 10):
        use_embedding, algorithm = cls.determine_algorithm_from_string(
            flair_algorithm_string=flair_algorithm)
        # instantiate an existing LM, such as one from the FlairEmbeddings
        model = use_embedding(flair_algorithm)
        if algorithm == 'bert':
            language_model = model.model
        else:
            language_model = model.lm

        # are you fine-tuning a forward or backward LM?
        try:
            is_forward_lm = language_model.is_forward_lm
        except AttributeError:
            is_forward_lm = True

        # todo: no support for finetuning BERT with Flair Library for now
        # get the dictionary from the existing language model
        dictionary: Dictionary = language_model.dictionary

        # get your corpus, process forward and at the character level
        corpus = TextCorpus(corpus_path,
                            dictionary,
                            is_forward_lm,
                            character_level=True)

        # use the model trainer to fine-tune this model on your corpus
        trainer = LanguageModelTrainer(language_model, corpus)

        trainer.train(model_path_dest,
                      sequence_length=10,
                      mini_batch_size=10,
                      learning_rate=20,
                      max_epochs=epochs,
                      patience=10,
                      checkpoint=True)
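determine_algorithm_from_string() is not shown in this example. A hedged sketch of what it might return, assuming it simply maps the model-name string to an embedding class and a short tag (in current Flair, BERT-style models would come from TransformerWordEmbeddings):

from flair.embeddings import FlairEmbeddings, TransformerWordEmbeddings

def determine_algorithm_from_string(flair_algorithm_string: str):
    # hypothetical reconstruction: choose the embedding class from the name
    if 'bert' in flair_algorithm_string.lower():
        return TransformerWordEmbeddings, 'bert'
    return FlairEmbeddings, 'flair'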
Example #12
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def process(options):
    """
    Do the processing
    """

    # are you training a forward or backward LM?
    is_forward_lm = not options.is_backward_lm

    # load the default character dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(options.corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(
        dictionary,
        is_forward_lm,
        hidden_size=2048,
        nlayers=1,
        embedding_size=100,  # recommendations?
        dropout=0)  # dropout probs?

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(
        options.model_dir,  # embeddings_in_memory=False: effect on 'RuntimeError: CUDA out of memory'?
        sequence_length=250,
        learning_rate=20,
        mini_batch_size=100,
        anneal_factor=0.25,
        patience=22,  # 'patience' value of the learning rate scheduler: 1/2 training splits
        clip=0.25,  # clipping gradients?
        max_epochs=75)
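Every example here passes a plain directory to TextCorpus. Flair expects that directory to contain a train/ folder of split files plus valid.txt and test.txt. A small sketch that turns one raw text file into that layout (file names and split sizes are arbitrary assumptions):

import os

def make_corpus_dir(raw_text_file, corpus_dir, lines_per_split=100_000):
    # split a single raw text file into the layout TextCorpus expects:
    #   corpus_dir/train/train_split_N, corpus_dir/valid.txt, corpus_dir/test.txt
    with open(raw_text_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    os.makedirs(os.path.join(corpus_dir, 'train'), exist_ok=True)

    # hold out roughly 5% each for validation and test
    holdout = max(1, len(lines) // 20)
    valid, test, train = lines[:holdout], lines[holdout:2 * holdout], lines[2 * holdout:]

    with open(os.path.join(corpus_dir, 'valid.txt'), 'w', encoding='utf-8') as f:
        f.writelines(valid)
    with open(os.path.join(corpus_dir, 'test.txt'), 'w', encoding='utf-8') as f:
        f.writelines(test)

    for i in range(0, len(train), lines_per_split):
        split_file = os.path.join(corpus_dir, 'train', f'train_split_{i // lines_per_split}')
        with open(split_file, 'w', encoding='utf-8') as f:
            f.writelines(train[i:i + lines_per_split])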
Example #13
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #14
with open(os.path.join(tmp_path.name, "test.txt"), 'w') as f:
    f.writelines("\n".join(dev_set))

print("load original model")
language_model = FlairEmbeddings('fr-backward').lm
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

print("start training")
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/flair_ner/lm/ca_backward',
              sequence_length=100,
              mini_batch_size=100,
              learning_rate=20,
              patience=10,
              max_epochs=5,
              checkpoint=True)

print("load original model")
language_model = FlairEmbeddings('fr-forward').lm
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
Example #15
    item2idx = model.dictionary.item2idx
    print(item2idx["\n".encode()])

    inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1]
    inputs = [item2idx.get(char.encode(), 0) for char in inputs]
    inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
    inputs = inputs.to(device)

    print("# load corpus")
    corpus = TextCorpus(Path('corpus/'),
                        model.dictionary,
                        model.is_forward_lm,
                        character_level=True)

    print("# trainer")
    trainer = LanguageModelTrainer(model, corpus)

    print("# Generating characters with pretraned model")
    generate(model, inputs, hp.n_chars, f"{hp.output_dir}/0.out", device)

    print("# continue training the model on the new corpus")
    for epoch in range(1, hp.n_epochs):
        print(f"# epoch: {epoch}")
        print("training ..")
        trainer.train(f'{hp.ckpt_dir}', sequence_length=hp.seqlen, max_epochs=1)

        print("Generating ..")
        generate(model, inputs, hp.n_chars, f"{hp.output_dir}/{epoch}.out", device)

        print("Loading saved model")
        model = LanguageModel.load_language_model(f'{hp.ckpt_dir}/best-lm.pt')
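The generate() helper called throughout this last example is not included in the snippet. A simplified stand-in using Flair's own LanguageModel.generate_text(), which takes a string prefix instead of the index tensor built above (the function name, signature, and file handling are assumptions):

def generate(language_model, prefix_text, n_chars, out_path, device=None):
    # simplified sketch: sample n_chars characters continuing prefix_text and
    # write the result to out_path; 'device' is accepted only to mirror the
    # original call signature and is unused here
    text, log_likelihood = language_model.generate_text(prefix=prefix_text,
                                                        number_of_characters=n_chars)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return text, log_likelihood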