Example #1
import shutil

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2,
                  checkpoint=True)
    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    shutil.rmtree(results_base_path)
Example #2
def train_LM(file_path, model_path, is_forward_lm=True):
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    dictionary = Dictionary.load_from_file(file_path + 'mappings')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(file_path,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(dictionary,
                                   is_forward_lm,
                                   hidden_size=128,
                                   nlayers=1)

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(model_path,
                  sequence_length=100,
                  mini_batch_size=32,
                  max_epochs=10)
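For reference, TextCorpus expects the corpus directory to contain a train/ folder of split files plus valid.txt and test.txt, and this helper additionally looks for a character mapping next to them. A minimal, hypothetical invocation (all paths are placeholders):

# hypothetical usage of train_LM; 'data/my_corpus/' should hold train/ splits,
# valid.txt, test.txt, and the dictionary file 'data/my_corpus/mappings'
train_LM('data/my_corpus/', 'models/my_charlm/', is_forward_lm=True)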
Example #3
def fine_tune(base_model, corpus_dir, output_dir):
    from flair.data import Dictionary
    from flair.embeddings import FlairEmbeddings
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    # print stats
    print(f'Fine tuning base model: {base_model}')
    print(f'Corpus dir: {corpus_dir}')
    print(f'Output dir: {output_dir}')

    # instantiate an existing LM, such as one from the FlairEmbeddings
    language_model = FlairEmbeddings(base_model).lm

    # are you fine-tuning a forward or backward LM?
    is_forward_lm = language_model.is_forward_lm

    # get the dictionary from the existing language model
    dictionary: Dictionary = language_model.dictionary

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(output_dir,
                  sequence_length=100,
                  mini_batch_size=100,
                  learning_rate=20,
                  patience=10,
                  checkpoint=True)
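A hedged usage sketch: 'de-forward' is one of the stock FlairEmbeddings model ids, and both directory paths below are placeholders:

# hypothetical call: fine-tune the stock German forward LM on a local corpus
fine_tune('de-forward', 'data/my_corpus/', 'models/de-finetuned/')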
Example #4
    def ft(self):
        if isinstance(self.document_embedding, LanguageModel):
            # NOTE: `corpus` must be provided by the enclosing scope in the original source
            trainer = LanguageModelTrainer(self.document_embedding, corpus)
            trainer.train('resources/taggers/language_model',
                          sequence_length=100,
                          mini_batch_size=100,
                          learning_rate=20,
                          patience=10,
                          checkpoint=True)
Example #5
    def train(self) -> None:
        trainer = LanguageModelTrainer(self.lm, self.corpus)
        trainer.train(self.save_dir,
                      sequence_length=self.sequence_length,
                      mini_batch_size=self.mini_batch_size,
                      learning_rate=self.learning_rate,
                      patience=self.patience,
                      checkpoint=self.checkpoint,
                      write_weights=True,
                      use_tensorboard=True)
Example #6
def train_elmo(args):

    if args.finetune and args.checkpoint_path == '':
        print("finetune")
        from flair.embeddings import FlairEmbeddings
        language_model = FlairEmbeddings('he-forward').lm
        corpus: TextCorpus = TextCorpus(args.corpus_path,
                                        language_model.dictionary,
                                        language_model.is_forward_lm,
                                        character_level=True)
        trainer = LanguageModelTrainer(language_model, corpus)

    elif args.checkpoint_path == '' and not args.finetune:

        # Training from scratch
        print('Training from scratch')

        # Downloading data
        if not os.path.exists(args.corpus_path):
            print('Corpus path', args.corpus_path)
            download_corpus(args)

        language_model, corpus = create_corpus(args)
        trainer = LanguageModelTrainer(language_model, corpus)

    else:
        print("Training from checpoint")

        from pathlib import Path
        checkpoint = Path(args.checkpoint_path)
        load_dict_from_lm = args.finetune

        trainer = LanguageModelTrainer.load_from_checkpoint(
            checkpoint,
            create_corpus(args, load_dict_from_lm, return_back='corpus'))

    trainer.train(args.save_model,
                  sequence_length=args.seq_length,
                  mini_batch_size=args.mini_batch,
                  max_epochs=args.epochs,
                  checkpoint=args.checkpoint)
Example #7
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def process(options):
    """
    Do the processing
    """

    # are you training a forward or backward LM?
    is_forward_lm = not options.is_backward_lm

    # load the default character dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(options.corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(
        dictionary,
        is_forward_lm,
        hidden_size=2048,
        nlayers=1,
        embedding_size=100,  # recommendations?
        dropout=0)  # dropout probs?

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(
        options.model_dir,  # embeddings_in_memory=False: effect on 'RuntimeError: CUDA out of memory'?
        sequence_length=250,
        learning_rate=20,
        mini_batch_size=100,
        anneal_factor=0.25,
        patience=22,  # 'patience' value of the learning rate scheduler: 1/2 training splits
        clip=0.25,  # clipping gradients?
        max_epochs=75)
Example #8
    def retrain_flair(cls,
                      corpus_path: str,
                      model_path_dest: str,
                      flair_algorithm: str = 'de-forward',
                      epochs: int = 10):
        use_embedding, algorithm = cls.determine_algorithm_from_string(
            flair_algorithm_string=flair_algorithm)
        # instantiate an existing LM, such as one from the FlairEmbeddings
        model = use_embedding(flair_algorithm)
        if algorithm == 'bert':
            language_model = model.model
        else:
            language_model = model.lm

        # are you fine-tuning a forward or backward LM?
        try:
            is_forward_lm = language_model.is_forward_lm
        except AttributeError:
            is_forward_lm = True

        # todo: no support for finetuning BERT with Flair Library for now
        # get the dictionary from the existing language model
        dictionary: Dictionary = language_model.dictionary

        # get your corpus, process forward and at the character level
        corpus = TextCorpus(corpus_path,
                            dictionary,
                            is_forward_lm,
                            character_level=True)

        # use the model trainer to fine-tune this model on your corpus
        trainer = LanguageModelTrainer(language_model, corpus)

        trainer.train(model_path_dest,
                      sequence_length=10,
                      mini_batch_size=10,
                      learning_rate=20,
                      max_epochs=epochs,
                      patience=10,
                      checkpoint=True)
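retrain_flair is a classmethod on a class not shown in this excerpt; assuming a hypothetical owner class called EmbeddingRetrainer, a call might read:

# hypothetical: EmbeddingRetrainer stands in for the (unshown) class that
# defines retrain_flair; paths are placeholders
EmbeddingRetrainer.retrain_flair(corpus_path='data/my_corpus/',
                                 model_path_dest='models/de-finetuned/',
                                 flair_algorithm='de-forward',
                                 epochs=5)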
Example #9
from pathlib import Path

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


class trainLanguage(object):
    def __init__(self, charPath, is_forward=True):
        self.is_forward_lm = is_forward
        self.dictionary: Dictionary = Dictionary.load(charPath)

    def trainLanguage(self, corpusPath):
        self.corpus = TextCorpus(Path(corpusPath),
                                 self.dictionary,
                                 self.is_forward_lm,
                                 character_level=True)

        self.language_model = LanguageModel(self.dictionary,
                                            self.is_forward_lm,
                                            hidden_size=128,
                                            nlayers=10)

        self.trainer = LanguageModelTrainer(self.language_model, self.corpus)

        self.trainer.train('resources/taggers/language_model',
                           sequence_length=10,
                           mini_batch_size=10,
                           max_epochs=10)
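A brief usage sketch for this wrapper ('chars' loads Flair's bundled character dictionary; the corpus path is a placeholder):

# hypothetical usage of the trainLanguage wrapper
lm_wrapper = trainLanguage('chars', is_forward=True)
lm_wrapper.trainLanguage('data/my_corpus/')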
Example #10
import shutil

from flair.data import Dictionary, Sentence
from flair.embeddings import FlairEmbeddings
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(
        str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    (text, likelihood) = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #11
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
### NOTE: you have to train forward and backward separately ###
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus(Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/'),
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=50)
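When training finishes, the trainer writes the best model to best-lm.pt in the output directory; that checkpoint can then back a FlairEmbeddings instance, for example:

# wrap the freshly trained LM as contextual string embeddings
from flair.embeddings import FlairEmbeddings
char_lm_embeddings = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')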
Example #12
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus('corpus', dictionary, is_forward_lm, character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)
#language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=1000,
              patience=25,
              num_workers=8)
#trainer.train('resources/taggers/language_model', sequence_length=10, mini_batch_size=10, max_epochs=10)
Example #13
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from util import data_path, flair_datapath, train_flair_datapath

# instantiate an existing LM, such as one from the FlairEmbeddings
language_model = FlairEmbeddings("id-forward").lm

# are you fine-tuning a forward or backward LM?
is_forward_lm = language_model.is_forward_lm

# get the dictionary from the existing language model
dictionary: Dictionary = language_model.dictionary

# get your corpus, process forward and at the character level
corpus = TextCorpus(flair_datapath,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# use the model trainer to fine-tune this model on your corpus
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train(
    "models/",
    sequence_length=108,  # max(len(tweets))
    mini_batch_size=100,
    learning_rate=20,
    patience=10,
    checkpoint=True,
)
Example #14
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
is_forward_lm = False
dictionary: Dictionary = Dictionary.load('chars')
corpus = TextCorpus('FLAIR/corpus',
                    dictionary,
                    is_forward_lm,
                    character_level=True)

language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=1024,
                               nlayers=2)

trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('FLAIR/resources/taggers/language_model_backward',
              sequence_length=50,
              mini_batch_size=50,
              learning_rate=10,
              patience=3,
              max_epochs=50,
              checkpoint=True)
Example #15
# reconstructed branch (assumption): choose the fwd/bwd output dir
if is_forward_lm:
    if not os.path.exists('./trained_embeddings/' + str(mesinesp_subset) +
                          '/fwd/'):
        os.makedirs('./trained_embeddings/' + str(mesinesp_subset) + '/fwd/')

    output_dir = './trained_embeddings/' + str(mesinesp_subset) + '/fwd/'

else:
    if not os.path.exists('./trained_embeddings/' + str(mesinesp_subset) +
                          '/bwd/'):
        os.makedirs('./trained_embeddings/' + str(mesinesp_subset) + '/bwd/')

    output_dir = './trained_embeddings/' + str(mesinesp_subset) + '/bwd/'

trainer.train(output_dir,
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=2000,
              patience=25,
              checkpoint=True)

## To see the training process:
#from flair.visual.training_curves import Plotter
#plotter = Plotter()

#if is_forward_lm:
#    plotter.plot_training_curves('resources/taggers/fwd_embeds/loss.tsv')
#    plotter.plot_weights('resources/taggers/fwd_embeds/weights.txt')
#else:
#    plotter.plot_training_curves('resources/taggers/bwd_embeds/loss.tsv')
#    plotter.plot_weights('resources/taggers/bwd_embeds/weights.txt')

print("Total time (aprox.):", int((time.time() - start_time)),
Example #16
    print(item2idx["\n".encode()])

    inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1]
    inputs = [item2idx.get(char.encode(), 0) for char in inputs]
    inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
    inputs = inputs.to(device)

    print("# load corpus")
    corpus = TextCorpus(Path('corpus/'),
                        model.dictionary,
                        model.is_forward_lm,
                        character_level=True)

    print("# trainer")
    trainer = LanguageModelTrainer(model, corpus)

    print("# Generating characters with pretraned model")
    generate(model, inputs, hp.n_chars, f"{hp.output_dir}/0.out", device)

    print("# continue training the model on the new corpus")
    for epoch in range(1, hp.n_epochs):
        print(f"# epoch: {epoch}")
        print("training ..")
        trainer.train(f'{hp.ckpt_dir}', sequence_length=hp.seqlen, max_epochs=1)

        print("Generating ..")
        generate(model, inputs, hp.n_chars, f"{hp.output_dir}/{epoch}.out", device)

        print("Loading saved model")
        model = LanguageModel.load_language_model(f'{hp.ckpt_dir}/best-lm.pt')
        model.to(device)
Example #17
    # TODO: add support for other dictionaries!
    # (https://github.com/zalandoresearch/flair/issues/179#issuecomment-433942853)
    print("loading Dictionary")
    dictionary = Dictionary.load('chars')
    # instantiate corpus
    log.info("Making corpus from folder: {}".format(args.corpus_path))
    corpus = TextCorpus(args.corpus_path,
                        dictionary,
                        options['is_forward_lm'],
                        **options['corpus'])

    # TRAINING
    if args.continue_training:
        # load checkpoint
        cp_path = args.train_path + '/checkpoint.pt'
        log.info("Continue training from {}".format(cp_path))
        # load LM-Trainer
        trainer = LanguageModelTrainer.load_from_checkpoint(cp_path, corpus)
    else:
        # instantiate language model
        log.info("Creating language model")
        language_model = LanguageModel(dictionary,
                                       options['is_forward_lm'],
                                       **options['language_model'])
        # instantiate LM Trainer
        trainer = LanguageModelTrainer(language_model, corpus)

    log.info("Starting training. See {}".format(args.train_path))
    trainer.log_interval = 500
    trainer.train(args.train_path, **options['training'])
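The options module these scripts import is not shown; a hypothetical minimal version, keyed the way the code indexes it, might look like:

# hypothetical options module; all values are illustrative only
options = {
    'is_forward_lm': True,
    'corpus': {'character_level': True},
    'language_model': {'hidden_size': 1024, 'nlayers': 1},
    'training': {'sequence_length': 250, 'mini_batch_size': 100, 'max_epochs': 10},
}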
Example #18
    parser.add_argument('-m', '--model_path', type=str, help='path to model,logs and checkpoints')
    parser.add_argument('-o', '--options_file', type=str, help='file with parameters')
    args = parser.parse_args()

    # import options
    try:
        options = importlib.import_module(args.options_file).options
    except ImportError as err:
        print('Error:', err)

    # instantiate an existing LM, such as one from the FlairEmbeddings
    language_model = FlairEmbeddings(args.pretrained_model).lm

    # are you fine-tuning a forward or backward LM?
    is_forward_lm = language_model.is_forward_lm

    # get the dictionary from the existing language model
    dictionary = language_model.dictionary

    # instantiate corpus
    corpus = TextCorpus(Path(args.corpus_path),
                        dictionary,
                        is_forward_lm,
                        **options['corpus'])

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.log_interval = 500

    trainer.train(Path(args.model_path), **options['training'])
Example #19
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

print("start training")
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/flair_ner/lm/ca_backward',
              sequence_length=100,
              mini_batch_size=100,
              learning_rate=20,
              patience=10,
              max_epochs=5,
              checkpoint=True)

print("load original model")
language_model = FlairEmbeddings('fr-forward').lm
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name,
                    dictionary,
                    is_forward_lm,
                    character_level=True)
Example #20
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from flair.embeddings import FlairEmbeddings
dictionary: Dictionary = Dictionary.load('chars')
#dictionary: Dictionary = language_model.dictionary
language_model = FlairEmbeddings('pubmed-forward').lm
# get your corpus, process forward and at the character level
is_forward_lm = True

corpus = TextCorpus('/content/corpus',
                    dictionary,
                    is_forward_lm,
                    character_level=True)

trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('/content/language_model',
              sequence_length=10,
              mini_batch_size=10,
              max_epochs=10)
Example #21
# reconstructed guard (assumption): reuse the serialized corpus when present
if os.path.exists('../flair_models/backwards/corpus.flair'):
    corpus = joblib.load('../flair_models/backwards/corpus.flair')
else:
    corpus = TextCorpus('/root/.fastai/data/idwiki/',
                        dictionary,
                        is_forward_lm,
                        character_level=True)
    logger.info('serializing corpus')
    joblib.dump(corpus, '../flair_models/backwards/corpus.flair')
    logger.info('saving the corpus to ../flair_models')

logger.info('loading corpus done, now creating language model')
# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

if Path(MODEL_PATHLIB / 'checkpoint.pt').is_file():
    logger.info('checkpoint detected, resuming training')
    trainer = LanguageModelTrainer.load_from_checkpoint(
        MODEL_PATHLIB / 'checkpoint.pt', corpus)
else:
    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

logger.info('we have lift off, good luck ground control')
trainer.train(MODEL_PATH,
              learning_rate=0.1,
              sequence_length=250,
              mini_batch_size=650,
              max_epochs=100,
              checkpoint=True)
Example #22
# reconstructed guard (assumption): load the pickled corpus when it exists
if os.path.exists('/mnt/disk1/tan_hm/saved_corpus.pkl'):
    with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'rb') as f:
        corpus = pickle.load(f)
else:
    corpus = TextCorpus('/mnt/disk1/tan_hm/corpus',
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f, protocol=pickle.HIGHEST_PROTOCOL)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

trainer = LanguageModelTrainer(language_model, corpus)


trainer.train('/mnt/disk1/tan_hm/Flair_language_model_' + suffix,
              sequence_length=256,
              mini_batch_size=200,
              max_epochs=100,
              learning_rate=5,
              clip=0.5,
              patience=10,
              checkpoint=True,
              num_workers=4)


Example #23
    # get the dictionary from the existing language model
    dictionary: Dictionary = language_model.dictionary

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train('resources/taggers/language_model',
                  sequence_length=100,
                  mini_batch_size=100,
                  learning_rate=20,
                  patience=10,
                  checkpoint=True)

    # flair_instance = FlairEmbedding(word_embedding_base='glove', document_embedding='pool')
    # print(flair_instance.embedd_document(doc))
    #
    # flair_instance = FlairEmbedding(word_embedding_base='de', document_embedding='pool')
    # print(flair_instance.embedd_document(doc))
    #
    # flair_instance = FlairEmbedding(word_embedding_base='en', document_embedding='pool')
    # print(flair_instance.embedd_document(doc))
    #
    # flair_instance = FlairEmbedding(word_embedding_base='glove', document_embedding='rnn')
    # print(flair_instance.embedd_document(doc))
    #