Example #1
def test_training():
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus('resources/corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(
        language_model, corpus)
    trainer.train('./results',
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=5)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    print(sentence[1].embedding.size())

    # clean up results directory
    shutil.rmtree('./results', ignore_errors=True)
Example #2
def fine_tune(base_model, corpus_dir, output_dir):

    # print stats
    print(f'Fine tuning base model: {base_model}')
    print(f'Corpus dir: {corpus_dir}')
    print(f'Output dir: {output_dir}')

    # instantiate an existing LM, such as one from the FlairEmbeddings
    language_model = FlairEmbeddings(base_model).lm

    # are you fine-tuning a forward or backward LM?
    is_forward_lm = language_model.is_forward_lm

    # get the dictionary from the existing language model
    dictionary: Dictionary = language_model.dictionary

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(output_dir,
                  sequence_length=100,
                  mini_batch_size=100,
                  learning_rate=20,
                  patience=10,
                  checkpoint=True)
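A minimal usage sketch for this helper; the base model name and both paths below are placeholder assumptions, not values from the original:

# hypothetical invocation: 'news-forward' is a stock Flair LM name;
# adapt the corpus and output paths to your setup
fine_tune(base_model='news-forward',
          corpus_dir='resources/corpora/my_corpus',
          output_dir='resources/taggers/fine_tuned_lm')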
Example #3
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2,
                  checkpoint=True)
    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    shutil.rmtree(results_base_path)
Example #4
def create_corpus(args, load_dict_from_lm=False, return_back='both'):
    if not load_dict_from_lm:
        dictionary: Dictionary = Dictionary.load(
            os.path.join(args.corpus_path, args.mapfile))

    else:
        print("loading dictionary from fine-tuned model")
        from flair.embeddings import FlairEmbeddings
        dictionary = FlairEmbeddings('he-forward').lm.dictionary

    language_model = LanguageModel(dictionary,
                                   args.is_forward_lm,
                                   hidden_size=args.hidden_size,
                                   nlayers=1)

    corpus = TextCorpus(args.corpus_path,
                        dictionary,
                        args.is_forward_lm,
                        character_level=True)
    if return_back == 'both':
        return language_model, corpus
    elif return_back == 'language_model':
        return language_model
    elif return_back == 'corpus':
        return corpus
    else:
        print("Specify what to return: 'both', 'language_model' or 'corpus'")
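A minimal sketch of how create_corpus() might be called; the Namespace fields mirror the attributes the function reads, and every value is a placeholder assumption:

from argparse import Namespace

# hypothetical arguments; adapt the paths and map file name to your setup
args = Namespace(corpus_path='resources/corpora/my_corpus',
                 mapfile='char_mappings',
                 is_forward_lm=True,
                 hidden_size=1024)
language_model, corpus = create_corpus(args)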
Example #5
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #6
def train_LM(file_path, model_path, is_forward_lm=True):
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    dictionary = Dictionary.load_from_file(file_path + 'mappings')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(file_path,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(dictionary,
                                   is_forward_lm,
                                   hidden_size=128,
                                   nlayers=1)

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(model_path,
                  sequence_length=100,
                  mini_batch_size=32,
                  max_epochs=10)
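As a usage sketch (paths are placeholder assumptions): Flair's TextCorpus expects the corpus folder to contain a train/ directory of split files plus valid.txt and test.txt, and here file_path must also hold the saved dictionary under the name 'mappings':

# hypothetical call; note the trailing slash, since the function
# concatenates file_path + 'mappings' to locate the dictionary
train_LM('resources/corpora/my_corpus/', 'resources/taggers/my_lm/')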
Example #7
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    dictionary = Dictionary.load('chars')
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        forward=True,
                        character_level=True)
    assert (corpus.test is not None)
    assert (corpus.train is not None)
    assert (corpus.valid is not None)
    assert (len(corpus.train) == 2)
Example #8
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    forward=True,
                                    character_level=True)

    assert (corpus.test is not None)
    assert (corpus.train_files is not None)
    assert (corpus.valid is not None)
    assert (len(corpus.train_files) == 2)
Example #9
    def trainLanguage(self, corpusPath):
        self.corpus = TextCorpus(Path(corpusPath),
                                 self.dictionary,
                                 self.is_forward_lm,
                                 character_level=True)

        self.language_model = LanguageModel(self.dictionary,
                                            self.is_forward_lm,
                                            hidden_size=128,
                                            nlayers=10)

        self.trainer = LanguageModelTrainer(self.language_model, self.corpus)

        self.trainer.train('resources/taggers/language_model',
                           sequence_length=10,
                           mini_batch_size=10,
                           max_epochs=10)
Example #10
def train_elmo(args):

    if args.finetune and args.checkpoint_path == '':
        print("finetune")
        from flair.embeddings import FlairEmbeddings
        language_model = FlairEmbeddings('he-forward').lm
        corpus: TextCorpus = TextCorpus(args.corpus_path,
                                        language_model.dictionary,
                                        language_model.is_forward_lm,
                                        character_level=True)
        trainer = LanguageModelTrainer(language_model, corpus)

    elif args.checkpoint_path == '' and not args.finetune:

        # training from scratch
        print('Training from scratch')

        # download the data if the corpus is not yet present
        if not os.path.exists(args.corpus_path):
            print('Corpus path:', args.corpus_path)
            download_corpus(args)

        language_model, corpus = create_corpus(args)
        trainer = LanguageModelTrainer(language_model, corpus)

    else:
        print("Training from checkpoint")

        from pathlib import Path
        checkpoint = Path(args.checkpoint_path)
        load_dict_from_lm = args.finetune

        trainer = LanguageModelTrainer.load_from_checkpoint(
            checkpoint,
            create_corpus(args, load_dict_from_lm, return_back='corpus'))

    trainer.train(args.save_model,
                  sequence_length=args.seq_length,
                  mini_batch_size=args.mini_batch,
                  max_epochs=args.epochs,
                  checkpoint=args.checkpoint)
Example #11
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load("chars")

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(
        resources_path / "corpora/lorem_ipsum",
        dictionary,
        language_model.is_forward_lm,
        character_level=True,
    )

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model,
                                                         corpus,
                                                         test_mode=True)
    trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
        checkpoint=True,
    )
    del trainer, language_model

    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer
Example #12
def process(options):
    """
    Do the processing
    """

    # are you training a forward or backward LM?
    is_forward_lm = not options.is_backward_lm

    # load the default character dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(options.corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(
        dictionary,
        is_forward_lm,
        hidden_size=2048,
        nlayers=1,
        embedding_size=100,  # recommendations?
        dropout=0)  # dropout probs?

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(
        options.model_dir,  # embeddings_in_memory=False: effect on 'RuntimeError: CUDA out of memory'?
        sequence_length=250,
        learning_rate=20,
        mini_batch_size=100,
        anneal_factor=0.25,
        patience=22,  # 'patience' value of the learning rate scheduler: 1/2 training splits
        clip=0.25,  # clipping gradients?
        max_epochs=75)
Example #13
    def retrain_flair(cls,
                      corpus_path: str,
                      model_path_dest: str,
                      flair_algorithm: str = 'de-forward',
                      epochs: int = 10):
        use_embedding, algorithm = cls.determine_algorithm_from_string(
            flair_algorithm_string=flair_algorithm)
        # instantiate an existing LM, such as one from the FlairEmbeddings
        model = use_embedding(flair_algorithm)
        if algorithm == 'bert':
            language_model = model.model
        else:
            language_model = model.lm

        # are you fine-tuning a forward or backward LM?
        try:
            is_forward_lm = language_model.is_forward_lm
        except AttributeError:
            is_forward_lm = True

        # TODO: fine-tuning BERT is not supported by the Flair library for now
        # get the dictionary from the existing language model
        dictionary: Dictionary = language_model.dictionary

        # get your corpus, process forward and at the character level
        corpus = TextCorpus(corpus_path,
                            dictionary,
                            is_forward_lm,
                            character_level=True)

        # use the model trainer to fine-tune this model on your corpus
        trainer = LanguageModelTrainer(language_model, corpus)

        trainer.train(model_path_dest,
                      sequence_length=10,
                      mini_batch_size=10,
                      learning_rate=20,
                      max_epochs=epochs,
                      patience=10,
                      checkpoint=True)
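A hypothetical invocation; the snippet does not show the enclosing class, so EmbeddingTrainer below is a stand-in name:

# hypothetical: fine-tune the German forward Flair LM on a domain corpus
EmbeddingTrainer.retrain_flair(corpus_path='resources/corpora/my_corpus',
                               model_path_dest='resources/taggers/de_domain_lm',
                               flair_algorithm='de-forward',
                               epochs=10)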
Example #14
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(
        str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #15
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus('corpus', dictionary, is_forward_lm, character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)
#language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=1000,
              patience=25,
              num_workers=8)
#trainer.train('resources/taggers/language_model', sequence_length=10, mini_batch_size=10, max_epochs=10)
Example #16
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
import pickle

# are you training a forward or backward LM?
is_forward_lm = True

dictionary = Dictionary.load_from_file('/home/anna/Desktop/markup/learning/dictionary/dict')

# get your corpus, process forward and at the character level
corpus = TextCorpus('/home/anna/Desktop/markup/learning', dictionary, is_forward_lm, character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model', sequence_length=10, mini_batch_size=10, max_epochs=10)
Example #17
    parser.add_argument('-m', '--model_path', type=str, help='path to model,logs and checkpoints')
    parser.add_argument('-o', '--options_file', type=str, help='file with parameters')
    args = parser.parse_args()

    # import options
    try:
        options = importlib.import_module(args.options_file).options
    except ImportError as err:
        print('Error:', err)

    # instantiate an existing LM, such as one from the FlairEmbeddings
    language_model = FlairEmbeddings(args.pretrained_model).lm

    # are you fine-tuning a forward or backward LM?
    is_forward_lm = language_model.is_forward_lm

    # get the dictionary from the existing language model
    dictionary = language_model.dictionary

    # instantiate corpus
    corpus = TextCorpus(Path(args.corpus_path),
                        dictionary,
                        is_forward_lm,
                        **options['corpus'])

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.log_interval = 500

    trainer.train(Path(args.model_path), **options['training'])
Example #18
if direction == "fwd":
    is_forward_lm = True

elif direction == "bwd":
    is_forward_lm = False

## load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

## get your corpus, process forward and at the character level
prepare_mesinesp_for_flair_embeds_training()  # prepare raw text from Spanish PubMed abstracts for training
mesinesp_subset = sys.argv[1]
corpus_path = "./data/datasets/mesinesp/" + str(mesinesp_subset) + "/"

corpus = TextCorpus(corpus_path,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

## instantiate your language model, set hidden size and number of layers (hidden_size=1024 for a small model, hidden_size=2048 for a large model)
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=1024,
                               nlayers=1,
                               dropout=0.1)

## train your language model
trainer = LanguageModelTrainer(language_model, corpus)

# trainer.num_workers = 4
# Flair auto-detects whether a GPU is available; if there is one, training runs there automatically.
output_dir = str()
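The snippet ends before the actual training call; a sketch of the usual continuation, in line with the other examples here (the output path and all hyperparameter values are assumptions):

# hypothetical continuation of the truncated example
output_dir = "resources/taggers/mesinesp_" + ("fwd" if is_forward_lm else "bwd")
trainer.train(output_dir,
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=10)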
Example #19
    # import options
    try:
        options = importlib.import_module(args.options_file).options
    except ImportError as err:
        print('Error:', err)

    # load the default character dictionary
    # TODO: add support for other dictionaries!
    # (https://github.com/zalandoresearch/flair/issues/179#issuecomment-433942853)
    print("loading Dictionary")
    dictionary = Dictionary.load('chars')
    # instantiate corpus
    log.info("Making corpus from folder: {}".format(args.corpus_path))
    corpus = TextCorpus(args.corpus_path,
                        dictionary,
                        options['is_forward_lm'],
                        **options['corpus'])

    # TRAINING
    if args.continue_training:
        # load checkpoint
        cp_path = args.train_path + '/checkpoint.pt'
        log.info("Continue training from {}".format(cp_path))
        # load LM-Trainer
        trainer = LanguageModelTrainer.load_from_checkpoint(cp_path, corpus)
    else:
        # instantiate language model
        log.info("Creating language model")
        language_model = LanguageModel(dictionary,
                                       options['is_forward_lm'],
                                       **options['language_model'])
Example #20
    news_forward = FlairEmbeddings('news-forward')
    model = news_forward.lm
    model.to(device)

    print("# load input data")
    item2idx = model.dictionary.item2idx
    print(item2idx["\n".encode()])

    inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1]
    inputs = [item2idx.get(char.encode(), 0) for char in inputs]
    inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
    inputs = inputs.to(device)

    print("# load corpus")
    corpus = TextCorpus(Path('corpus/'),
                        model.dictionary,
                        model.is_forward_lm,
                        character_level=True)

    print("# trainer")
    trainer = LanguageModelTrainer(model, corpus)

    print("# Generating characters with pretraned model")
    generate(model, inputs, hp.n_chars, f"{hp.output_dir}/0.out", device)

    print("# continue training the model on the new corpus")
    for epoch in range(1, hp.n_epochs):
        print(f"# epoch: {epoch}")
        print("training ..")
        trainer.train(f'{hp.ckpt_dir}', sequence_length=hp.seqlen, max_epochs=1)

        print("Generating ..")
Example #21
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# Script to train Flair LMs

# are you training a forward or backward LM?
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus('/data/crawl/corpus',
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_models/fwd',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=5000)
Example #22
# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')
"""
# get your corpus, process forward and at the character level, then dump to harddisk
"""
# load joblib dump to memory

if Path(MODEL_PATHLIB / 'corpus.flair').is_file():
    logger.info('corpus found')
    logger.info('now loading the corpus')
    corpus = joblib.load(MODEL_PATHLIB / 'corpus.flair')
else:
    logger.info('making new corpus')
    corpus = TextCorpus('/root/.fastai/data/idwiki/',
                        dictionary,
                        is_forward_lm,
                        character_level=True)
    logger.info('serializing corpus')
    joblib.dump(corpus, '../flair_models/backwards/corpus.flair')
    logger.info('saving the corpus to ../flair_models')

logger.info('loading corpus done, now creating language model')
# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

if Path(MODEL_PATHLIB / 'checkpoint.pt').is_file():
    logger.info('checkpoint detected, resuming training')
    trainer = LanguageModelTrainer.load_from_checkpoint(
        MODEL_PATHLIB / 'checkpoint.pt', corpus)
Example #23
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
import torch.nn as nn
import torch

# are you training a forward or backward LM?
is_forward_lm = False

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level

corpus = TextCorpus('./resources/tasks/wmt11_sub',
                    dictionary,
                    is_forward_lm,
                    character_level=True,
                    subword=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model_sub_back',
              sequence_length=250,
              mini_batch_size=100,
Example #24
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from util import data_path, flair_datapath, train_flair_datapath

# instantiate an existing LM, such as one from the FlairEmbeddings
language_model = FlairEmbeddings("id-forward").lm

# are you fine-tuning a forward or backward LM?
is_forward_lm = language_model.is_forward_lm

# get the dictionary from the existing language model
dictionary: Dictionary = language_model.dictionary

# get your corpus, process forward and at the character level
corpus = TextCorpus(flair_datapath,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# use the model trainer to fine-tune this model on your corpus
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train(
    "models/",
    sequence_length=108,  # max(len(tweets))
    mini_batch_size=100,
    learning_rate=20,
    patience=10,
    checkpoint=True,
)
Example #25
from pathlib import Path

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
### NOTE: you have to train forward and backward separately ###
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus(Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/'),
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=50)
Example #26
with open(os.path.join(tmp_path.name, "train.txt"), 'w') as f:
    f.writelines("\n".join(l))

with open(os.path.join(tmp_path.name, "valid.txt"), 'w') as f:
    f.writelines("\n".join(dev_set))

with open(os.path.join(tmp_path.name, "test.txt"), 'w') as f:
    f.writelines("\n".join(dev_set))

print("load original model")
language_model = FlairEmbeddings('fr-backward').lm
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

print("start training")
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/flair_ner/lm/ca_backward',
              sequence_length=100,
              mini_batch_size=100,
              learning_rate=20,
              patience=10,
              max_epochs=5,
              checkpoint=True)

print("load original model")
language_model = FlairEmbeddings('fr-forward').lm
Example #27
    def _define_corpus(self) -> TextCorpus:
        return TextCorpus(
            self.corpus_dir,  # '/path/to/your/corpus'
            self.dictionary,
            self.is_forward_lm,
            character_level=True)
Example #28
File: train.py  Project: TanHM-1211/Flair
# are you training a forward or backward LM?
is_forward_lm = True
suffix = 'forward' if is_forward_lm else 'backward'

# load the character dictionary
dictionary: Dictionary = Dictionary()
for i in vn_char:
    dictionary.add_item(i)

# get your corpus, process forward and at the character level
if os.path.isfile('/mnt/disk1/tan_hm/saved_corpus.pkl'):
    with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'rb') as f:
        corpus = pickle.load(f)
else:
    corpus = TextCorpus('/mnt/disk1/tan_hm/corpus',
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f, protocol=pickle.HIGHEST_PROTOCOL)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

trainer = LanguageModelTrainer(language_model, corpus)


trainer.train('/mnt/disk1/tan_hm/Flair_language_model_' + suffix,
Example #29
# https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md

from pathlib import Path
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus(Path('patent_output/emb_texts/large_corpus/'),
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model_large_corpus',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=10)
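Once training finishes, the saved weights can be loaded back as contextual string embeddings for downstream tasks; a brief sketch, assuming the run above produced best-lm.pt under its output folder:

from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

# load the trained LM as embeddings and embed an example sentence
patent_embeddings = FlairEmbeddings('resources/taggers/language_model_large_corpus/best-lm.pt')
sentence = Sentence('I love Berlin')
patent_embeddings.embed(sentence)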