import shutil

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)

    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2, checkpoint=True)

    # resume training from the saved checkpoint
    trainer = LanguageModelTrainer.load_from_checkpoint(results_base_path / 'checkpoint.pt', corpus)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    shutil.rmtree(results_base_path)
def train_LM(file_path, model_path, is_forward_lm=True):
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    # load the character dictionary stored alongside the corpus
    dictionary = Dictionary.load_from_file(file_path + 'mappings')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(file_path, dictionary, is_forward_lm, character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(model_path, sequence_length=100, mini_batch_size=32, max_epochs=10)
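# A minimal usage sketch for train_LM above; the paths are hypothetical, and it
# assumes the directory follows Flair's corpus layout (a 'train/' folder of
# split files plus 'valid.txt' and 'test.txt'), with the serialized character
# dictionary reachable as file_path + 'mappings':
train_LM('data/my_corpus/', 'models/my_lm/', is_forward_lm=True)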
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


def fine_tune(base_model, corpus_dir, output_dir):
    # print stats
    print(f'Fine-tuning base model: {base_model}')
    print(f'Corpus dir: {corpus_dir}')
    print(f'Output dir: {output_dir}')

    # instantiate an existing LM, such as one from the FlairEmbeddings
    language_model = FlairEmbeddings(base_model).lm

    # are you fine-tuning a forward or backward LM?
    is_forward_lm = language_model.is_forward_lm

    # get the dictionary from the existing language model
    dictionary: Dictionary = language_model.dictionary

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(corpus_dir, dictionary, is_forward_lm, character_level=True)

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(output_dir,
                  sequence_length=100,
                  mini_batch_size=100,
                  learning_rate=20,
                  patience=10,
                  checkpoint=True)
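# A usage sketch for fine_tune; 'news-forward' is a standard Flair embeddings
# identifier, while the corpus and output directories are placeholders:
fine_tune('news-forward', 'data/my_corpus', 'models/news-forward-finetuned')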
def ft(self, corpus):
    # fine-tune only when the document embedding is itself a language model;
    # the corpus to fine-tune on must be supplied by the caller
    if isinstance(self.document_embedding, LanguageModel):
        trainer = LanguageModelTrainer(self.document_embedding, corpus)
        trainer.train('resources/taggers/language_model',
                      sequence_length=100,
                      mini_batch_size=100,
                      learning_rate=20,
                      patience=10,
                      checkpoint=True)
def train(self) -> None:
    trainer = LanguageModelTrainer(self.lm, self.corpus)
    trainer.train(self.save_dir,
                  sequence_length=self.sequence_length,
                  mini_batch_size=self.mini_batch_size,
                  learning_rate=self.learning_rate,
                  patience=self.patience,
                  checkpoint=self.checkpoint,
                  write_weights=True,
                  use_tensorboard=True)
def train_elmo(args):
    if args.finetune and args.checkpoint_path == '':
        print('Fine-tuning an existing language model')
        from flair.embeddings import FlairEmbeddings

        language_model = FlairEmbeddings('he-forward').lm
        corpus: TextCorpus = TextCorpus(args.corpus_path,
                                        language_model.dictionary,
                                        language_model.is_forward_lm,
                                        character_level=True)
        trainer = LanguageModelTrainer(language_model, corpus)
    elif args.checkpoint_path == '' and not args.finetune:
        # training from scratch
        print('Training from scratch')
        # download the data if the corpus is not present yet
        if not os.path.exists(args.corpus_path):
            print('Corpus path:', args.corpus_path)
            download_corpus(args)
        language_model, corpus = create_corpus(args)
        trainer = LanguageModelTrainer(language_model, corpus)
    else:
        print('Training from checkpoint')
        from pathlib import Path

        checkpoint = Path(args.checkpoint_path)
        # when fine-tuning, the dictionary comes from the loaded language model
        load_dict_from_lm = args.finetune
        trainer = LanguageModelTrainer.load_from_checkpoint(
            checkpoint,
            create_corpus(args, load_dict_from_lm, return_back='corpus'))

    trainer.train(args.save_model,
                  sequence_length=args.seq_length,
                  mini_batch_size=args.mini_batch,
                  max_epochs=args.epochs,
                  checkpoint=args.checkpoint)
def process(options):
    """Do the processing."""
    # are you training a forward or backward LM?
    is_forward_lm = not options.is_backward_lm

    # load the default character dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(options.corpus_dir, dictionary, is_forward_lm, character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(
        dictionary,
        is_forward_lm,
        hidden_size=2048,
        nlayers=1,
        embedding_size=100,  # recommendations?
        dropout=0)  # dropout probs?

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(
        options.model_dir,
        # embeddings_in_memory=False: effect on 'RuntimeError: CUDA out of memory'?
        sequence_length=250,
        learning_rate=20,
        mini_batch_size=100,
        anneal_factor=0.25,
        patience=22,  # 'patience' value of the learning rate scheduler: 1/2 training splits
        clip=0.25,  # clipping gradients?
        max_epochs=75)
@classmethod
def retrain_flair(cls,
                  corpus_path: str,
                  model_path_dest: str,
                  flair_algorithm: str = 'de-forward',
                  epochs: int = 10):
    use_embedding, algorithm = cls.determine_algorithm_from_string(
        flair_algorithm_string=flair_algorithm)

    # instantiate an existing LM, such as one from the FlairEmbeddings
    model = use_embedding(flair_algorithm)
    if algorithm == 'bert':
        language_model = model.model
    else:
        language_model = model.lm

    # are you fine-tuning a forward or backward LM?
    try:
        is_forward_lm = language_model.is_forward_lm
    except AttributeError:
        # TODO: no support for fine-tuning BERT with the Flair library for now
        is_forward_lm = True

    # get the dictionary from the existing language model
    dictionary: Dictionary = language_model.dictionary

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(corpus_path, dictionary, is_forward_lm, character_level=True)

    # use the model trainer to fine-tune this model on your corpus
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(model_path_dest,
                  sequence_length=10,
                  mini_batch_size=10,
                  learning_rate=20,
                  max_epochs=epochs,
                  patience=10,
                  checkpoint=True)
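# A hedged usage sketch: retrain_flair is a classmethod, so it is invoked on
# its owning class; 'FlairRetrainer' is a hypothetical name for that class and
# the paths are placeholders:
FlairRetrainer.retrain_flair(corpus_path='data/my_corpus',
                             model_path_dest='models/de-forward-finetuned',
                             flair_algorithm='de-forward',
                             epochs=10)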
class trainLanguage(object):

    def __init__(self, charPath, is_forward=True):
        self.is_forward_lm = is_forward
        self.dictionary: Dictionary = Dictionary.load(charPath)

    def trainLanguage(self, corpusPath):
        self.corpus = TextCorpus(Path(corpusPath),
                                 self.dictionary,
                                 self.is_forward_lm,
                                 character_level=True)
        self.language_model = LanguageModel(self.dictionary,
                                            self.is_forward_lm,
                                            hidden_size=128,
                                            nlayers=10)
        self.trainer = LanguageModelTrainer(self.language_model, self.corpus)
        self.trainer.train('resources/taggers/language_model',
                           sequence_length=10,
                           mini_batch_size=10,
                           max_epochs=10)
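# A usage sketch for the trainLanguage class; the dictionary path and corpus
# directory are hypothetical:
lm_trainer = trainLanguage('resources/chars_dictionary', is_forward=True)
lm_trainer.trainLanguage('data/my_corpus')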
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)

    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # the trained model can be loaded back as character LM embeddings
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    # the language model can also generate text
    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    shutil.rmtree(results_base_path, ignore_errors=True)
from pathlib import Path

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
### NOTE: you have to train forward and backward separately ###
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus(Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/'),
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=50)
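# For reference, TextCorpus expects the corpus directory to follow Flair's
# standard layout (the split file names below are arbitrary):
#
#   corpus_directory/
#       train/
#           train_split_1
#           train_split_2
#       valid.txt
#       test.txt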
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus('corpus', dictionary, is_forward_lm, character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1)
# smaller alternative for quick experiments:
# language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=1000,
              patience=25,
              num_workers=8)
# quick smoke-test settings:
# trainer.train('resources/taggers/language_model', sequence_length=10, mini_batch_size=10, max_epochs=10)
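# Once training finishes, the best model can be loaded back as character-level
# embeddings, mirroring the pattern in the tests above; the path follows from
# the output directory passed to trainer.train:
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

char_lm_embeddings = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')
sentence = Sentence('I love Berlin')
char_lm_embeddings.embed(sentence)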
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

from util import data_path, flair_datapath, train_flair_datapath

# instantiate an existing LM, such as one from the FlairEmbeddings
language_model = FlairEmbeddings("id-forward").lm

# are you fine-tuning a forward or backward LM?
is_forward_lm = language_model.is_forward_lm

# get the dictionary from the existing language model
dictionary: Dictionary = language_model.dictionary

# get your corpus, process forward and at the character level
corpus = TextCorpus(flair_datapath, dictionary, is_forward_lm, character_level=True)

# use the model trainer to fine-tune this model on your corpus
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train(
    "models/",
    sequence_length=108,  # max(len(tweets))
    mini_batch_size=100,
    learning_rate=20,
    patience=10,
    checkpoint=True,
)
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# train a backward character language model
is_forward_lm = False

dictionary: Dictionary = Dictionary.load('chars')

corpus = TextCorpus('FLAIR/corpus', dictionary, is_forward_lm, character_level=True)

language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=1024, nlayers=2)

trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('FLAIR/resources/taggers/language_model_backward',
              sequence_length=50,
              mini_batch_size=50,
              learning_rate=10,
              patience=3,
              max_epochs=50,
              checkpoint=True)
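# Forward and backward LMs are trained separately and usually combined for
# downstream tasks; a minimal sketch, assuming a matching forward model was
# trained to 'FLAIR/resources/taggers/language_model_forward':
from flair.embeddings import FlairEmbeddings, StackedEmbeddings

stacked_embeddings = StackedEmbeddings([
    FlairEmbeddings('FLAIR/resources/taggers/language_model_forward/best-lm.pt'),
    FlairEmbeddings('FLAIR/resources/taggers/language_model_backward/best-lm.pt'),
])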
# pick the output directory: forward embeddings on the first run, backward on
# the second (always set output_dir, even if both directories already exist)
fwd_dir = './trained_embeddings/' + str(mesinesp_subset) + '/fwd/'
bwd_dir = './trained_embeddings/' + str(mesinesp_subset) + '/bwd/'
if not os.path.exists(fwd_dir):
    os.makedirs(fwd_dir)
    output_dir = fwd_dir
else:
    os.makedirs(bwd_dir, exist_ok=True)
    output_dir = bwd_dir

trainer.train(output_dir,
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=2000,
              patience=25,
              checkpoint=True)

## To see the training process:
# from flair.visual.training_curves import Plotter
# plotter = Plotter()
# if is_forward_lm:
#     plotter.plot_training_curves('resources/taggers/fwd_embeds/loss.tsv')
#     plotter.plot_weights('resources/taggers/fwd_embeds/weights.txt')
# else:
#     plotter.plot_training_curves('resources/taggers/bwd_embeds/loss.tsv')
#     plotter.plot_weights('resources/taggers/bwd_embeds/weights.txt')

print("Total time (approx.):", int(time.time() - start_time), "seconds")
print(item2idx["\n".encode()])

# use the last line of the training corpus as a prompt for generation
inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1]
inputs = [item2idx.get(char.encode(), 0) for char in inputs]
inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
inputs = inputs.to(device)

print("# load corpus")
corpus = TextCorpus(Path('corpus/'), model.dictionary, model.is_forward_lm, character_level=True)

print("# trainer")
trainer = LanguageModelTrainer(model, corpus)

print("# Generating characters with pretrained model")
generate(model, inputs, hp.n_chars, f"{hp.output_dir}/0.out", device)

print("# continue training the model on the new corpus")
for epoch in range(1, hp.n_epochs):
    print(f"# epoch: {epoch}")
    print("training ..")
    trainer.train(f'{hp.ckpt_dir}', sequence_length=hp.seqlen, max_epochs=1)
    print("Generating ..")
    generate(model, inputs, hp.n_chars, f"{hp.output_dir}/{epoch}.out", device)
    print("Loading saved model")
    model = LanguageModel.load_language_model(f'{hp.ckpt_dir}/best-lm.pt')
    model.to(device)
# TODO: add possibility for other dictionary!
# (https://github.com/zalandoresearch/flair/issues/179#issuecomment-433942853)
print("loading Dictionary")
dictionary = Dictionary.load('chars')

# instantiate corpus
log.info("Making corpus from folder: {}".format(args.corpus_path))
corpus = TextCorpus(args.corpus_path, dictionary, options['is_forward_lm'], **options['corpus'])

# TRAINING
if args.continue_training:
    # load checkpoint
    cp_path = args.train_path + '/checkpoint.pt'
    log.info("Continue training from {}".format(cp_path))
    # load LM trainer from the checkpoint
    trainer = LanguageModelTrainer.load_from_checkpoint(cp_path, corpus)
else:
    # instantiate language model
    log.info("Creating language model")
    language_model = LanguageModel(dictionary, options['is_forward_lm'], **options['language_model'])
    # instantiate LM trainer
    trainer = LanguageModelTrainer(language_model, corpus)

log.info("Starting training. See {}".format(args.train_path))
trainer.log_interval = 500
trainer.train(args.train_path, **options['training'])
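# A hedged sketch of the options dict this script (and the fine-tuning script
# below) expects; the keys mirror how it is unpacked above (TextCorpus,
# LanguageModel, and trainer.train keyword arguments), and the concrete values
# are placeholders:
options = {
    'is_forward_lm': True,
    'corpus': {'character_level': True},
    'language_model': {'hidden_size': 1024, 'nlayers': 1},
    'training': {'sequence_length': 250, 'mini_batch_size': 100, 'max_epochs': 10},
}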
parser.add_argument('-m', '--model_path', type=str, help='path to model, logs and checkpoints')
parser.add_argument('-o', '--options_file', type=str, help='file with parameters')
args = parser.parse_args()

# import options
try:
    options = importlib.import_module(args.options_file).options
except ImportError as err:
    print('Error:', err)
    raise  # without the options module there is nothing to train with

# instantiate an existing LM, such as one from the FlairEmbeddings
language_model = FlairEmbeddings(args.pretrained_model).lm

# are you fine-tuning a forward or backward LM?
is_forward_lm = language_model.is_forward_lm

# get the dictionary from the existing language model
dictionary = language_model.dictionary

# instantiate corpus
corpus = TextCorpus(Path(args.corpus_path), dictionary, is_forward_lm, **options['corpus'])

# use the model trainer to fine-tune this model on your corpus
trainer = LanguageModelTrainer(language_model, corpus)
trainer.log_interval = 500
trainer.train(Path(args.model_path), **options['training'])
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name, dictionary, is_forward_lm, character_level=True)

print("start training")
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/flair_ner/lm/ca_backward',
              sequence_length=100,
              mini_batch_size=100,
              learning_rate=20,
              patience=10,
              max_epochs=5,
              checkpoint=True)

print("load original model")
language_model = FlairEmbeddings('fr-forward').lm
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name, dictionary, is_forward_lm, character_level=True)
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from flair.embeddings import FlairEmbeddings

# instantiate the existing LM to fine-tune
language_model = FlairEmbeddings('pubmed-forward').lm

# when fine-tuning, the dictionary and direction must come from the existing
# language model, not the default character dictionary
dictionary: Dictionary = language_model.dictionary
is_forward_lm = language_model.is_forward_lm

# get your corpus, process forward and at the character level
corpus = TextCorpus('/content/corpus', dictionary, is_forward_lm, character_level=True)

trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('/content/language_model', sequence_length=10, mini_batch_size=10, max_epochs=10)
corpus = TextCorpus('/root/.fastai/data/idwiki/', dictionary, is_forward_lm, character_level=True)

logger.info('serializing corpus')
joblib.dump(corpus, '../flair_models/backwards/corpus.flair')
logger.info('saving the corpus to ../flair_models')
logger.info('loading corpus done, now creating language model')

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1)

if Path(MODEL_PATHLIB / 'checkpoint.pt').is_file():
    logger.info('checkpoint detected, resuming training')
    trainer = LanguageModelTrainer.load_from_checkpoint(MODEL_PATHLIB / 'checkpoint.pt', corpus)
else:
    # train your language model from scratch
    trainer = LanguageModelTrainer(language_model, corpus)

logger.info('we have lift off, good luck ground control')
trainer.train(MODEL_PATH,
              learning_rate=0.1,
              sequence_length=250,
              mini_batch_size=650,
              max_epochs=100,
              checkpoint=True)
# load a cached corpus if one exists, otherwise build it and cache it
if os.path.exists('/mnt/disk1/tan_hm/saved_corpus.pkl'):
    with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'rb') as f:
        corpus = pickle.load(f)
else:
    corpus = TextCorpus('/mnt/disk1/tan_hm/corpus', dictionary, is_forward_lm, character_level=True)
    with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f, protocol=pickle.HIGHEST_PROTOCOL)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1)

trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('/mnt/disk1/tan_hm/Flair_language_model_' + suffix,
              sequence_length=256,
              mini_batch_size=200,
              max_epochs=100,
              learning_rate=5,
              clip=0.5,
              patience=10,
              checkpoint=True,
              num_workers=4)
# get the dictionary from the existing language model
dictionary: Dictionary = language_model.dictionary

# get your corpus, process forward and at the character level
corpus = TextCorpus(corpus_dir, dictionary, is_forward_lm, character_level=True)

# use the model trainer to fine-tune this model on your corpus
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=100,
              mini_batch_size=100,
              learning_rate=20,
              patience=10,
              checkpoint=True)

# flair_instance = FlairEmbedding(word_embedding_base='glove', document_embedding='pool')
# print(flair_instance.embedd_document(doc))
#
# flair_instance = FlairEmbedding(word_embedding_base='de', document_embedding='pool')
# print(flair_instance.embedd_document(doc))
#
# flair_instance = FlairEmbedding(word_embedding_base='en', document_embedding='pool')
# print(flair_instance.embedd_document(doc))
#
# flair_instance = FlairEmbedding(word_embedding_base='glove', document_embedding='rnn')
# print(flair_instance.embedd_document(doc))