def test_training():
    """Train a tiny character LM on the example corpus and embed a sentence."""
    # default character dictionary shipped with flair
    char_dictionary: Dictionary = Dictionary.load('chars')

    # forward LM: 128 hidden units, a single layer
    lm: LanguageModel = LanguageModel(char_dictionary,
                                      is_forward_lm=True,
                                      hidden_size=128,
                                      nlayers=1)

    # character-level corpus over the lorem-ipsum example data,
    # processed in the model's direction
    text_corpus: TextCorpus = TextCorpus('resources/corpora/lorem_ipsum',
                                         char_dictionary,
                                         lm.is_forward_lm,
                                         character_level=True)

    # run a short training loop
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(lm, text_corpus)
    lm_trainer.train('./results',
                     sequence_length=10,
                     mini_batch_size=10,
                     max_epochs=5)

    # embed 'I love Berlin' with the freshly trained character LM
    embeddings = CharLMEmbeddings('./results/best-lm.pt')
    example = Sentence('I love Berlin')
    embeddings.embed(example)
    print(example[1].embedding.size())

    # remove training artifacts
    shutil.rmtree('./results', ignore_errors=True)
def fine_tune(base_model, corpus_dir, output_dir):
    """Fine-tune an existing Flair character LM on a new corpus.

    base_model: name/path of the pretrained FlairEmbeddings model.
    corpus_dir: directory with the train/valid/test splits.
    output_dir: destination for checkpoints and the tuned model.
    """
    # log what we are about to do
    print(f'Fine tuning base model: {base_model}')
    print(f'Corpus dir: {corpus_dir}')
    print(f'Output dir: {output_dir}')

    # pull the LM out of the pretrained embeddings
    pretrained_lm = FlairEmbeddings(base_model).lm

    # direction and dictionary must match the pretrained model
    forward = pretrained_lm.is_forward_lm
    lm_dictionary: Dictionary = pretrained_lm.dictionary

    # character-level corpus processed in the model's direction
    tuning_corpus = TextCorpus(corpus_dir, lm_dictionary, forward,
                               character_level=True)

    # fine-tune on the new data, writing checkpoints along the way
    tuner = LanguageModelTrainer(pretrained_lm, tuning_corpus)
    tuner.train(output_dir,
                sequence_length=100,
                mini_batch_size=100,
                learning_rate=20,
                patience=10,
                checkpoint=True)
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Train a small char LM with checkpointing, then resume from the checkpoint."""
    char_dict = Dictionary.load('chars')
    lm = LanguageModel(char_dict, is_forward_lm=True, hidden_size=128, nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        char_dict,
                        lm.is_forward_lm,
                        character_level=True)

    # first run: train briefly and write a checkpoint
    trainer = LanguageModelTrainer(lm, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10,
                  max_epochs=2, checkpoint=True)

    # second run: resume from the checkpoint and train again
    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', corpus)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10,
                  max_epochs=2)

    # remove training artifacts
    shutil.rmtree(results_base_path)
def create_corpus(args, load_dict_from_lm=False, return_back='both'):
    """Build a LanguageModel and/or TextCorpus from command-line ``args``.

    Parameters:
        args: namespace providing ``corpus_path``, ``mapfile``,
            ``is_forward_lm`` and ``hidden_size``.
        load_dict_from_lm: if True, reuse the dictionary of the pretrained
            'he-forward' Flair LM (fine-tuning) instead of loading it from
            ``args.mapfile``.
        return_back: one of 'both', 'language_model', 'corpus'.

    Returns:
        ``(language_model, corpus)``, a ``LanguageModel``, or a ``TextCorpus``
        depending on ``return_back``.

    Raises:
        ValueError: if ``return_back`` is not one of the accepted values.
    """
    if not load_dict_from_lm:
        # dictionary built beforehand and stored next to the corpus
        dictionary: Dictionary = Dictionary.load(
            os.path.join(args.corpus_path, args.mapfile))
    else:
        print("loading dictionary from finetune model")
        from flair.embeddings import FlairEmbeddings
        dictionary = FlairEmbeddings('he-forward').lm.dictionary

    language_model = LanguageModel(dictionary,
                                   args.is_forward_lm,
                                   hidden_size=args.hidden_size,
                                   nlayers=1)
    corpus = TextCorpus(args.corpus_path,
                        dictionary,
                        args.is_forward_lm,
                        character_level=True)

    if return_back == 'both':
        return language_model, corpus
    elif return_back == 'language_model':
        return language_model
    elif return_back == 'corpus':
        return corpus
    # BUG FIX: the original printed a garbled message ('Specified what to
    # return back') and fell through, implicitly returning None; fail loudly
    # on an invalid selector instead of handing callers a silent None.
    raise ValueError(
        f"return_back must be 'both', 'language_model' or 'corpus', "
        f"got {return_back!r}")
def test_train_language_model(results_base_path, resources_path):
    """End-to-end check: train a char LM, embed a sentence, generate text."""
    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # small forward LM: 128 hidden units, one layer
    lm: LanguageModel = LanguageModel(char_dict, is_forward_lm=True,
                                      hidden_size=128, nlayers=1)

    # character-level forward corpus from the example data
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    char_dict, lm.is_forward_lm,
                                    character_level=True)

    # short deterministic training run
    trainer: LanguageModelTrainer = LanguageModelTrainer(lm, corpus,
                                                         test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10,
                  max_epochs=2)

    # embed 'I love Berlin' with the freshly trained character LM
    embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    embeddings.embed(sentence)

    # the LM must be able to produce at least 100 characters of text
    text, likelihood = lm.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    # remove training artifacts
    shutil.rmtree(results_base_path, ignore_errors=True)
def train_LM(file_path, model_path, is_forward_lm=True):
    """Train a character-level language model with flair.

    Parameters:
        file_path: corpus directory; must contain the splits expected by
            ``TextCorpus`` plus a character-mapping file named ``mappings``.
        model_path: output directory for checkpoints and the best model.
        is_forward_lm: train a forward (True) or backward (False) LM.
    """
    import os
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    # BUG FIX: the original used `file_path + 'mappings'`, which silently
    # produced a wrong path whenever file_path had no trailing separator.
    dictionary = Dictionary.load_from_file(os.path.join(file_path, 'mappings'))

    # process the corpus at the character level in the chosen direction
    corpus = TextCorpus(file_path, dictionary, is_forward_lm,
                        character_level=True)

    # small LM: 128 hidden units, one layer
    language_model = LanguageModel(dictionary, is_forward_lm,
                                   hidden_size=128, nlayers=1)

    # train
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(model_path, sequence_length=100, mini_batch_size=32,
                  max_epochs=10)
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Check the example corpus loads with all three splits present."""
    char_dict = Dictionary.load('chars')
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        char_dict, forward=True, character_level=True)

    # every split must be loaded, and train must hold exactly two documents
    assert corpus.test is not None
    assert corpus.train is not None
    assert corpus.valid is not None
    assert len(corpus.train) == 2
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Check the example corpus exposes test/valid splits and two train files."""
    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # character-level forward corpus over the example data
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    char_dict, forward=True,
                                    character_level=True)

    # splits must be loaded, and there must be exactly two train files
    assert corpus.test is not None
    assert corpus.train_files is not None
    assert corpus.valid is not None
    assert len(corpus.train_files) == 2
def trainLanguage(self, corpusPath):
    """Train this object's language model on the corpus at ``corpusPath``.

    Stores the corpus, model, and trainer on the instance and writes the
    trained model to 'resources/taggers/language_model'.
    """
    # character-level corpus in the configured direction
    self.corpus = TextCorpus(Path(corpusPath),
                             self.dictionary,
                             self.is_forward_lm,
                             character_level=True)

    # deep LM: 128 hidden units stacked over 10 layers
    self.language_model = LanguageModel(self.dictionary,
                                        self.is_forward_lm,
                                        hidden_size=128,
                                        nlayers=10)

    # short sequences and small batches
    self.trainer = LanguageModelTrainer(self.language_model, self.corpus)
    self.trainer.train('resources/taggers/language_model',
                       sequence_length=10,
                       mini_batch_size=10,
                       max_epochs=10)
def train_elmo(args):
    """Train, fine-tune, or resume a flair character LM depending on ``args``.

    Three modes:
      * ``args.finetune`` and no checkpoint: start from the pretrained
        'he-forward' LM and its dictionary.
      * no checkpoint, no finetune: train from scratch, downloading the
        corpus first if it is missing.
      * checkpoint given: resume training from ``args.checkpoint_path``.
    """
    if args.finetune and args.checkpoint_path == '':
        print("finetune")
        from flair.embeddings import FlairEmbeddings
        language_model = FlairEmbeddings('he-forward').lm
        # corpus must use the pretrained model's dictionary and direction
        corpus: TextCorpus = TextCorpus(args.corpus_path,
                                        language_model.dictionary,
                                        language_model.is_forward_lm,
                                        character_level=True)
        trainer = LanguageModelTrainer(language_model, corpus)
    elif args.checkpoint_path == '' and not args.finetune:
        # training from scratch; fetch the corpus first if it is missing
        print('Training from scratch')  # FIX: message was misspelled 'scarch'
        if not os.path.exists(args.corpus_path):
            print('Corpus path', args.corpus_path)  # FIX: was 'Corpus _path'
            download_corpus(args)
        language_model, corpus = create_corpus(args)
        trainer = LanguageModelTrainer(language_model, corpus)
    else:
        print("Training from checkpoint")  # FIX: was misspelled 'checpoint'
        from pathlib import Path
        checkpoint = Path(args.checkpoint_path)
        # when fine-tuning, the checkpoint's corpus must reuse the LM dictionary
        load_dict_from_lm = bool(args.finetune)
        trainer = LanguageModelTrainer.load_from_checkpoint(
            checkpoint,
            create_corpus(args, load_dict_from_lm, return_back='corpus'))

    trainer.train(args.save_model,
                  sequence_length=args.seq_length,
                  mini_batch_size=args.mini_batch,
                  max_epochs=args.epochs,
                  checkpoint=args.checkpoint)
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Train a small char LM with checkpointing, then resume and retrain."""
    # default character dictionary
    char_dict: Dictionary = Dictionary.load("chars")

    # forward LM: 128 hidden units, one layer
    lm: LanguageModel = LanguageModel(char_dict, is_forward_lm=True,
                                      hidden_size=128, nlayers=1)

    # character-level forward corpus over the example data
    corpus: TextCorpus = TextCorpus(resources_path / "corpora/lorem_ipsum",
                                    char_dict, lm.is_forward_lm,
                                    character_level=True)

    # first run: train briefly and write a checkpoint
    trainer: LanguageModelTrainer = LanguageModelTrainer(lm, corpus,
                                                         test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10,
                  max_epochs=2, checkpoint=True)

    # drop the originals so resuming really reloads them from disk
    del trainer, lm

    # second run: resume from the checkpoint
    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10,
                  max_epochs=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer
def process(options):
    """Do the processing: train a character LM as configured by ``options``."""
    # direction: forward unless the backward flag is set
    forward = not options.is_backward_lm

    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # character-level corpus in the chosen direction
    corpus = TextCorpus(options.corpus_dir, char_dict, forward,
                        character_level=True)

    # large single-layer LM; the original author left embedding_size and
    # dropout flagged as open questions ("recommendations?", "dropout probs?")
    lm = LanguageModel(char_dict,
                       forward,
                       hidden_size=2048,
                       nlayers=1,
                       embedding_size=100,
                       dropout=0)

    # train; hyperparameters carried over unchanged, including patience=22
    # (the original also wondered whether embeddings_in_memory=False would
    # help with 'RuntimeError: CUDA out of memory')
    trainer = LanguageModelTrainer(lm, corpus)
    trainer.train(options.model_dir,
                  sequence_length=250,
                  learning_rate=20,
                  mini_batch_size=100,
                  anneal_factor=0.25,
                  patience=22,
                  clip=0.25,
                  max_epochs=75)
def retrain_flair(cls, corpus_path: str, model_path_dest: str, flair_algorithm: str = 'de-forward', epochs: int = 10):
    """Fine-tune an existing flair (or BERT-wrapped) language model on a corpus.

    corpus_path: directory with the train/valid/test splits.
    model_path_dest: output directory for checkpoints and the tuned model.
    flair_algorithm: pretrained model identifier, e.g. 'de-forward'.
    epochs: maximum number of fine-tuning epochs.
    """
    use_embedding, algorithm = cls.determine_algorithm_from_string(
        flair_algorithm_string=flair_algorithm)

    # instantiate the pretrained embedding and unwrap its language model
    model = use_embedding(flair_algorithm)
    language_model = model.model if algorithm == 'bert' else model.lm

    # BERT models expose no direction attribute; default to forward then
    try:
        is_forward_lm = language_model.is_forward_lm
    except AttributeError:
        # todo: no support for finetuning BERT with Flair Library for now
        is_forward_lm = True

    # reuse the pretrained model's dictionary
    dictionary: Dictionary = language_model.dictionary

    # character-level corpus in the model's direction
    corpus = TextCorpus(corpus_path, dictionary, is_forward_lm,
                        character_level=True)

    # fine-tune with checkpointing enabled
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(model_path_dest,
                  sequence_length=10,
                  mini_batch_size=10,
                  learning_rate=20,
                  max_epochs=epochs,
                  patience=10,
                  checkpoint=True)
def test_train_language_model(results_base_path, resources_path):
    """Train a tiny char LM, embed a sentence with it, and generate text."""
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary, is_forward_lm=True,
                                   hidden_size=128, nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary, language_model.is_forward_lm,
                        character_level=True)

    # deterministic short training run
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10,
                  max_epochs=2)

    # BUG FIX: the original called `unicode(...)`, which does not exist in
    # Python 3 and raises NameError; str() is the correct conversion.
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    # the LM must be able to produce at least 100 characters of text
    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
# Train a forward character-level flair language model from scratch.
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# direction of the LM: forward (True) or backward (False)
is_forward_lm = True

# default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# character-level corpus from the 'corpus' directory
corpus = TextCorpus('corpus', dictionary, is_forward_lm, character_level=True)

# large single-layer LM (2048 hidden units)
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=2048, nlayers=1)

# long training run with patience-based annealing and parallel data loading
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=1000,
              patience=25,
              num_workers=8)
# Train a forward character-level flair LM using a custom dictionary.
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
import pickle

# forward (True) or backward (False) LM
is_forward_lm = True

# custom character dictionary built beforehand
dictionary = Dictionary.load_from_file(
    '/home/anna/Desktop/markup/learning/dictionary/dict')

# character-level corpus
corpus = TextCorpus('/home/anna/Desktop/markup/learning',
                    dictionary, is_forward_lm, character_level=True)

# small LM: 128 hidden units, one layer
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=128, nlayers=1)

# train
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=10,
              mini_batch_size=10,
              max_epochs=10)
# CLI setup for fine-tuning a pretrained flair LM (parser defined above).
parser.add_argument('-m', '--model_path', type=str,
                    help='path to model,logs and checkpoints')
parser.add_argument('-o', '--options_file', type=str,
                    help='file with parameters')
args = parser.parse_args()

# import the options module named on the command line
try:
    options = importlib.import_module(args.options_file).options
except ImportError as err:
    print('Error:', err)

# pretrained LM to fine-tune; reuse its direction and dictionary
language_model = FlairEmbeddings(args.pretrained_model).lm
is_forward_lm = language_model.is_forward_lm
dictionary = language_model.dictionary

# corpus configured from the options file
corpus = TextCorpus(Path(args.corpus_path), dictionary, is_forward_lm,
                    **options['corpus'])

# fine-tune, logging every 500 steps
trainer = LanguageModelTrainer(language_model, corpus)
trainer.log_interval = 500
trainer.train(Path(args.model_path), **options['training'])
# NOTE(review): fragment — the opening `if direction == "fwd":` branch lies
# outside this excerpt, so the leading assignment/elif pair is syntactically
# incomplete as shown.
    is_forward_lm = True
elif direction == "bwd":
    is_forward_lm = False

## load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

## get your corpus, process forward and at the character level
prepare_mesinesp_for_flair_embeds_training(
)  # prepare raw text from Spanish PubMed Abstracts for training
# choose which MESINESP subset to train on from the command line
mesinesp_subset = sys.argv[1]
corpus_path = "./data/datasets/mesinesp/" + str(mesinesp_subset) + "/"
corpus = TextCorpus(corpus_path, dictionary, is_forward_lm,
                    character_level=True)

## instantiate your language model, set hidden size and number of layers (hidden_size=1024-small model, (hidden_size=2048-large model)
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=1024, nlayers=1, dropout=0.1)

## train your language model
trainer = LanguageModelTrainer(language_model, corpus)
#trainer.num_workers = 4
#Flair auto-detects whether you have a GPU available. If there is a GPU, it will automatically run training there.
# output directory string is presumably filled in below this excerpt — empty here
output_dir = str()
# Load options, build the corpus, and either resume from a checkpoint
# or create a fresh language model.
try:
    options = importlib.import_module(args.options_file).options
except ImportError as err:
    print('Error:', err)

# default character dictionary
# TODO: add possibility for other dictionary!
# (https://github.com/zalandoresearch/flair/issues/179#issuecomment-433942853)
print("loading Dictionary")
dictionary = Dictionary.load('chars')

# corpus configured from the options file
log.info("Making corpus from folder: {}".format(args.corpus_path))
corpus = TextCorpus(args.corpus_path, dictionary,
                    options['is_forward_lm'], **options['corpus'])

# TRAINING
if args.continue_training:
    # resume from the previous run's checkpoint
    cp_path = args.train_path + '/checkpoint.pt'
    log.info("Continue training from {}".format(cp_path))
    trainer = LanguageModelTrainer.load_from_checkpoint(cp_path, corpus)
else:
    # fresh language model built from the options file
    log.info("Creating language model")
    language_model = LanguageModel(dictionary, options['is_forward_lm'],
                                   **options['language_model'])
# Continue training a pretrained news-forward LM on a new corpus,
# sampling generated text before training starts.
news_forward = FlairEmbeddings('news-forward')
model = LanguageModel.load_language_model(news_forward)
model.to(device)

print("# load input data")
# seed the generator with the last line of the training file,
# mapped to dictionary indices (unknown characters map to index 0)
item2idx = model.dictionary.item2idx
print(item2idx["\n".encode()])
# BUG FIX: the original `open(...).read()` leaked the file handle;
# a with-block closes it deterministically.
with open('corpus/train/train.txt', 'r') as train_file:
    inputs = train_file.read().splitlines()[-1]
inputs = [item2idx.get(char.encode(), 0) for char in inputs]
inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
inputs = inputs.to(device)

print("# load corpus")
corpus = TextCorpus(Path('corpus/'), model.dictionary, model.is_forward_lm,
                    character_level=True)

print("# trainer")
trainer = LanguageModelTrainer(model, corpus)

print("# Generating characters with pretraned model")
generate(model, inputs, hp.n_chars, f"{hp.output_dir}/0.out", device)

print("# continue training the model on the new corpus")
for epoch in range(1, hp.n_epochs):
    print(f"# epoch: {epoch}")
    print("training ..")
    # one epoch at a time so text can be sampled between epochs
    trainer.train(f'{hp.ckpt_dir}', sequence_length=hp.seqlen, max_epochs=1)
    print("Generating ..")
# Script to train Flair LMs on the crawl corpus.
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# forward (True) or backward (False) LM
is_forward_lm = True

# default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# character-level corpus over the crawl data
corpus = TextCorpus('/data/crawl/corpus', dictionary, is_forward_lm,
                    character_level=True)

# large single-layer LM (2048 hidden units)
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=2048, nlayers=1)

# very long training run
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_models/fwd',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=5000)
# load the default character dictionary dictionary: Dictionary = Dictionary.load('chars') """ # get your corpus, process forward and at the character level, then dump to harddisk """ # load joblib dump to memory if Path(MODEL_PATHLIB / 'corpus.flair').is_file(): logger.info('corpus found') logger.info('now loading the corpus') corpus = joblib.load(MODEL_PATHLIB / 'corpus.flair') else: logger.info('making new corpus') corpus = TextCorpus('/root/.fastai/data/idwiki/', dictionary, is_forward_lm, character_level=True) logger.info('serializing corpus') joblib.dump(corpus, '../flair_models/backwards/corpus.flair') logger.info('saving the corpus to ../flair_models') logger.info('loading corpus done, now creating language model') # instantiate your language model, set hidden size and number of layers language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1) if Path(MODEL_PATHLIB / 'checkpoint.pt').is_file(): logger.info('checkpoint detected, resuming training') trainer = LanguageModelTrainer.load_from_checkpoint(
# Train a backward subword character LM on the WMT11 subset (fragment).
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
import torch.nn as nn
import torch

# are you training a forward or backward LM?
is_forward_lm = False

# load the default character dictionary
# NOTE(review): `Dictionary` is not imported in this excerpt — presumably
# `from flair.data import Dictionary` appears above it; verify.
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
# NOTE(review): subword=True is not a standard TextCorpus argument — confirm
# the local flair fork supports it.
corpus = TextCorpus('./resources/tasks/wmt11_sub', dictionary, is_forward_lm,
                    character_level=True, subword=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=2048, nlayers=1)

# train your language model
# NOTE(review): fragment — this train() call is truncated in the excerpt
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model_sub_back',
              sequence_length=250,
              mini_batch_size=100,
# Fine-tune the pretrained Indonesian forward LM on the local tweet corpus.
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from util import data_path, flair_datapath, train_flair_datapath

# pretrained LM to start from
language_model = FlairEmbeddings("id-forward").lm

# direction and dictionary come from the pretrained model
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

# character-level corpus in the model's direction
corpus = TextCorpus(flair_datapath, dictionary, is_forward_lm,
                    character_level=True)

# fine-tune with checkpointing; sequence_length chosen as max(len(tweets))
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train(
    "models/",
    sequence_length=108,
    mini_batch_size=100,
    learning_rate=20,
    patience=10,
    checkpoint=True,
)
# Train a forward character LM on the PMC Case Reports corpus.
# NOTE: forward and backward LMs have to be trained separately.
from pathlib import Path

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# direction of this run
is_forward_lm = True

# default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# character-level corpus
corpus = TextCorpus(Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/'),
                    dictionary, is_forward_lm, character_level=True)

# large single-layer LM
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=2048, nlayers=1)

# train
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=50)
# NOTE(review): fragment — `f`, `l`, `dev_set`, and `tmp_path` are defined
# above this excerpt, and the script continues past its end.
    f.writelines("\n".join(l))
# the dev set doubles as both the validation and the test split
with open(os.path.join(tmp_path.name, "valid.txt"), 'w') as f:
    f.writelines("\n".join(dev_set))
with open(os.path.join(tmp_path.name, "test.txt"), 'w') as f:
    f.writelines("\n".join(dev_set))

print("load original model")
# start from the pretrained French backward LM; reuse its direction/dictionary
language_model = FlairEmbeddings('fr-backward').lm
is_forward_lm = language_model.is_forward_lm
dictionary: Dictionary = language_model.dictionary

print("load corpus")
corpus = TextCorpus(tmp_path.name, dictionary, is_forward_lm,
                    character_level=True)

print("start training")
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/flair_ner/lm/ca_backward',
              sequence_length=100,
              mini_batch_size=100,
              learning_rate=20,
              patience=10,
              max_epochs=5,
              checkpoint=True)

print("load original model")
# second pass starts with the forward model (excerpt ends here)
language_model = FlairEmbeddings('fr-forward').lm
def _define_corpus(self) -> TextCorpus:
    """Build the character-level TextCorpus for this instance's direction."""
    # corpus_dir is the root directory, e.g. '/path/to/your/corpus'
    return TextCorpus(self.corpus_dir,
                      self.dictionary,
                      self.is_forward_lm,
                      character_level=True)
# are you training a forward or backward LM? is_forward_lm = True suffix = 'forward' if is_forward_lm else 'backward' # load the character dictionary dictionary: Dictionary = Dictionary() for i in vn_char: dictionary.add_item(i) # get your corpus, process forward and at the character level if os.path.isfile('/mnt/disk1/tan_hm/saved_corpus.pkl'): with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'rb') as f: corpus = pickle.load(f) else: corpus = TextCorpus('/mnt/disk1/tan_hm/corpus', dictionary, is_forward_lm, character_level=True) with open('/mnt/disk1/tan_hm/saved_corpus.pkl', 'wb') as f: pickle.dump(corpus, f, protocol=pickle.HIGHEST_PROTOCOL) # instantiate your language model, set hidden size and number of layers language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1) trainer = LanguageModelTrainer(language_model, corpus) trainer.train('/mnt/disk1/tan_hm/Flair_language_model_' + suffix,
# Tutorial-style script: train a large forward character LM on patent text.
# https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md
from pathlib import Path

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# forward LM
is_forward_lm = True

# default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# character-level corpus over the large patent text dump
corpus = TextCorpus(Path('patent_output/emb_texts/large_corpus/'),
                    dictionary, is_forward_lm, character_level=True)

# large single-layer LM
language_model = LanguageModel(dictionary, is_forward_lm,
                               hidden_size=2048, nlayers=1)

# train
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model_large_corpus',
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=10)