"/mnt/clef-hipe-parser-master/transformers/examples/token-classification/german-large-2", layers="all", use_scalar_mix=True) ] embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) # initialize sequence tagger from flair.models import SequenceTagger tagger: SequenceTagger = SequenceTagger( hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True, ) # initialize trainer from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=True) trainer.train("resources/taggers/baseline-de-stacked-we-bert-with-dev-3", mini_batch_size=16, patience=5, max_epochs=200, train_with_dev=True)
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
plotter.plot_weights('resources/taggers/example-ner/weights.txt')
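# Optional follow-up sketch: reload the tagger saved by step 7 and tag a new
# sentence with flair's standard SequenceTagger/Sentence API. The model path
# assumes the 'resources/taggers/example-ner' base path used above; the example
# sentence is illustrative only.
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')
sentence = Sentence('George Washington went to Washington.')
tagger.predict(sentence)
print(sentence.to_tagged_string())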
from flair.data_fetcher import NLPTaskDataFetcher  # needed for load_classification_corpus below
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
from torch.optim.adam import Adam
from flair.embeddings import ELMoEmbeddings

corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')

word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
]
# word_embeddings = [BertEmbeddings('bert-base-uncased')]

document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                             hidden_size=512,
                                             bidirectional=True,
                                             reproject_words=True,
                                             reproject_words_dimension=256)

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)

trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
trainer.train('./',
              learning_rate=0.001,
              mini_batch_size=6,
              embeddings_in_memory=False,
              max_epochs=150)
        if tag_no_pref_encoded in tag_dictionary_no_prefix.idx2item and \
                tag_countdown[tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] > 0:
            corpus_sents.append(sent)
            tag_countdown[tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] -= 1
            sent_picked = True

print("sents for training: " + str(len(corpus_sents)))
print("amount of items in dict: " + str(len(tag_dictionary.item2idx)))

training_dataset = SentenceDataset(corpus_sents)
training_corpus = Corpus(train=training_dataset,
                         dev=corpus_small.dev,
                         test=corpus_small.test,
                         sample_missing_splits=False)

trainer = ModelTrainer(tagger, training_corpus, optimizer=torch.optim.AdamW)

tag_dictionary = training_corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("fewshot-conll3-simple-to-moviecomplex",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)

trainer.train(
    base_path='resources/v3/fewshot-conll_3-simple-to-moviecomplex-k' + str(k),
    learning_rate=5.0e-5,
    mini_batch_size=32,
    mini_batch_chunk_size=None,
    max_epochs=10,
    weight_decay=0.,
    embeddings_storage_mode="none",
    scheduler=OneCycleLR,
)
from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            hidden_size=512,
                                            reproject_words=True,
                                            reproject_words_dimension=256)

# Create model
from flair.models import TextClassifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# Create model trainer
from flair.trainers import ModelTrainer
trainer = ModelTrainer(classifier, corpus)

# Train the model
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=200)

# Load the model and make predictions
from flair.data import Sentence
classifier = TextClassifier.load('model-saves/final-model.pt')
pos_sentence = Sentence(preprocess('I love Python!'))
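# A minimal sketch of the prediction step that would follow: classify the
# reloaded sentence and read out the predicted label using flair's standard
# TextClassifier.predict API. `pos_sentence` comes from the snippet above.
classifier.predict(pos_sentence)
print(pos_sentence.labels)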
def _objective(self, params: dict):
    log_line(log)
    log.info(f"Evaluation run: {self.run}")
    log.info(f"Evaluating parameter combination:")
    for k, v in params.items():
        if isinstance(v, Tuple):
            v = ",".join([str(x) for x in v])
        log.info(f"\t{k}: {str(v)}")
    log_line(log)

    for sent in self.corpus.get_all_sentences():
        sent.clear_embeddings()

    scores = []
    vars = []

    for i in range(0, self.training_runs):
        log_line(log)
        log.info(f"Training run: {i + 1}")

        model = self._set_up_model(params)

        training_params = {
            key: params[key] for key in params if key in TRAINING_PARAMETERS
        }
        model_trainer_parameters = {
            key: params[key] for key in params if key in MODEL_TRAINER_PARAMETERS
        }

        trainer: ModelTrainer = ModelTrainer(
            model, self.corpus, **model_trainer_parameters
        )

        result = trainer.train(
            self.base_path,
            max_epochs=self.max_epochs,
            param_selection_mode=True,
            **training_params,
        )

        # take the average over the last three scores of training
        if self.optimization_value == OptimizationValue.DEV_LOSS:
            curr_scores = result["dev_loss_history"][-3:]
        else:
            curr_scores = list(
                map(lambda s: 1 - s, result["dev_score_history"][-3:])
            )

        score = sum(curr_scores) / float(len(curr_scores))
        var = np.var(curr_scores)
        scores.append(score)
        vars.append(var)

    # take average over the scores from the different training runs
    final_score = sum(scores) / float(len(scores))
    final_var = sum(vars) / float(len(vars))

    test_score = result["test_score"]
    log_line(log)
    log.info(f"Done evaluating parameter combination:")
    for k, v in params.items():
        if isinstance(v, Tuple):
            v = ",".join([str(x) for x in v])
        log.info(f"\t{k}: {v}")
    log.info(f"{self.optimization_value.value}: {final_score}")
    log.info(f"variance: {final_var}")
    log.info(f"test_score: {test_score}\n")
    log_line(log)

    with open(self.param_selection_file, "a") as f:
        f.write(f"evaluation run {self.run}\n")
        for k, v in params.items():
            if isinstance(v, Tuple):
                v = ",".join([str(x) for x in v])
            f.write(f"\t{k}: {str(v)}\n")
        f.write(f"{self.optimization_value.value}: {final_score}\n")
        f.write(f"variance: {final_var}\n")
        f.write(f"test_score: {test_score}\n")
        f.write("-" * 100 + "\n")

    self.run += 1

    return {"status": "ok", "loss": final_score, "loss_variance": final_var}
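# For context, a sketch of how an _objective like the one above is typically
# driven in flair's hyperparameter tutorial: a SearchSpace plus a ParamSelector
# (hyperopt under the hood). The concrete parameter values and the prepared
# `corpus` are assumptions for illustration, not part of the code above.
from hyperopt import hp
from flair.embeddings import WordEmbeddings, FlairEmbeddings
from flair.hyperparameter.param_selection import (
    SearchSpace, Parameter, TextClassifierParamSelector, OptimizationValue,
)

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    [WordEmbeddings('glove')],
    [FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward')],
])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

param_selector = TextClassifierParamSelector(
    corpus,                  # assumed: a prepared flair Corpus
    False,                   # multi_label
    'resources/results',     # base_path
    'lstm',                  # document embedding type
    max_epochs=50,
    training_runs=3,
    optimization_value=OptimizationValue.DEV_SCORE,
)
param_selector.optimize(search_space, max_evals=100)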
# elif GLOVE_CHARS is True:
#     embeddings = [WordEmbeddings('../../../../Data/Models/Chars/lemma_lowercased_estenten11_freeling_v4_virt.gensim.vec'),
#                   WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec')]
#     document_embeddings = DocumentPoolEmbeddings(embeddings, pooling='max')
#     prefix_model_output_dir = "glove_chars_maxpool"
#
# elif GLOVE_BPE is True:
#     embeddings = [BytePairEmbeddings(language='es'),
#                   WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec')]
#     document_embeddings = DocumentPoolEmbeddings(embeddings, pooling='max')
#     prefix_model_output_dir = "glove_chars_maxpool"

if bTestPhase is False:
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=corpus.make_label_dictionary(),
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./' + prefix_model_output_dir + '_' + sLang + prefix + '/',
                  learning_rate=cmd_args.lr,
                  mini_batch_size=16,
                  anneal_factor=0.5,
                  patience=1,
                  evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=cmd_args.iters)

    plotter = Plotter()
    plotter.plot_training_curves('./' + prefix_model_output_dir + '_' + sLang + prefix + '/loss.tsv')
    plotter.plot_weights('./' + prefix_model_output_dir + '_' + sLang + prefix + '/weights.txt')

    # 7. find learning rate
    learning_rate_tsv = trainer.find_learning_rate('./' + prefix_model_output_dir + '_' + sLang + prefix + '/learning_rate.tsv')
# initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(
    hidden_size=args.hidden_size,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=args.crf,
    rnn_layers=args.rnn,
    train_initial_hidden_state=args.train_initial_hidden_state,
    loss_weights={'0': 10.})

# initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus, use_tensorboard=False)

# 7. start training
trainer.train(args.output_folder,
              learning_rate=args.learning_rate,
              mini_batch_size=args.mini_batch_size,
              mini_batch_chunk_size=args.mini_batch_chunk_size,
              max_epochs=args.max_epochs,
              min_learning_rate=1e-6,
              shuffle=True,
              anneal_factor=0.5,
              patience=args.patience,
              num_workers=args.num_workers,
              embeddings_storage_mode=args.embeddings_storage_mode,
              monitor_test=True,
              monitor_train=args.monitor_train)
class SequenceClassifierTrainer:
    """Sequence Classifier Trainer

    Usage:

    ```python
    >>> sc_trainer = SequenceClassifierTrainer(corpus="/Path/to/data/dir")
    ```

    **Parameters:**

    * **corpus** - A flair corpus data model or `Path`/string to a directory with train.csv/test.csv/dev.csv
    * **encoder** - An `EasyDocumentEmbeddings` object if training with a flair prediction head, or a `Path`/string if training with Transformer's prediction models
    * **column_name_map** - Required if corpus is not a `Corpus` object; a dictionary specifying the indices of the text and label columns of the csv, i.e. {1: "text", 2: "label"}
    * **corpus_in_memory** - Boolean for whether to store corpus embeddings in memory
    * **predictive_head** - For now either "flair" or "transformers" for the prediction head
    * ****kwargs** - Keyword arguments for Flair's `TextClassifier` model class
    """

    def __init__(
        self,
        corpus: Union[Corpus, Path, str],
        encoder: Union[EasyDocumentEmbeddings, Path, str],
        column_name_map: dict = None,
        corpus_in_memory: bool = True,
        predictive_head: str = "flair",
        **kwargs,
    ):
        if isinstance(corpus, Corpus):
            self.corpus = corpus
        else:
            if isinstance(corpus, str):
                corpus = Path(corpus)
            if not column_name_map:
                raise ValueError(
                    "If not instantiating with a `Corpus` object, you must pass in the `column_name_map` argument to specify text/label indices"
                )
            self.corpus = CSVClassificationCorpus(
                corpus,
                column_name_map,
                skip_header=True,
                delimiter=",",
                in_memory=corpus_in_memory,
            )

        # Verify predictive head is within available heads
        self.available_predictive_head = ["flair", "transformers"]
        if predictive_head not in self.available_predictive_head:
            raise ValueError(
                f"predictive_head param must be one of the following: {self.available_predictive_head}"
            )
        self.predictive_head = predictive_head

        # Verify the correct corresponding encoder is used with the predictive head
        # (this can be structured with a better design in the future)
        if isinstance(encoder, EasyDocumentEmbeddings):
            if predictive_head == "transformers":
                raise ValueError(
                    "If using the `transformers` predictive head, pass in the path to the transformer's model"
                )
            else:
                self.encoder = encoder
        else:
            if isinstance(encoder, str):
                encoder = Path(encoder)
            self.encoder = encoder

        # Create the label dictionary on init (stored to keep from constantly regenerating label_dict)
        # TODO: should we use the dev/test set instead, assuming all labels are provided?
        self.label_dict = self.corpus.make_label_dictionary()

        # Save trainer kwargs dict for reinitializations
        self.trainer_kwargs = kwargs

        # Load trainer with initial setup
        self._initial_setup(self.label_dict, **kwargs)

    def _initial_setup(self, label_dict: Dict, **kwargs):
        if self.predictive_head == "flair":
            # Get Document embeddings from `embeddings`
            document_embeddings: DocumentRNNEmbeddings = self.encoder.rnn_embeddings

            # Create the text classifier
            classifier = TextClassifier(
                document_embeddings,
                label_dictionary=label_dict,
                **kwargs,
            )

            # Initialize the text classifier trainer
            self.trainer = ModelTrainer(classifier, self.corpus)

        # TODO: In internal transformers package, create ****ForSequenceClassification adaptations
        elif self.predictive_head == "transformers":
            with open(self.encoder / "config.json") as config_f:
                configs = json.load(config_f)
                model_name = configs["architectures"][-1]
                if model_name == "BertForMaskedLM":
                    pass

    def train(
        self,
        output_dir: Union[Path, str],
        learning_rate: float = 0.07,
        mini_batch_size: int = 32,
        anneal_factor: float = 0.5,
        patience: int = 5,
        max_epochs: int = 150,
        plot_weights: bool = False,
        **kwargs,
    ) -> None:
        """
        Train the Sequence Classifier

        * **output_dir** - The output directory where the model predictions and checkpoints will be written.
        * **learning_rate** - The initial learning rate
        * **mini_batch_size** - Batch size for the dataloader
        * **anneal_factor** - The factor by which the learning rate is annealed
        * **patience** - Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate
        * **max_epochs** - Maximum number of epochs to train. Terminates training if this number is surpassed.
        * **plot_weights** - Bool to plot weights or not
        * **kwargs** - Keyword arguments for the rest of Flair's `Trainer.train()` hyperparameters
        """
        if isinstance(output_dir, str):
            output_dir = Path(output_dir)

        # Start the training
        self.trainer.train(
            output_dir,
            learning_rate=learning_rate,
            mini_batch_size=mini_batch_size,
            anneal_factor=anneal_factor,
            patience=patience,
            max_epochs=max_epochs,
            **kwargs,
        )

        # Plot weight traces
        if plot_weights:
            plotter = Plotter()
            plotter.plot_weights(output_dir / "weights.txt")

    def find_learning_rate(
        self,
        output_dir: Union[Path, str],
        file_name: str = "learning_rate.tsv",
        start_learning_rate: float = 1e-8,
        end_learning_rate: float = 10,
        iterations: int = 100,
        mini_batch_size: int = 32,
        stop_early: bool = True,
        smoothing_factor: float = 0.7,
        plot_learning_rate: bool = True,
        **kwargs,
    ) -> float:
        """
        Uses Leslie's cyclical learning rate finding method to generate and save the loss x learning rate plot

        This method returns a suggested learning rate using the static method `LMFineTuner.suggest_learning_rate()`
        which is implicitly run in this method.

        * **output_dir** - Path to dir for learning rate file to be saved
        * **file_name** - Name of learning rate .tsv file
        * **start_learning_rate** - Initial learning rate to start cyclical learning rate finder method
        * **end_learning_rate** - End learning rate to stop exponential increase of the learning rate
        * **iterations** - Number of optimizer iterations for the ExpAnnealLR scheduler
        * **mini_batch_size** - Batch size for dataloader
        * **stop_early** - Bool for stopping early once loss diverges
        * **smoothing_factor** - Smoothing factor on moving average of losses
        * **adam_epsilon** - Epsilon for Adam optimizer.
        * **weight_decay** - Weight decay if we apply some.
        * **kwargs** - Additional keyword arguments for the Adam optimizer

        **return** - Learning rate as a float
        """
        # 7. find learning rate
        learning_rate_tsv = self.trainer.find_learning_rate(
            base_path=output_dir,
            file_name=file_name,
            start_learning_rate=start_learning_rate,
            end_learning_rate=end_learning_rate,
            iterations=iterations,
            mini_batch_size=mini_batch_size,
            stop_early=stop_early,
            smoothing_factor=smoothing_factor,
        )

        # Reinitialize optimizer and parameters by reinitializing trainer
        self._initial_setup(self.label_dict, **self.trainer_kwargs)

        if plot_learning_rate:
            plotter = Plotter()
            plotter.plot_learning_rate(learning_rate_tsv)

        # Use the automated learning rate finder
        with open(learning_rate_tsv) as lr_f:
            lr_tsv = list(csv.reader(lr_f, delimiter="\t"))
        losses = np.array([float(row[-1]) for row in lr_tsv[1:]])
        lrs = np.array([float(row[-2]) for row in lr_tsv[1:]])

        lr_to_use = self.suggested_learning_rate(losses, lrs, **kwargs)
        print(f"Recommended Learning Rate {lr_to_use}")
        return lr_to_use

    @staticmethod
    def suggested_learning_rate(
        losses: np.array,
        lrs: np.array,
        lr_diff: int = 15,
        loss_threshold: float = 0.2,
        adjust_value: float = 1,
    ) -> float:
        # This seems redundant unless we can make this configured for each trainer/finetuner
        """
        Attempts to find the optimal learning rate using an interval slide rule approach with the cyclical learning rate method

        * **losses** - Numpy array of losses
        * **lrs** - Numpy array of exponentially increasing learning rates (must match dim of `losses`)
        * **lr_diff** - Learning rate interval of the slide ruler
        * **loss_threshold** - Threshold of loss difference on interval where the sliding stops
        * **adjust_value** - Coefficient for adjustment

        **return** - the optimal learning rate as a float
        """
        # Get loss values and their corresponding gradients, and get lr values
        assert lr_diff < len(losses)
        loss_grad = np.gradient(losses)

        # Search for index in gradients where loss is lowest before the loss spike
        # Initialize right and left idx using the lr_diff as a spacing unit
        # Set the local min lr as -1 to signify if threshold is too low
        r_idx = -1
        l_idx = r_idx - lr_diff
        local_min_lr = lrs[l_idx]
        while (l_idx >= -len(losses)) and (
            abs(loss_grad[r_idx] - loss_grad[l_idx]) > loss_threshold
        ):
            local_min_lr = lrs[l_idx]
            r_idx -= 1
            l_idx -= 1

        lr_to_use = local_min_lr * adjust_value

        return lr_to_use
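# A minimal usage sketch for the trainer class defined above. The encoder setup,
# paths, and column indices are placeholders/assumptions, not values from the
# original code; only the SequenceClassifierTrainer API shown above is relied on.
encoder = EasyDocumentEmbeddings("bert-base-cased")  # assumed: document embeddings wrapper used by the "flair" head

sc_trainer = SequenceClassifierTrainer(
    corpus="/Path/to/data/dir",
    encoder=encoder,
    column_name_map={1: "text", 2: "label"},
)

# Optionally find a learning rate first, then train with it
suggested_lr = sc_trainer.find_learning_rate(output_dir="Path/to/output/dir")
sc_trainer.train(output_dir="Path/to/output/dir",
                 learning_rate=suggested_lr,
                 max_epochs=150)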
    skip_header=True,
    delimiter='\t',  # tab-separated files
)
print(corpus)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()
class_weights = utils.get_inverted_class_balance(corpus.train.dataset)

# 3. initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings(
    'allenai/scibert_scivocab_uncased', fine_tune=True)

# 4. create the text classifier
classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dict,
                            loss_weights=class_weights)

# 5. initialize the text classifier trainer with Adam optimizer
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

# 6. start the training
trainer.train(
    sys.argv[2],
    learning_rate=3e-5,        # use very small learning rate
    mini_batch_size=16,
    mini_batch_chunk_size=4,   # optionally set this if transformer is too much for your machine
    max_epochs=5,              # terminate after 5 epochs
)
def train_tagger(options):
    # Define columns
    columns = {1: 'text', 2: 'pos', 3: 'ner'}

    # What tag should be predicted?
    tag_type = 'ner'

    # Folder in which train, test and dev files reside
    data_folder = options.iob_dir + '/' + options.correction_mode

    # Folder in which to save tagging model and additional information
    tagger_folder = '/'.join([
        options.tagger_dir, options.ner_cycle, options.lm_domain,
        options.correction_mode
    ]) + '-stringemb'

    # Retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        columns,
        train_file='train.txt',
        test_file='test.txt',
        dev_file='dev.txt')

    # Make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Initialize embeddings
    char_embeddings = [
        FlairEmbeddings(options.lm_dir + options.lm_domain + '-fw/best-lm.pt',
                        use_cache=False),
        FlairEmbeddings(options.lm_dir + options.lm_domain + '-bw/best-lm.pt',
                        use_cache=False)
    ]

    if not options.use_wiki_wordemb:
        if not options.use_press_wordemb:
            embedding_types: List[TokenEmbeddings] = char_embeddings
        else:
            embedding_types: List[TokenEmbeddings] = [
                WordEmbeddings('resources.d/embeddings/fasttext/pressfr-wikifr')
            ] + char_embeddings
            tagger_folder = tagger_folder + '-wordemb-pr'
    else:
        embedding_types: List[TokenEmbeddings] = [WordEmbeddings('fr')] + char_embeddings
        tagger_folder = tagger_folder + '-wordemb'

    if options.use_crf:
        tagger_folder = tagger_folder + '-crf'

    # Print information
    print(tagger_folder)
    print(corpus)
    print(tag_dictionary.idx2item)

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=options.use_crf)

    # Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # Start training
    trainer.train(
        tagger_folder,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=50,
        patience=options.train_patience,
        # train_with_dev=True,
        anneal_against_train_loss=False,
        embeddings_in_memory=False)

    # Plot training curves (optional)
    plotter = Plotter()
    plotter.plot_training_curves(tagger_folder + '/loss.tsv')
    plotter.plot_weights(tagger_folder + '/weights.txt')
def create_trainer(tagger, corpus, optimizer=SGD):
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    return trainer
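# A brief usage sketch for the helper above: swap the default SGD optimizer for
# AdamW. The `tagger`/`corpus` objects, output path, and hyperparameter values
# are assumptions for illustration, not part of the original code.
from torch.optim import AdamW

trainer = create_trainer(tagger, corpus, optimizer=AdamW)
trainer.train('resources/taggers/adamw-example',
              learning_rate=5e-5,
              mini_batch_size=32,
              max_epochs=10)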
def main(args):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = flair_utils.standoff_to_flair_sents(corpus.train,
                                                                  tokenizer,
                                                                  verbose=True)
    dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(corpus.dev,
                                                              tokenizer,
                                                              verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer,
                                                                verbose=True)

    flair_corpus = flair_utils.FilteredCorpus(train=train_sents,
                                              dev=dev_sents,
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    if not args.model_file:
        logger.info('Train model...')
        tagger = get_model(
            flair_corpus,
            corpus_name=args.corpus,
            pooled_contextual_embeddings=args.pooled_contextual_embeddings,
            contextual_forward_path=args.contextual_forward_path,
            contextual_backward_path=args.contextual_backward_path)

        trainer = ModelTrainer(tagger, flair_corpus)
        trainer.train(join(model_dir, 'flair'),
                      max_epochs=150,
                      monitor_train=False,
                      train_with_dev=args.train_with_dev)

        if not args.train_with_dev:
            # Model performance is judged by dev data, so we also pick the best performing model
            # according to the dev score to make our final predictions.
            tagger = SequenceTagger.load(join(model_dir, 'flair', 'best-model.pt'))
        else:
            # Training is stopped if train loss converges - here, we do not have a "best model" and
            # use the final model to make predictions.
            pass
    else:
        logger.info('Load existing model from {}'.format(args.model_file))
        tagger = SequenceTagger.load(args.model_file)

    logger.info('Make predictions...')
    make_predictions(tagger, flair_corpus)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=flair_utils.flair_sents_to_standoff(train_sents, train_docs),
        dev=flair_utils.flair_sents_to_standoff(dev_sents, dev_docs),
        test=flair_utils.flair_sents_to_standoff(test_sents, test_docs))