def test_plotting_training_curves_and_weights(resources_path):
    plotter = Plotter()
    plotter.plot_training_curves(resources_path / "visual/loss.tsv")
    plotter.plot_weights(resources_path / "visual/weights.txt")

    # clean up directory
    (resources_path / "visual/weights.png").unlink()
    (resources_path / "visual/training.png").unlink()

def test_plotting_training_curves_and_weights():
    plotter = Plotter()
    plotter.plot_training_curves('./resources/visual/loss.tsv')
    plotter.plot_weights('./resources/visual/weights.txt')

    # clean up directory
    os.remove('./resources/visual/weights.png')
    os.remove('./resources/visual/training.png')

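# A minimal sketch of the `resources_path` fixture assumed by the pathlib
# variant of this test. This fixture is hypothetical (not part of the original
# snippets): it simply points pytest at a `resources` directory next to the
# test file.
import pytest
from pathlib import Path

@pytest.fixture
def resources_path() -> Path:
    return Path(__file__).parent / "resources"
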
def plot_curve(self,
               training_curve_path=os.path.normpath(r'./resources/taggers/slow_bert/loss.tsv'),
               weights_path=os.path.normpath(r'./resources/taggers/slow_bert/weights.txt')):
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(training_curve_path)
    plotter.plot_weights(weights_path)

def main(args):
    args = parser.parse_args()

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt')

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(model_out,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=100)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))

def __init__(self, corpus_name: str):
    corpus = NLPTaskDataFetcher.load_column_corpus(
        loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON]),
        {0: 'text', 1: 'ner'},
        train_file=corpus_name + loc.DIRKSON_VALIDATION_TXT,
        test_file=corpus_name + loc.DIRKSON_TEST_TXT)

    embedding_types = [
        BertEmbeddings('bert-base-uncased'),
        FlairEmbeddings('mix-forward'),
        FlairEmbeddings('mix-backward'),
    ]
    tag_type = 'ner'
    embeddings = StackedEmbeddings(embeddings=embedding_types)
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # create the model directory if it does not exist yet
    model_dir = loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name])
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    trainer.train(model_dir,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    plotter = Plotter()
    plotter.plot_training_curves(
        loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.LOSS_TSV]))
    plotter.plot_weights(
        loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.WEIGHTS_TXT]))

def train(self, trainfile, devfile, testfile, resfolder, embtype="bert",
          chunk_len=100, batch_len=8):
    """
    *** This method can be used to train new models with the settings used in
    project Redewiedergabe. It is not accessible from rwtagger_script and not
    documented in detail. Use at your own risk. ;-) ***

    :param trainfile: path to the training file
    :param devfile: path to the development file
    :param testfile: path to the test file
    :param resfolder: output folder for the trained model and logs
    :param embtype: embedding type, e.g. "bert"
    :param chunk_len: maximum number of tokens per chunk
    :param batch_len: mini-batch size
    :return: None
    """
    emb_name, embeddings = self._get_embeddings(embtype)
    corpus: Corpus = self.create_corpus(trainfile, devfile, testfile, chunk_len)
    tag_dictionary = corpus.make_tag_dictionary(tag_type="cat")

    if not os.path.exists(resfolder):
        os.makedirs(resfolder)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type="cat",
                                            use_crf=True,
                                            rnn_layers=2)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(resfolder,
                  learning_rate=0.1,
                  mini_batch_size=batch_len,
                  max_epochs=150,
                  checkpoint=True)

    # plot training curves
    plotter = Plotter()
    plotter.plot_training_curves(os.path.join(resfolder, 'loss.tsv'))
    plotter.plot_weights(os.path.join(resfolder, 'weights.txt'))

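# Hypothetical usage of the training method above (the owning class is not
# shown in the snippet; the `RWTagger` name and the file paths are assumptions):
# tagger = RWTagger()
# tagger.train('train.tsv', 'dev.tsv', 'test.tsv',
#              resfolder='resources/models/rwtagger_bert',
#              embtype='bert', chunk_len=100, batch_len=8)
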
def train(data_folder, model_output_folder):
    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path=data_folder)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # init Flair embeddings
    flair_forward_embedding = FlairEmbeddings('multi-forward')
    flair_backward_embedding = FlairEmbeddings('multi-backward')

    # init multilingual BERT
    bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        flair_forward_embedding,
        flair_backward_embedding,
        bert_embedding,
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. initialize trainer
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train(model_output_folder, mini_batch_size=256, max_epochs=150)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(model_output_folder + '/loss.tsv')
    plotter.plot_weights(model_output_folder + '/weights.txt')

def main():
    train_dev_corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path(DATA_PATH),
        train_file='flair_train.csv',
        test_file='flair_test.csv',
        dev_file='flair_dev.csv')
    label_dict = train_dev_corpus.make_label_dictionary()

    word_embeddings = [
        WordEmbeddings('crawl'),
        FlairEmbeddings('news-forward-fast', chars_per_chunk=128),
        FlairEmbeddings('news-backward-fast', chars_per_chunk=128),
    ]
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                rnn_type='LSTM',
                                                hidden_size=128,
                                                reproject_words=True,
                                                reproject_words_dimension=64)
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, train_dev_corpus)
    trainer.train(PRETRAINED_FLAIR,
                  max_epochs=40,
                  learning_rate=0.2,
                  mini_batch_size=32,
                  embeddings_in_memory=False,
                  checkpoint=True)

    plotter = Plotter()
    plotter.plot_training_curves(FLAIR_LOSS)
    plotter.plot_weights(FLAIR_WEIGHTS)

    # comment in these lines to use contextual string embeddings
    # CharLMEmbeddings('news-forward'),
    # CharLMEmbeddings('news-backward'),
]

embeddings = WordEmbeddings("tmp/glove.bin")

# 5. initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(hidden_size=1024,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import SequenceTaggerTrainer
trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=8,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
plotter.plot_weights('resources/taggers/example-ner/weights.txt')

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
import shelve
from os import path

#%% Load the classifier and corpus prepared earlier
with shelve.open(path.join('data', 'prepared_data', 'bbc')) as db:
    classifier = db['classifier']
    corpus = db['corpus']

#%% Model trainer definition
trainer = ModelTrainer(classifier, corpus)
model_path = path.join('models', 'bbc')

# 7. start the training
trainer.train(model_path,
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot weight traces (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves(path.join(model_path, 'loss.tsv'))
plotter.plot_weights(path.join(model_path, 'weights.txt'))

#%%
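#%% Hypothetical preparation step (not part of the original snippet): a sketch
# of how the shelve file read above could have been written, assuming
# `classifier` and `corpus` were built beforehand.
import shelve
from os import path

def save_prepared_data(classifier, corpus):
    # shelve pickles the objects under string keys
    with shelve.open(path.join('data', 'prepared_data', 'bbc')) as db:
        db['classifier'] = classifier
        db['corpus'] = corpus
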
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('resources/taggers/ag_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/ag_news/loss.tsv')
plotter.plot_weights('resources/taggers/ag_news/weights.txt')

classifier = TextClassifier.load('resources/taggers/ag_news/final-model.pt')

# create example sentence
sentence = Sentence('France is the current world cup winner.')

# predict tags and print
classifier.predict(sentence)
print(sentence.labels)

def run_experiments(input_dir: Path, output_dir: Path):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # retrieve corpus using column format, data folder and the names of the
    # train, dev and test files
    corpus = ColumnCorpus(
        input_dir,
        {0: 'text', 1: 'dep', 2: 'aspect'},
        train_file='Laptops_poria-train.conll',
        # train_file='Restaurants_poria-train.conll',
        test_file='Laptops_poria-test.conll',
        # test_file='Restaurants_poria-test.conll',
        dev_file='Laptops_poria-train.conll'
        # dev_file='Restaurants_poria-train.conll'
    )

    # 2. what tag do we want to predict?
    tag_type = 'aspect'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    all_embedding_to_test = {
        # 'glove+aspects': [
        #     WordEmbeddings('glove'),
        #     WordEmbeddings(
        #         (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
        #     ),
        # ],
        # 'glove': [
        #     WordEmbeddings('glove'),
        # ],
        # 'charlmembedding': [
        #     FlairEmbeddings('news-forward'),
        #     FlairEmbeddings('news-backward'),
        # ],
        # 'glove-simple-char': [
        #     WordEmbeddings('glove'),
        #     CharacterEmbeddings(),
        # ],
        'bert+aspects': [
            BertEmbeddings('bert-large-cased'),
            WordEmbeddings(
                (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
            )
        ],
        'bert': [
            BertEmbeddings('bert-large-cased'),
        ],
        # 'elmo': [
        #     ELMoEmbeddings('original')
        # ]
    }

    for name, embeddings_to_stack in tqdm(
            all_embedding_to_test.items(),
            desc='Different embeddings stacked',
            total=len(all_embedding_to_test)
    ):
        results_folder = Path(DEFAULT_OUTPUT_PATH / f'sequence-tagging/aspects/laptops-{name}')

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings_to_stack)

        # 5. initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True
        )
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        # 7. start training
        trainer.train(
            results_folder.as_posix(),
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=150
        )

        # 8. plot training curves (optional)
        plotter = Plotter()
        plotter.plot_training_curves(results_folder / 'loss.tsv')
        plotter.plot_weights(results_folder / 'weights.txt')

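# Hypothetical invocation of the experiment runner above (both directories are
# assumptions; DEFAULT_OUTPUT_PATH must be defined in the surrounding module):
# run_experiments(Path('data/poria'), Path('outputs/aspects'))
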
embedding_types: List[TokenEmbeddings] = [
    CharacterEmbeddings(),
    WordEmbeddings("tmp/glove.1.8G.bin"),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(hidden_size=1024,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import SequenceTaggerTrainer
trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus)

# 7. start training
model_path = "tmp/model2"
trainer.train(model_path, learning_rate=0.1, mini_batch_size=8, max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves(f'{model_path}/loss.tsv')
plotter.plot_weights(f'{model_path}/weights.txt')

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# initialize trainer
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train(
    "resources/taggers/example-ner",
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=1,
    shuffle=False,
)

plotter = Plotter()
plotter.plot_training_curves("resources/taggers/example-ner/loss.tsv")
plotter.plot_weights("resources/taggers/example-ner/weights.txt")

def train_all(self):
    config_file = open(self.config, "r")
    if self.config.split('.')[-1] == "yml":
        datastore = yaml.safe_load(config_file)
    elif self.config.split('.')[-1] == "json":
        datastore = json.loads(config_file.read())
    else:
        print("Need a json or yaml file as config")
        sys.exit(1)

    columns = {
        int(datastore["dataset_reader"]["position_text"]): "text",
        int(datastore["dataset_reader"]["position_ner"]): "ner",
    }
    # focus_on = datastore["dataset_reader"]["focus_on"]

    if bool(datastore["dataset_reader"]["only_train"]):
        log.info("Reading data from {}".format(datastore["dataset_reader"]["data_folder"]))
        all_corpus = ColumnCorpusTrain(
            datastore["dataset_reader"]["data_folder"],
            columns,
            train_file=datastore["dataset_reader"]["train_name"],
        )
        tag_type = "ner"
        tag_dictionary = all_corpus[0].make_tag_dictionary(tag_type=tag_type)
    else:
        iobes_corpus = ColumnCorpus(
            datastore["dataset_reader"]["data_folder"],
            columns,
            train_file=datastore["dataset_reader"]["train_name"],
            dev_file=datastore["dataset_reader"]["dev_name"],
            test_file=datastore["dataset_reader"]["test_name"],
        )
        tag_type = "ner"
        tag_dictionary = iobes_corpus.make_tag_dictionary(tag_type=tag_type)
        try:
            train_ratio = float(datastore["dataset_reader"]["train_ratio"])
            iobes_corpus = Corpus(
                iobes_corpus.train[0:int(len(iobes_corpus.train) * train_ratio)],
                iobes_corpus.dev,
                iobes_corpus.test,
            )
            log.info(f"Using only {train_ratio * 100}% of the train dataset")
        except Exception:
            pass

    embed_list = []
    word_char = []
    char_word = []
    for embed in datastore["embeddings"]["embeddings_list"]:
        if embed == "bpe":
            embed_list.append(BytePairEmbeddings(datastore["embeddings"]["lang"]))
        elif embed == "fasttext":
            embed_list.append(WordEmbeddings(datastore["embeddings"]["lang"]))
        elif embed == "flair" and datastore["embeddings"]["lang"] == "en":
            embed_list.append(FlairEmbeddings("news-forward"))
            embed_list.append(FlairEmbeddings("news-backward"))
        elif embed in ("bert-base-uncased", "bert-base-cased",
                       "bert-large-uncased", "bert-large-cased"):
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(BertEmbeddings(embed))
        elif embed in ("elmo-small", "elmo-medium", "elmo-original"):
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(ELMoEmbeddings(embed.split("-")[1]))
        elif embed == "bert-base-chinese":
            if datastore["embeddings"]["lang"] == "zh":
                embed_list.append(emb.BertEmbeddingsChinese("bert-base-chinese"))
        else:
            # custom embedding files, named <...>.<kind>.<ext>
            split_name = embed.split(".")
            ext = split_name[-1]
            kind = split_name[-2]
            if ext == "pt":  # Flair-type language model
                extra_index = 0
                try:
                    extra_index = int(datastore["embeddings"]["extra_index"])
                except Exception:
                    pass
                if kind == "char":
                    embed_list.append(emb.FlairEmbeddingsChar(embed, extra_index=extra_index))
                elif kind == "char-seg":
                    embed_list.append(emb.FlairEmbeddingsWordLevelCharSeg(embed, extra_index=extra_index))
            if ext == "vec":  # word2vec text format
                if kind == "char-seg":
                    embed_list.append(emb.WordEmbeddingsVecCharSeg(embed))
                elif kind == "char":
                    embed_list.append(emb.WordEmbeddingsVecFirst(embed))
elif kind == "word": embed_list.append(emb.WordEmbeddingsVecWord(embed)) elif kind == "bichar": embed_list.append(emb.WordEmbeddingsVecBichar(embed)) if ext == "bin": if kind == "word": embed_list.append(emb.WordEmbeddingsBinWord(embed)) elif kind == "bichar": embed_list.append(emb.WordEmbeddingsBinBichar(embed)) try: if bool(datastore["embeddings"]["ner_embed"]) == True: print("Generate NER embeddings..") embed_list.append( emb.nerEmbedding( generateNerEmbFromTrain( iobes_corpus.train, tag_dictionary.get_items() ) ) ) except: pass try: if bool(datastore["embeddings"]["one_hot"]) == True: print("Generate one hot embeddings..") embed_list.append(emb.OneHotEmbeddings(iobes_corpus)) except: pass try: if datastore["embeddings"]["embeddings_ngram_list"] != None: embed_list.append( emb.WordEmbeddingsVecNGramList( datastore["embeddings"]["embeddings_ngram_list"] ) ) except: pass if len(word_char) == 1 and len(char_word) == 1: embed_list.append(emb.WordEmbeddingsVecWordChar(word_char[0], char_word[0])) embedding_types: List[TokenEmbeddings] = embed_list embeddings: emb.StackedEmbeddingsNew = emb.StackedEmbeddingsNew( embeddings=embedding_types ) if bool(datastore["dataset_reader"]["only_train"]): score = [] for i in range(len(all_corpus)): tagger: SequenceTagger = SequenceTagger( hidden_size=int(datastore["model"]["hidden_size"]), embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=bool(datastore["model"]["use_crf"]), dropout=float(datastore["model"]["dropout"]), word_dropout=float(datastore["model"]["word_dropout"]), locked_dropout=float(datastore["model"]["locked_dropout"]), rnn_layers=int(datastore["model"]["rnn_layers"]), ) folder = datastore["train_config"]["folder"] + "/" + str(i) best = Path(folder + "/checkpoint.pt") iobes_corpus = all_corpus[i] if not best.exists(): best = Path(folder + "/best-model.pt") if best.exists(): trainer = ModelTrainer.load_checkpoint( tagger.load_checkpoint(best), iobes_corpus ) else: trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus) # 7. 
            # 7. start training
            result = trainer.train(
                folder,
                learning_rate=float(datastore["train_config"]["learning_rate"]),
                anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                mini_batch_size=int(datastore["train_config"]["batch_size"]),
                max_epochs=int(datastore["train_config"]["epoch"]),
                save_final_model=bool(datastore["train_config"]["save_final_model"]),
                checkpoint=bool(datastore["train_config"]["checkpoint"]),
                param_selection_mode=bool(datastore["train_config"]["param_selection_mode"]),
                patience=int(datastore["train_config"]["patience"]),
                monitor_test=bool(datastore["train_config"]["monitor_test"]),
                embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                shuffle=bool(datastore["train_config"]["shuffle"]),
            )

            plotter = Plotter()
            if bool(datastore["train_config"]["save_plot_training_curve"]):
                plotter.plot_training_curves(folder + "/loss.tsv")
            if bool(datastore["train_config"]["save_plot_weights"]):
                plotter.plot_weights(folder + "/weights.txt")
            score.append(result["test_score"])

        print(score, "\nAverage:", round(sum(score) / len(score), 2))
    else:
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=int(datastore["model"]["hidden_size"]),
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=bool(datastore["model"]["use_crf"]),
            dropout=float(datastore["model"]["dropout"]),
            word_dropout=float(datastore["model"]["word_dropout"]),
            locked_dropout=float(datastore["model"]["locked_dropout"]),
            rnn_layers=int(datastore["model"]["rnn_layers"]),
        )
        folder = datastore["train_config"]["folder"]
        best = Path(folder + "/checkpoint.pt")
        if not best.exists():
            best = Path(folder + "/best-model.pt")
        if best.exists():
            trainer = ModelTrainer.load_checkpoint(tagger.load_checkpoint(best), iobes_corpus)
        else:
            trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

        # 7. start training
        trainer.train(
            folder,
            learning_rate=float(datastore["train_config"]["learning_rate"]),
            anneal_factor=float(datastore["train_config"]["anneal_factor"]),
            min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
            mini_batch_size=int(datastore["train_config"]["batch_size"]),
            max_epochs=int(datastore["train_config"]["epoch"]),
            save_final_model=bool(datastore["train_config"]["save_final_model"]),
            checkpoint=bool(datastore["train_config"]["checkpoint"]),
            param_selection_mode=bool(datastore["train_config"]["param_selection_mode"]),
            patience=int(datastore["train_config"]["patience"]),
            monitor_test=bool(datastore["train_config"]["monitor_test"]),
            embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
            shuffle=bool(datastore["train_config"]["shuffle"]),
        )

        plotter = Plotter()
        if bool(datastore["train_config"]["save_plot_training_curve"]):
            plotter.plot_training_curves(folder + "/loss.tsv")
        if bool(datastore["train_config"]["save_plot_weights"]):
            plotter.plot_weights(folder + "/weights.txt")

def train(model, selected_embeddings):
    # 1. get the corpus
    if model == 'AMT':
        corpus = read_in_AMT()
    elif model == 'CADEC':
        corpus = read_in_CADEC()
    elif model == 'TwitterADR':
        corpus = read_in_TwitterADR()
    elif model == 'Micromed':
        corpus = read_in_Micromed()
    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. initialize embeddings based on the selection
    embedding_types: List[TokenEmbeddings] = []
    if selected_embeddings['glove']:
        embedding_types.append(WordEmbeddings('glove'))
    if selected_embeddings['twitter']:
        embedding_types.append(WordEmbeddings('twitter'))
    if selected_embeddings['char']:
        embedding_types.append(CharacterEmbeddings())
    # FlairEmbeddings, forward and backward
    if selected_embeddings['flair']:
        embedding_types.append(FlairEmbeddings('news-forward'))
        embedding_types.append(FlairEmbeddings('news-backward'))
    # PooledFlairEmbeddings, forward and backward
    if selected_embeddings['pooled-flair']:
        embedding_types.append(PooledFlairEmbeddings('news-forward', pooling='mean'))
        embedding_types.append(PooledFlairEmbeddings('news-backward', pooling='mean'))
    # init BERT
    if selected_embeddings['bert']:
        embedding_types.append(BertEmbeddings())
    # init RoBERTa
    if selected_embeddings['roberta']:
        embedding_types.append(RoBERTaEmbeddings())
    # init BioBERT
    if selected_embeddings['biobert']:
        embedding_types.append(BertEmbeddings("data/embeddings/biobert-pubmed-pmc-cased"))
    # init clinical BERT
    if selected_embeddings['clinicalbiobert']:
        embedding_types.append(BertEmbeddings("data/embeddings/pretrained_bert_tf/biobert-base-clinical-cased"))
    # init multilingual ELMo
    if selected_embeddings['elmo']:
        embedding_types.append(ELMoEmbeddings())

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    selected_embeddings_text = '_'.join(key for key in selected_embeddings if selected_embeddings[key])
    model_dir = 'resources/taggers/FA_' + model + selected_embeddings_text

    # 7. start training
    trainer.train(model_dir,
                  train_with_dev=True,
                  learning_rate=0.1,
                  mini_batch_size=4,
                  max_epochs=200,
                  checkpoint=True)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(model_dir + '/loss.tsv')
    plotter.plot_weights(model_dir + '/weights.txt')

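# Hypothetical call to the training function above; every key listed here is
# consulted by the function, so all of them must be present in the dict:
# selected = {'glove': True, 'twitter': False, 'char': True, 'flair': True,
#             'pooled-flair': False, 'bert': False, 'roberta': False,
#             'biobert': False, 'clinicalbiobert': False, 'elmo': False}
# train('CADEC', selected)
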
    rnn_layers=1,
    rnn_type='RNN_RELU')

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)

trainer = ModelTrainer(classifier, corpus)
trainer.train('./',
              learning_rate=0.05,
              mini_batch_size=32,
              max_epochs=10,
              evaluation_metric=EvaluationMetric.MACRO_F1_SCORE)

# plot training curves
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('loss.tsv')
plotter.plot_weights('weights.txt')

# make a table with predictions
test_pred = pd.read_csv('test.tsv',
                        sep='\t',
                        encoding="utf-8",
                        names=['text', 'label', 'prediction', 'confidence'])
test_full = pd.concat([data_test_filtered, test_pred], axis=1, sort=False)

# make a table with wrong predictions
df = test_full.loc[test_full['prediction'] != test_full['label']]
wrong_prediction = df[['attribute::id', 'text', 'label', 'prediction', 'confidence']]
wrong_prediction.to_csv("cat_without_lemma_flair_wrong_pred.csv",
                        encoding='utf-8',
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using the Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings
    along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )
    # document_embeddings = DocumentPoolEmbeddings([
    #     stacked_embedding,
    #     FlairEmbeddings('it-forward'),
    #     FlairEmbeddings('it-backward')], pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If a checkpoint file is given, resume training from it
        # checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later
    # time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')

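# Hypothetical invocation of the trainer above (paths and file names are
# assumptions; an empty checkpoint string starts training from scratch):
# trainer(Path('data/sentiment'), ('train.txt', 'dev.txt', 'test.txt'),
#         checkpoint='', stack='bert-multi', n_epochs=25)
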
# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
# Training aborted due to excessive size of documents. With each document
# limited to 5 sentences, training was successfully performed. But the main
# reason I tried this tool was to overcome the maximum length imposed in BERT,
# so a workaround will not be helpful.
trainer.train(base_path=DATA_FOLDER,
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=2,
              # was True in the first example; setting it to False did not
              # solve the problem either
              embeddings_in_memory=False,
              evaluation_metric=EvaluationMetric.MACRO_F1_SCORE)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves(DATA_FOLDER + '/loss.tsv')
plotter.plot_weights(DATA_FOLDER + '/weights.txt')

# Test model
test_data_folder = Path('/Users/buyukozb/git/berfu/thesis/data/all_data/india/flair_formatted/test')
test_sentences = NLPTaskDataFetcher.load_sentences_from_data(test_data_folder, max_seq_len=128)

]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=True)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('C:/Users/jeanc/Documents/reviews/model',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
plotter = Plotter()
plotter.plot_training_curves('C:/Users/jeanc/Documents/reviews/model/loss.tsv')
plotter.plot_weights('C:/Users/jeanc/Documents/reviews/model/weights.txt')

def main(train_file):
    # 1. get the corpus
    # define columns
    columns = {0: 'text', 1: '', 2: '', 3: 'ner'}

    # this is the folder in which train, test and dev files reside
    data_folder = './eng_data_mini_onefile/'

    # retrieve corpus using column format, data folder and the names of the
    # train, dev and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        columns,
        train_file=train_file,
        test_file='eng.testb',
        dev_file='eng.testa')
    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        # comment in these lines to use flair embeddings
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # 6. initialize trainer
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train('resources/taggers/example-ner',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
    plotter.plot_weights('resources/taggers/example-ner/weights.txt')

class SequenceTaggerEvaluation:
    def __init__(self, path: Union[Path, str], model: str = 'final-model.pt'):
        if type(path) == str:
            path = Path(path)
        assert path.exists()
        self.path = path
        self.model = SequenceTagger.load(path / model)
        self.cv_results = {}
        for file in ['summary', 'details']:
            try:
                self.cv_results[file] = pickle.load((path / (file + '.pkl')).open(mode='rb'))
            except FileNotFoundError:
                print(f"{file + '.pkl'} not found. Setting cv_results['{file}'] to None")
        self.plotter = Plotter()

    def result_tables(self, save_as_html: bool = True):
        html_0 = self.cv_results['summary'].to_frame('value').to_html()
        html_1 = self.cv_results['details'].to_html()
        display(HTML(html_0))
        print('\n')
        display(HTML(html_1))
        if save_as_html:
            (self.path / 'summary.html').write_text(html_0)
            (self.path / 'details.html').write_text(html_1)

    def plot_tag_stats(self, mode: str, savefig: bool = False, **kwargs):
        """
        mode
            tp_fn: stacked bar plot - true-positives and false-negatives
            tp_fp: bar plot - true-positives and false-positives
        """
        details = self.cv_results['details']
        if mode == 'tp_fn':
            details[['true-positive', 'false-negative']].plot.bar(stacked=True, **kwargs)
        elif mode == 'tp_fp':
            details[['true-positive', 'false-positive']].plot.bar(stacked=False, **kwargs)
        else:
            details[mode.split('_')].plot.bar(stacked=False, **kwargs)
        plt.gca().yaxis.grid(True, linestyle='--')
        plt.tight_layout()
        if savefig:
            plt.savefig(self.path / (mode + '.png'))

    def confusion_matrix(self):
        # confusion matrix over tags (not implemented yet)
        pass

    def predict(self,
                sentences: Union[str, Sentence, List[Sentence], List[str]],
                display_html: bool = True,
                html_file: str = None,
                display_str: bool = False,
                **kwargs):
        if type(sentences) == Sentence:
            sentences = [sentences]
        elif type(sentences) == str:
            sentences = split_single(sentences)
        if type(sentences[0]) == str:
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

        self.model.predict(sentences)
        if display_html or html_file:
            html = render_ner_html(sentences, **kwargs)
            if display_html:
                display(HTML(html))
            if html_file:
                (self.path / html_file).write_text(html)
        if display_str:
            for sentence in sentences:
                print(sentence.to_tagged_string())

    def plot_training_curves(self, plot_values: List[str] = ["loss", "F1"]):
        self.plotter.plot_training_curves(self.path / 'loss.tsv', plot_values)

    def plot_weights(self):
        self.plotter.plot_weights(self.path / 'weights.txt')

    def plot_learning_rate(self, skip_first: int = 10, skip_last: int = 5):
        self.plotter.plot_learning_rate(self.path / 'loss.tsv', skip_first, skip_last)

    @staticmethod
    def _preprocess(text, mode=None):
        """Helper function to preprocess text. Returns a list of Sentences."""
        sentences = split_single(text)
        if mode:
            nlp = spacy.load('de_core_news_sm')
            if mode == 'lemmatize':
                sentences = [Sentence((' ').join([token.lemma_ for token in nlp(s)]))
                             for s in sentences]
            elif mode == 'stem':
                stemmer = GermanStemmer()
                sentences = [Sentence((' ').join([stemmer.stem(token.text) for token in nlp(s)]))
                             for s in sentences]
        else:
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]
        return sentences

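# A minimal usage sketch for the evaluation class above; the model directory
# is an assumption and must contain final-model.pt, loss.tsv and weights.txt:
# ev = SequenceTaggerEvaluation('resources/taggers/example-ner')
# ev.plot_training_curves()
# ev.plot_weights()
# ev.predict('France is the current world cup winner.',
#            display_html=False, display_str=True)
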
# Just replace the model names below to plot the training details and weights
from flair.visual.training_curves import Plotter

plotter = Plotter()
plotter.plot_training_curves('FLAIR/resources/taggers/flairpos1/loss.tsv')
plotter.plot_weights('FLAIR/resources/taggers/flairpos1/weights.txt')

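# Both calls write their PNGs next to the input files: plot_training_curves()
# saves training.png and plot_weights() saves weights.png in the same
# directory (the unit tests above clean up exactly those two files).
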
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        rnn_layers=2,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/usDL2',
              learning_rate=0.01,
              embeddings_in_memory=False,
              mini_batch_size=32,
              max_epochs=150,
              checkpoint=True)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/usDL2/loss.tsv')
plotter.plot_weights('resources/taggers/usDL2/weights.txt')

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(
    hidden_size=nb_cells,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# initialize trainer
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train(
    "resources/taggers/" + exp_name,
    learning_rate=0.1,
    embeddings_storage_mode="cpu",
    mini_batch_size=32,
    max_epochs=150,
    shuffle=False,
)

plotter = Plotter()
plotter.plot_training_curves("resources/taggers/" + exp_name + "/loss.tsv")
plotter.plot_weights("resources/taggers/" + exp_name + "/weights.txt")

def main():
    datasets = os.listdir("./datasets")
    print(datasets)

    language = "fr"
    nb_cells = 32
    dataset = "DESFOSSE_ARRAY"
    exp_name = dataset + "_" + str(nb_cells)

    # 1. get the corpus
    columns = {0: 'text', 1: 'position', 2: "array", 3: "line", 4: "col"}

    # this is the folder in which train, test and dev files reside
    data_folder = './datasets/' + dataset

    # init a corpus using column format, data folder and the names of the
    # train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file="train_" + dataset + '.txt',
                                  test_file="test_" + dataset + '.txt',
                                  dev_file="valid_" + dataset + '.txt')
    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = "col"
    exp_name = dataset + "_" + tag_type

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = []
    embedding_types.append(FlairEmbeddings(language + '-forward'))
    embedding_types.append(FlairEmbeddings(language + '-backward'))
    embedding_types.append(FloatsEmbeddings(field='position', length=4))

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # initialize sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=nb_cells,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    # initialize trainer
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        "resources/taggers/" + exp_name,
        learning_rate=0.1,
        embeddings_storage_mode="cpu",
        mini_batch_size=32,
        max_epochs=150,
        shuffle=False,
    )

    plotter = Plotter()
    plotter.plot_training_curves("resources/taggers/" + exp_name + "/loss.tsv")
    plotter.plot_weights("resources/taggers/" + exp_name + "/weights.txt")

    predict_tagger(setId, nb_cells, rubric, rubric)

from flair.visual.training_curves import Plotter

plotter = Plotter()

# plotter.plot_weights('flair_outputs_glove/weights.txt')
# plotter.plot_training_curves('flair_outputs_glove/loss.tsv')
# plotter.plot_learning_rate('flair_outputs_glove/loss.tsv')

plotter.plot_weights("flair_outputs_fastText/weights.txt")
plotter.plot_training_curves("flair_outputs_fastText/loss.tsv")
plotter.plot_learning_rate("flair_outputs_fastText/loss.tsv")

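# Optional: a dedicated learning-rate sweep can be plotted the same way. A
# hedged sketch, assuming a ModelTrainer instance as in the snippets above
# (the exact find_learning_rate signature varies between Flair versions):
# lr_tsv = trainer.find_learning_rate('flair_outputs_fastText', 'learning_rate.tsv')
# plotter.plot_learning_rate(lr_tsv)
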
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionaries=tag_dictionaries,
                                        tag_types=tag_types,
                                        use_crf=True)

# initialize trainer
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
# trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=Adam)

trainer.train('resources/taggers/famulus_eda_test_n_bert_long2',
              EvaluationMetric.MICRO_F1_SCORE,
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=1000,
              test_mode=True)
# trainer.train('resources/taggers/famulus_test', EvaluationMetric.MICRO_F1_SCORE,
#               learning_rate=0.001, mini_batch_size=32, max_epochs=100, test_mode=True)

plotter = Plotter()
plotter.plot_training_curves('resources/taggers/famulus_eda_test_n_bert_long2/loss.tsv')
plotter.plot_weights('resources/taggers/famulus_eda_test_n_bert_long2/weights.txt')

from flair.visual.training_curves import Plotter

clf_dir = 'resources/binary_unbiased_031219/'
plotter = Plotter()
plotter.plot_training_curves(clf_dir + 'loss.tsv')
plotter.plot_weights(clf_dir + 'weights.txt')

def test_plotting_training_curves_and_weights(resources_path):
    plotter = Plotter()
    plotter.plot_training_curves(resources_path / 'visual/loss.tsv')
    plotter.plot_weights(resources_path / 'visual/weights.txt')

    # clean up directory
    (resources_path / 'visual/weights.png').unlink()
    (resources_path / 'visual/training.png').unlink()