def train():
    # load training data in FastText format
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='./data/test.txt',
        train_file='./data/train.txt')

    # combine different embeddings:
    # GloVe word embeddings + Flair contextual string embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]

    # use an LSTM-based method for combining the different embeddings
    document_embeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./models', max_epochs=10)
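# A minimal sketch of how the model trained above could be used for prediction.
# Assumptions: training wrote best-model.pt into './models' (the directory passed
# to trainer.train) and the example sentence is made up; load_from_file and
# predict are the same calls that appear in the later snippets.
from pathlib import Path

from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load_from_file(Path('./models') / 'best-model.pt')

sentence = Sentence('the food was great and the delivery was fast')
classifier.predict(sentence)

# each predicted label carries a value and a confidence score
for label in sentence.labels:
    print(label.value, label.score)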
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class"
    )
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        mini_batch_size=1,
        max_epochs=100,
        test_mode=True,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(embeddings=[glove_embedding],
                                                 hidden_size=32,
                                                 reproject_words=False,
                                                 bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100,
                  test_mode=True,
                  checkpoint=False)

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    # split the ids into 70% train, 20% test and 10% dev
    ids = df['id'].tolist()
    nSamples = len(ids)
    idx70 = int(nSamples * 0.7)
    idx90 = int(nSamples * 0.9)
    train_ids = ids[:idx70]
    test_ids = ids[idx70:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        # build the split files inside the temporary directory
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        df[df['id'].isin(train_ids)].to_csv(trainCsv, columns=columns, sep='\t',
                                            index=False, header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv, columns=columns, sep='\t',
                                           index=False, header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv, columns=columns, sep='\t',
                                          index=False, header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir,
            train_file=trainCsv,
            test_file=testCsv,
            dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)
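# A hypothetical call of trainFlairClassifier, assuming the DataFrame has an 'id'
# column plus label/text columns and that the labels already carry the FastText-style
# '__label__' prefix expected by load_classification_corpus (as in the snippets below).
# Column names, file names and the toy data are illustrative only.
import pandas as pd

df = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'label': ['__label__pos', '__label__neg', '__label__pos', '__label__neg'],
    'text': ['works as advertised', 'broke after a day',
             'really happy with it', 'total waste of money'],
})

trainFlairClassifier(df,
                     columns=['label', 'text'],
                     trainNameCsv='train.csv',
                     testNameCsv='test.csv',
                     devNameCsv='dev.csv',
                     classifierFileName='classifier.pt')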
def create_corpus():
    if os.path.exists('./data/train.csv') and os.path.exists(
            './data/test.csv') and os.path.exists('./data/dev.csv'):
        corpus = NLPTaskDataFetcher.load_classification_corpus(
            Path('./data'),
            test_file='test.csv',
            dev_file='dev.csv',
            train_file='train.csv')
        return corpus
    else:
        return 0
def main(args):
    args = parser.parse_args()

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt')

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(model_out,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=100)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))
def train(self):
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path(self.corpus_path),
        test_file="test_clean_text.txt",
        dev_file="dev_clean_text.txt",
        train_file="train_clean_text.txt")

    embeddings = [WordEmbeddings(self.word_emb_path),
                  FlairEmbeddings('polish-forward'),
                  FlairEmbeddings('polish-backward')]
    document_embeddings = DocumentRNNEmbeddings(embeddings,
                                                hidden_size=self.hidden_size,
                                                bidirectional=True)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=corpus.make_label_dictionary(),
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(self.model_path,
                  evaluation_metric=EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=self.epochs)
def classify(data, labels, test, train, validation):
    # split documents and labels into train / test / validation sets
    train_data = [k for k in data.keys() if k in train]
    train_labels = [labels[k] for k in train_data]
    train_data = [data[k] for k in train_data]

    test_data = [k for k in data.keys() if k in test]
    test_labels = [labels[k] for k in test_data]
    test_data = [data[k] for k in test_data]

    validation_data = [k for k in data.keys() if k in validation]
    validation_labels = [labels[k] for k in validation_data]
    validation_data = [data[k] for k in validation_data]

    save_training_files(train_data, train_labels, test_data, test_labels,
                        validation_data, validation_labels)

    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='test.txt',
        dev_file='dev.txt',
        train_file='train.txt')

    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    doc_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                           hidden_size=512,
                                           reproject_words=True,
                                           reproject_words_dimension=256)

    classifier = TextClassifier(
        doc_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./', max_epochs=25)

    # evaluate the best model on the validation set
    classifier = TextClassifier.load_from_file('./best-model.pt')
    validation_data = [Sentence(x) for x in validation_data]
    for x in validation_data:
        classifier.predict(x)
    predicted = [int(x.labels[0].value) for x in validation_data]

    remove_training_files()

    precision, recall, f1, _ = precision_recall_fscore_support(
        validation_labels, predicted, average='binary')
    return {
        'precision': float("{:.3f}".format(round(precision, 3))),
        'recall': float("{:.3f}".format(round(recall, 3))),
        'f1': float("{:.3f}".format(round(f1, 3)))
    }
def train_flair(dir_name):
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        os.path.join("flair", dir_name),
        test_file='test.csv',
        dev_file='validation.csv',
        train_file='train.csv')

    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=512,
                                                 reproject_words=True,
                                                 reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(os.path.join("flair", dir_name), max_epochs=10)
def train(path, test, dev, train):
    corpus = NLPTaskDataFetcher.load_classification_corpus(Path(path),
                                                           test_file=test,
                                                           dev_file=dev,
                                                           train_file=train)

    # word_embeddings = [WordEmbeddings(FASTTEXT), BertEmbeddings(BERT)]
    word_embeddings = [WordEmbeddings(FASTTEXT)]
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=32,
                                                reproject_words=True,
                                                reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(path,
                  save_final_model=False,
                  train_with_dev=True,
                  param_selection_mode=True)

    return path + 'best-model.pt'
def main():
    train_dev_corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path(DATA_PATH),
        train_file='flair_train.csv',
        test_file='flair_test.csv',
        dev_file='flair_dev.csv')

    label_dict = train_dev_corpus.make_label_dictionary()

    word_embeddings = [
        WordEmbeddings('crawl'),
        FlairEmbeddings('news-forward-fast', chars_per_chunk=128),
        FlairEmbeddings('news-backward-fast', chars_per_chunk=128)
    ]
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                rnn_type='LSTM',
                                                hidden_size=128,
                                                reproject_words=True,
                                                reproject_words_dimension=64)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, train_dev_corpus)
    trainer.train(PRETRAINED_FLAIR,
                  max_epochs=40,
                  learning_rate=0.2,
                  mini_batch_size=32,
                  embeddings_in_memory=False,
                  checkpoint=True)

    plotter = Plotter()
    plotter.plot_training_curves(FLAIR_LOSS)
    plotter.plot_weights(FLAIR_WEIGHTS)
                 index=False, header=False)
data_test.to_csv('flair_test.csv', sep='\t', index=False, header=False)

# train a model
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings, CharLMEmbeddings
from flair.trainers import ModelTrainer
from flair.data import TaggedCorpus
from pathlib import Path

# 'es': Spanish FastText embeddings (Wiki)
data_folder = Path('./')
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
    data_folder,
    test_file='flair_test.csv',
    dev_file='flair_dev.csv',
    train_file='flair_train.csv')
print(corpus)
print(len(corpus.train))

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    [WordEmbeddings('ca')],
])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
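# A sketch of how the search space above is typically handed to Flair's parameter
# selector (TextClassifierParamSelector is imported in a later snippet). The base
# path, epoch count, number of training runs and max_evals here are assumptions,
# not values from the original script.
from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue

param_selector = TextClassifierParamSelector(
    corpus,
    multi_label=False,
    base_path='resources/hyperopt',
    document_embedding_type='lstm',
    max_epochs=10,
    training_runs=1,
    optimization_value=OptimizationValue.DEV_SCORE)

# run hyperopt over the search space defined above
param_selector.optimize(search_space, max_evals=10)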
from pathlib import Path

from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.visual.training_curves import Plotter

# we use our own data set
data_folder = Path('C:/Users/jeanc/Documents/reviews/classif')

# 1. load corpus containing training, test and dev data
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
    data_folder,
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt')

# statistics about the dataset
stats = corpus.obtain_statistics()
print(stats)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [
    WordEmbeddings('glove'),
    # comment in flair embeddings for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
from pathlib import Path

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import (WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings,
                              DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings,
                              BytePairEmbeddings)
from flair.hyperparameter.param_selection import (TextClassifierParamSelector,
                                                  OptimizationValue, SearchSpace, Parameter)
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from hyperopt import hp

corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('../TASS2019/DATASETS/public_data/cr'),
    train_file='intertass_cr_train.txt',
    dev_file='intertass_cr_dev_prevTASS.txt',
    test_file='intertass_cr_dev.txt')

# word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('spanish-forward-fast'), FlairEmbeddings('spanish-backward-fast')]
# word_embeddings = [WordEmbeddings('/media/lfdharo/97481d74-4cb5-4983-9a69-a748c32711ba/Data/Models/Glove/glove-sbwc_spanish.i25.vec'),
#                    FlairEmbeddings('spanish-forward-fast'),
#                    FlairEmbeddings('spanish-backward-fast')]
# word_embeddings = [BertEmbeddings('bert-base-multilingual-cased'),
#                    FlairEmbeddings('spanish-forward-fast'),
#                    FlairEmbeddings('spanish-backward-fast')]
# word_embeddings = [WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec'),
#                    WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec'),
#                    ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/')]
# word_embeddings = [FlairEmbeddings('spanish-forward-fast'), FlairEmbeddings('spanish-backward-fast')]
from pathlib import Path
from fastai.text import *

X = X_train_s.to_list() + X_test_s.to_list() + X_dev_s.to_list()
y = y_train_s.to_list() + y_test_s.to_list() + y_dev_s.to_list()

data = pd.DataFrame(list(zip(y, X)), columns=['label', 'text'])
data['label'] = '__label__' + data['label'].map({1: 'Insincere', 0: 'Sincere'})

data.iloc[0:int(len(data) * 0.8)].to_csv(path + 'flair/trainflair.csv',
                                         sep='\t', index=False, header=False)
data.iloc[int(len(data) * 0.8):int(len(data) * 0.9)].to_csv(path + 'flair/testflair.csv',
                                                            sep='\t', index=False, header=False)
data.iloc[int(len(data) * 0.9):].to_csv(path + 'flair/devflair.csv',
                                        sep='\t', index=False, header=False)

corpus = NLPTaskDataFetcher.load_classification_corpus(Path(path + 'flair/'),
                                                       test_file='testflair.csv',
                                                       dev_file='devflair.csv',
                                                       train_file='trainflair.csv')

word_embeddings = [WordEmbeddings('glove'),
                   FlairEmbeddings('news-forward-fast'),
                   FlairEmbeddings('news-backward-fast')]

document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                             hidden_size=512,
                                             reproject_words=True,
                                             reproject_words_dimension=256)

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)
#%% FLAIR
w_em = [
    WordEmbeddings('pl'),
    FlairEmbeddings('polish-forward'),
    FlairEmbeddings('polish-backward')
]
d_em = DocumentLSTMEmbeddings(w_em,
                              hidden_size=512,
                              reproject_words=True,
                              reproject_words_dimension=256)

#%%
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('resources/fasttext/'),
    test_file='test-3.txt',
    dev_file='val-3.txt',
    train_file='train-3.txt')

classifier = TextClassifier(d_em,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)

#%%
trainer = ModelTrainer(classifier, corpus)
trainer.train('resources/fasttext/', max_epochs=5)

# 2019-05-20 13:13:38,072 Testing using best model ...
# 2019-05-20 13:13:38,074 loading file resources/fasttext/best-model.pt
# 2019-05-20 13:13:49,123 MICRO_AVG: acc 0.481 - f1-score 0.6495
# 2019-05-20 13:13:49,128 MACRO_AVG: acc 0.4318 - f1-score 0.5839
# 2019-05-20 13:13:49,130 False  tp: 27 - fp: 12 - fn: 63 - tn: 112 - precision: 0.6923 - recall: 0.3000 - accuracy: 0.2647 - f1-score: 0.4186
from pathlib import Path

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

corpus = NLPTaskDataFetcher.load_classification_corpus(Path("data/"),
                                                       test_file="test.csv",
                                                       dev_file="dev.csv",
                                                       train_file="train.csv")

word_embeddings = [
    WordEmbeddings("glove"),
    FlairEmbeddings("news-forward-fast"),
    FlairEmbeddings("news-backward-fast"),
]
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)

trainer = ModelTrainer(classifier, corpus)
trainer.train("model/", max_epochs=10)
import flair
import torch

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

if __name__ == '__main__':
    # flair.device = torch.device('cpu')
    for dir in ['ten']:
        print(dir)
        corpus = NLPTaskDataFetcher.load_classification_corpus(
            Path('./'),
            test_file='validation/%s/labeled.txt' % dir,
            dev_file='testing/max_50/labeled.txt',
            train_file='training/max_50/labeled.txt')

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast', chars_per_chunk=64)
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=256,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
                                        sep='\t',
                                        index=False,
                                        header=False)
data.iloc[int(len(data) * 0.9):].to_csv('dev.csv',
                                        sep='\t',
                                        index=False,
                                        header=False)

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')

# word embeddings give each individual word a vector representation
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
]

# document embeddings combine the individual word embeddings into a single vector for the whole document
document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                             hidden_size=512,
                                             reproject_words=True,
                                             reproject_words_dimension=256)
else:
    prefix += '_full'

train_filename = "intertass_" + sLang + prefix + "_train.txt"
dev_filename = "intertass_" + sLang + prefix + "_dev.txt"
tst_filename = "intertass_" + sLang + "_test_prevTASS.txt"
eval_filename = "intertass_" + sLang + prefix + "_test.txt"

labels = ['ID', 'N', 'NEU', 'NONE', 'P']

# train_data = read_file(data_path + sLang + "/intertass_" + sLang + "_train.xml")
# train_data = resample_dataset(train_data)
# train_filename = "intertass_" + sLang + "_train.txt2"
# save_file(train_data, data_path + sLang + "/intertass_" + sLang + "_train.txt2")

if bTestPhase is False:
    corpus = NLPTaskDataFetcher.load_classification_corpus(Path(data_path + sLang),
                                                           train_file=train_filename,
                                                           dev_file=dev_filename,
                                                           test_file=tst_filename)
else:
    train_filename = train_filename.replace('_train.txt', '_train_dev.txt')
    corpus = NLPTaskDataFetcher.load_classification_corpus(Path(data_path + sLang),
                                                           train_file=train_filename,
                                                           test_file=tst_filename)

# prefix_model_output_dir = ''
# if FLAIR_EMBEDDINGS is True:
#     embeddings = [FlairEmbeddings('spanish-forward-fast'),
#                   FlairEmbeddings('spanish-backward-fast')]
#
#     # document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512,
#     #                                             reproject_words=True,
#     #                                             reproject_words_dimension=256,