def test_document_bidirectional_lstm_embeddings():
    """A bidirectional document LSTM must yield a 512-dim sentence embedding."""
    sentence, glove, charlm = init_document_embeddings()
    doc_embeddings = DocumentLSTMEmbeddings(
        [glove, charlm], hidden_size=128, bidirectional=True)
    doc_embeddings.embed(sentence)

    # 128 hidden units, bidirectional -> advertised length of 512
    assert len(sentence.get_embedding()) == 512
    assert len(sentence.get_embedding()) == doc_embeddings.embedding_length

    # clearing must drop the cached vector again
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
def run_splits(word_embeddings, embeddings_name):
    """Train one LSTM-based text classifier per pre-generated corpus split (1..5).

    Results for each split are written under the split folder, in a
    subdirectory named after ``embeddings_name``.
    """
    for split_id in range(1, 6):
        print('##########')
        print('Split', str(split_id))
        print('##########')

        split_folder = '<path_to_splits>/split_' + str(split_id) + '/'
        corpus = ClassificationCorpus(
            split_folder,
            test_file='test.csv',
            dev_file='dev.csv',
            train_file='train.csv')

        # combine the supplied word embeddings into one document vector
        doc_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        ModelTrainer(classifier, corpus).train(
            split_folder + '/' + embeddings_name, max_epochs=150)
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    """Train on uncached Flair embeddings, then reload the model and predict."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    flair_embedding: TokenEmbeddings = FlairEmbeddings(
        'news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [flair_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    ModelTrainer(model, corpus).train(results_base_path, max_epochs=2, test_mode=True)

    # every predicted label must carry a value and a probability in [0, 1]
    for tagged in model.predict(Sentence("Berlin is a really nice city.")):
        for label in tagged.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # the serialized model must load and predict, including on empty input
    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Training must be resumable from a serialized checkpoint file."""
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)
    classifier = TextClassifier(document_embeddings, label_dict, False)

    # first run writes 'checkpoint.pt' into the results directory
    first_trainer = ModelTrainer(classifier, corpus)
    first_trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # second run resumes from that checkpoint and trains again
    resumed_trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    resumed_trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier():
    """Train on CharLM embeddings (legacy API), check predictions, then reload."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    TextClassifierTrainer(model, corpus, label_dict, False).train(
        './results', max_epochs=2)

    # every predicted label must carry a value and a probability in [0, 1]
    for tagged in model.predict(Sentence("Berlin is a really nice city.")):
        for label in tagged.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # reload the final model and make sure it predicts, even on empty input
    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
def train():
    """Train a single-label text classifier from FastText-format data in ./data.

    The trained model and logs are written to ./models.
    """
    # load training data in FastText format
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='./data/test.txt',
        train_file='./data/train.txt')

    # stacked embeddings: GloVe word vectors + forward/backward Flair LMs
    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]

    # combine the token embeddings into one document vector with an LSTM
    document_embeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    ModelTrainer(classifier, corpus).train('./models', max_epochs=10)
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    """Train on Flair embeddings, validate predictions, then reload and reuse."""
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    ModelTrainer(model, corpus).train(
        results_base_path, EvaluationMetric.MACRO_F1_SCORE,
        max_epochs=2, test_mode=True)

    # each prediction must expose a label value and a probability in [0, 1]
    for tagged in model.predict(Sentence('Berlin is a really nice city.')):
        for label in tagged.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # the persisted model must load and predict, including on empty input
    loaded_model = TextClassifier.load_from_file(
        results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    shutil.rmtree(results_base_path)
def init(tasks_base_path):
    """Load AG_NEWS and build an untrained GloVe-based classifier.

    Returns a (corpus, label_dictionary, model) triple for use by the tests.
    """
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    """Train a multi-label classifier and verify predicted labels and scores."""
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False)
    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    ModelTrainer(model, corpus).train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        max_epochs=100,
        test_mode=True,
        checkpoint=False)

    # first pass: all predicted labels carry a value and a valid score
    for tagged in model.predict(Sentence('apple tv')):
        for label in tagged.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # second pass: both expected classes must be among the predictions
    sentence = Sentence("apple tv")
    for tagged in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()
        for label in tagged.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # the reloaded model must predict, including on empty sentences
    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def init() -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    """Fetch AG_NEWS and assemble an untrained GloVe-based text classifier."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    """Train a Flair TextClassifier on a 70/20/10 split of ``df`` and save it.

    Parameters:
        df: pandas DataFrame with an 'id' column plus the columns in ``columns``.
        columns: columns written to the intermediate tab-separated CSV files.
        trainNameCsv / testNameCsv / devNameCsv: file names for the split CSVs.
        classifierFileName: path the trained classifier is saved to.
    """
    import os

    ids = df['id'].tolist()
    nSamples = len(ids)
    # NOTE(review): despite the original 'idx80'/'idx90' names, the split is
    # 70/20/10 (train/test/dev), not 80/10/10 -- preserved as-is.
    idxTrainEnd = int(nSamples * 0.7)
    idxTestEnd = int(nSamples * 0.9)
    train_ids = ids[:idxTrainEnd]
    test_ids = ids[idxTrainEnd:idxTestEnd]
    dev_ids = ids[idxTestEnd:]

    with TemporaryDirectory() as temp_dir:
        # BUG FIX: TemporaryDirectory names carry no trailing separator, so the
        # original 'temp_dir + name' concatenation produced sibling paths like
        # '/tmp/tmpabc123train.csv' that escaped the directory and its cleanup.
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        # write the three splits as tab-separated, headerless CSVs
        df[df['id'].isin(train_ids)].to_csv(trainCsv, columns=columns, sep='\t',
                                            index=False, header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv, columns=columns, sep='\t',
                                           index=False, header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv, columns=columns, sep='\t',
                                          index=False, header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        # GloVe word vectors + forward/backward Flair language models
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)
def main(args): args = parser.parse_args() # 1. get the corpus corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus( args.data_dir[0], train_file='train.txt', dev_file='dev.txt', test_file='test.txt') # 2. create the label dictionary label_dict = corpus.make_label_dictionary() # 3. make a list of word embeddings word_embeddings = [ WordEmbeddings('glove'), # comment in flair embeddings for state-of-the-art results # FlairEmbeddings('news-forward'), # FlairEmbeddings('news-backward'), # ELMoEmbeddings() ] # 4. init document embedding by passing list of word embeddings document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings( word_embeddings, hidden_size=128, reproject_words=True, reproject_words_dimension=64, ) # 5. create the text classifier classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False) # 6. initialize the text classifier trainer trainer = ModelTrainer(classifier, corpus) # 7. start the training model_out = 'resources/classifiers/sentence-classification/glove' trainer.train(model_out, learning_rate=0.1, mini_batch_size=32, anneal_factor=0.5, patience=5, max_epochs=100) # 8. plot training curves (optional) from flair.visual.training_curves import Plotter plotter = Plotter() plotter.plot_training_curves(join(model_out, 'loss.tsv')) plotter.plot_weights(join(model_out, 'weights.txt'))
def test_document_bidirectional_lstm_embeddings_using_first_representation():
    """Embedding a sentence must populate a vector of the advertised width."""
    sentence, glove, charlm = init_document_embeddings()
    doc_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove, charlm], hidden_size=128, bidirectional=True)
    doc_embeddings.embed(sentence)

    # non-empty vector whose second dimension matches embedding_length
    assert len(sentence.get_embedding()) != 0
    assert sentence.get_embedding().shape[1] == doc_embeddings.embedding_length

    # clearing must remove the cached embedding again
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
def test_document_lstm_embeddings():
    """A unidirectional document LSTM must produce a non-empty embedding."""
    sentence, glove, charlm = init_document_embeddings()
    # BUG FIX: the keyword was 'hidden_states', which DocumentLSTMEmbeddings
    # does not accept (the sibling tests above pass 'hidden_size'), so the
    # constructor raised a TypeError before any assertion ran.
    embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove, charlm], hidden_size=128, bidirectional=False)
    embeddings.embed(sentence)
    assert (len(sentence.get_embedding()) != 0)
    assert (sentence.get_embedding().shape[1] == embeddings.embedding_length)
    sentence.clear_embeddings()
    assert (len(sentence.get_embedding()) == 0)
def _set_up_model(self, params):
    """Assemble a TextClassifier from the selected hyper-parameters.

    Only the keys listed in DOCUMENT_EMBEDDING_PARAMETERS are forwarded to
    the document-embedding constructor.
    """
    embedding_params = {
        key: value
        for key, value in params.items()
        if key in DOCUMENT_EMBEDDING_PARAMETERS
    }

    # choose the embedding family configured for this search
    if self.document_embedding_type == 'lstm':
        document_embedding = DocumentLSTMEmbeddings(**embedding_params)
    else:
        document_embedding = DocumentPoolEmbeddings(**embedding_params)

    return TextClassifier(
        label_dictionary=self.label_dictionary,
        multi_label=self.multi_label,
        document_embeddings=document_embedding)
def test_text_classifier_single_label():
    """Smoke-test two epochs of single-label training on IMDB."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    TextClassifierTrainer(model, corpus, label_dict, False).train(
        './results', max_epochs=2)

    # clean up results directory
    shutil.rmtree('./results')
def text_classification():
    """Train a multi-label AG_NEWS classifier over GloVe + CharLM embeddings."""
    corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)

    # drop zero-length sentences from every split before training
    corpus.train = [sentence for sentence in corpus.train if len(sentence) > 0]
    corpus.test = [sentence for sentence in corpus.test if len(sentence) > 0]
    corpus.dev = [sentence for sentence in corpus.dev if len(sentence) > 0]
    print("corpus created")

    label_dict = corpus.make_label_dictionary()
    print("created label dict")

    # GloVe word vectors plus forward/backward character language models
    word_embeddings = [
        WordEmbeddings('glove'),
        CharLMEmbeddings('news-forward'),
        CharLMEmbeddings('news-backward')
    ]
    print("loaded word embeddings")

    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_states=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    print("loaded document embeddings")

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)
    print("created classifier")

    trainer = TextClassifierTrainer(classifier, corpus, label_dict)
    print("starting training")

    trainer.train('results',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=50)
    print("training finished")
def train_flair(dir_name):
    """Train a Polish text classifier on the corpus under flair/<dir_name>.

    The trained model and logs are written back into the same directory.
    """
    data_dir = os.path.join("flair", dir_name)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_dir,
        test_file='test.csv',
        dev_file='validation.csv',
        train_file='train.csv')

    # Polish word vectors plus forward/backward Flair language models
    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    document_embeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    ModelTrainer(classifier, corpus).train(data_dir, max_epochs=10)
def test_text_classifier_single_label():
    """Train briefly on IMDB and sanity-check the predicted labels (legacy API)."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    TextClassifierTrainer(model, corpus, label_dict, False).train(
        './results', max_epochs=2)

    # every predicted label exposes a name and a confidence in [0, 1]
    for tagged in model.predict(Sentence("Berlin is a really nice city.")):
        for label in tagged.labels:
            assert label.name is not None
            assert 0.0 <= label.confidence <= 1.0
            assert type(label.confidence) is float

    # clean up results directory
    shutil.rmtree('./results')
# Train a multi-label text classifier on a CSV classification corpus located
# in the current directory (train.csv / dev.csv / test.csv).
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')

#word embeddings for tagging individual words as vector representation
word_embeddings = [WordEmbeddings('glove'),
                   FlairEmbeddings('news-forward-fast'),
                   FlairEmbeddings('news-backward-fast')]

#document embeddings for taking individual word embeddings to tag a whole document as vector representation
document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                             hidden_size=512,
                                             reproject_words=True,
                                             reproject_words_dimension=256)

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=True)

# train for up to 10 epochs; model and logs land in the current directory
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)
def train(self, X, y):
    """Fit a Flair text classifier on raw tweet texts.

    X: 2-D array whose column ``self.args.TEXT_COL`` holds the tweet text.
    y: label array; flattened and stringified before being attached.
    """
    X_text = X[:, self.args.TEXT_COL]
    y = y.flatten()

    #corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)
    # build one labelled Sentence per tweet; empty tweets get a placeholder
    # token so Sentence construction does not receive empty input
    train: List[Sentence] = []
    for tweet, label in zip(X_text, y):
        if tweet == '':
            tweet = 'dummy'
        s: Sentence = Sentence(tweet)
        s.add_label(str(label))
        train.append(s)

    # NOTE(review): train/dev/test all alias the same list, so any dev/test
    # scores reported during training are training-set scores -- confirm.
    corpus: TaggedCorpus = TaggedCorpus(train, train, train)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    # (glove_embeddings, fflair, bflair are defined outside this method)
    word_embeddings = [
        glove_embeddings,
        #twitter_embeddings,
        # comment in this line to use character embeddings
        #CharacterEmbeddings(),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        fflair,
        # FlairEmbeddings('news-backward'),
        bflair
    ]

    # 4. initialize document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # redirect the model's save hooks to this wrapper BEFORE training so that
    # any checkpointing during training goes through self.save/self.save_checkpoint
    self.model = trainer.model
    self.model.save = self.save
    self.model.save_checkpoint = self.save_checkpoint

    # 7. start the training
    trainer.train('../data/ecuador_earthquake_2016/models',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=5)

    self.clf = classifier
# GermEval 2018: train a German single-label text classifier from the
# pre-processed FastText-format files in the working directory.
sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('training.preprocessed.txt')
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('dev.preprocessed.txt')
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('test.preprocessed.txt')

corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings (German fastText + CharLM both directions)
word_embeddings = [WordEmbeddings('de-fasttext'),
                   CharLMEmbeddings('german-forward'),
                   CharLMEmbeddings('german-backward')]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_states=32)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)

# 7. start the training (embeddings kept out of memory to limit RAM use)
trainer.train('resources/germeval_2018/results',
              learning_rate=0.01,
              mini_batch_size=8,
              max_epochs=30,
              embeddings_in_memory=False)
def main(args):
    """Cross-validate a Flair text classifier with K-fold splits.

    Selects word embeddings via args.method, reads one text-classification
    file (args.data_file), then runs args.num_folds-fold cross-validation and
    prints the mean test score across folds.
    """
    # NOTE(review): the `args` parameter is immediately replaced by a fresh
    # parse of sys.argv -- confirm this is intended.
    args = parser.parse_args()

    # 0. Make a list of word embeddings
    if args.method == 'glove':
        word_embeddings = [WordEmbeddings('glove')]
    elif args.method == 'flair':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ]
    elif args.method == 'cui_svd':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings('resources/embeddings/cui2vec100.npy'))
        ]
    elif args.method == 'cui_proj':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_projected_100-100.gensim'))
        ]
    elif args.method == 'mimic':
        word_embeddings = [
            WordEmbeddings(
                'resources/embeddings/mimic3_mixed_embeddings100.gensim')
        ]
    elif args.method == 'cui2vec':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_combined_glove_100dim.gensim'
                ))
        ]
    elif args.method == 'mimic_lm':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('resources/taggers/mimic-forward/best-lm.pt'),
            FlairEmbeddings('resources/taggers/mimic-backward/best-lm.pt')
        ]
    else:
        raise Exception(
            "Received option for method %s that cannot be interpreted." %
            (args.method))

    # multi-label mode is keyed off the data file name
    if 'bg' in args.data_file:
        multi = True
        print(
            "Running in multiple label setting because 'bg' was in the data file name %s"
            % (args.data_file))
    else:
        multi = False

    # 1. get the corpus (dev/test are carved out of train per fold below)
    sents: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        args.data_file)
    corpus = TaggedCorpus(sents, None, None)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. split the training data into folds
    num_folds = args.num_folds
    seed = 719
    # NOTE(review): random_state without shuffle=True has no effect on KFold
    # (newer scikit-learn rejects this combination) -- confirm intent.
    kf = KFold(n_splits=num_folds, random_state=seed)
    kf.get_n_splits()

    # 4. iterate over folds:
    total_acc = 0
    fold = 1
    for train_index, test_index in kf.split(corpus.train):
        # 4a. initialize the text classifier trainer
        # carve a 90/10 train/dev split out of this fold's train portion
        split_traindev = np.array(corpus.train)[train_index].tolist()
        traindev_size = len(split_traindev)
        train_dev_splitpoint = int(0.9 * traindev_size)
        split_train = split_traindev[:train_dev_splitpoint]
        split_dev = split_traindev[train_dev_splitpoint:]
        split_test = np.array(corpus.train)[test_index].tolist()
        split_corpus = TaggedCorpus(split_train, dev=split_dev, test=split_test)
        print("After split, size of splits: train=%d, dev=%d, test=%d" %
              (len(split_train), len(split_dev), len(split_test)))

        # 4b. do training:
        with tempfile.TemporaryDirectory() as model_dir:
            # init document embedding by passing list of word embeddings
            document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
                word_embeddings,
                hidden_size=128,
                reproject_words=True,
                reproject_words_dimension=64,
            )
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict,
                                        multi_label=multi)
            trainer = ModelTrainer(classifier, split_corpus)
            results = trainer.train(model_dir,
                                    embeddings_in_memory=False,
                                    learning_rate=0.1,
                                    mini_batch_size=128,
                                    anneal_factor=0.5,
                                    patience=5,
                                    max_epochs=100)
        # accumulate this fold's test score
        fold_acc = results['test_score']
        total_acc += fold_acc
        print(f"Finished fold {fold} with accuracy {fold_acc}")
        fold += 1

    # report the mean accuracy across all folds
    total_acc /= num_folds
    print("Finished with total cross-fold accuracy of %f" % (total_acc))
def main(data_folder, benchmark_classifier_folder, new_data_folder,
         finetuned_classifier_folder):
    """Two-stage fine-tuning of a Flair text classifier.

    Stage 1 trains on a shuffled benchmark CSV split 80/10/10; stage 2
    continues training the stage-1 best model on hand-annotated data found in
    ``new_data_folder``.
    """
    from flair.embeddings import FlairEmbeddings, DocumentLSTMEmbeddings, BertEmbeddings, DocumentRNNEmbeddings, TransformerDocumentEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.datasets import CSVClassificationCorpus
    from flair.data import Corpus
    import pandas as pd
    import os

    ### First Stage (Train on benchmark dataset)
    benchmark = pd.read_csv(data_folder + "combined_benchmark.csv")
    benchmark = benchmark[['label', 'text']]

    #### Create train, dev and test set
    #benchmark = benchmark.sample(frac=1)
    # if not set random state, everytime has different training result
    benchmark = benchmark.sample(frac=1, random_state=42)
    # 80/10/10 split, written as tab-separated headerless CSVs
    benchmark.iloc[0:int(len(benchmark) * 0.8)].to_csv(data_folder + 'train.csv',
                                                       sep='\t',
                                                       index=False,
                                                       header=False)
    benchmark.iloc[int(len(benchmark) * 0.8):int(len(benchmark) * 0.9)].to_csv(
        data_folder + 'test.csv', sep='\t', index=False, header=False)
    benchmark.iloc[int(len(benchmark) * 0.9):].to_csv(data_folder + 'dev.csv',
                                                      sep='\t',
                                                      index=False,
                                                      header=False)

    #### Build corpus
    column_name_map = {1: "text", 0: "label_topic"}
    corpus: Corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        skip_header=False,  # no header in kaggle data
        delimiter='\t',
        #train_file='train.csv',  ## passing in file names manually when it can't auto detect
        #dev_file='dev.csv',
        #test_file='test.csv'
    )

    #### Create word embeddings
    word_embeddings = [
        BertEmbeddings(),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]
    ## caveat: issue of deprecation. BertEmbeddings and DocumentLSTMEmbeddings
    ## existed in version 0.4.5, and became legacy embeddings (still available)
    ## in version 0.5

    #### First Stage Fine-tuning
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=512,
                                                 reproject_words=True,
                                                 reproject_words_dimension=256)
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    #trainer.train(benchmark_classifier_folder, max_epochs=1) #offline test use epoch=1
    trainer.train(benchmark_classifier_folder, max_epochs=10)
    ### every finetuning results in different scores
    ### accuracy at phase1 finetuning does not matter too much, phase2 scores more important in biasing the models towards learning indicator-specific keywords

    ### Second Stage (train on hand annotated datasets)
    #### Build corpus
    ### this column_name_map must be updated to reflect which column stores the X(text features) and y(golden labels) for training use
    ### in the csv file contained in new_data_folder, 2nd column is 'title_desc',
    ### 4th column is 'title_desc_sent_1' (where we stored agreed sentiment annotations)
    new_column_name_map = {1: "text", 3: "label_topic"}
    print(new_column_name_map)
    corpus: Corpus = CSVClassificationCorpus(
        new_data_folder,
        new_column_name_map,
        skip_header=True,
        delimiter=','  # comma separated rows
    )

    #### Second Stage fine-tuning: continue from the stage-1 best model
    benchmark_classifier = TextClassifier.load(
        os.path.join(benchmark_classifier_folder, 'best-model.pt'))
    trainer = ModelTrainer(benchmark_classifier, corpus)
    #trainer.train(finetuned_classifier_folder, max_epochs=1) #offline test use
    trainer.train(finetuned_classifier_folder, max_epochs=10)
import datetime
import spacy
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, DocumentLSTMEmbeddings, Sentence

# German spaCy pipeline plus German word- and character-level embeddings
nlp = spacy.load('de')

glove_embedding = WordEmbeddings('de')
flair_embedding_forward = FlairEmbeddings('german-forward')
flair_embedding_backward = FlairEmbeddings('german-backward')

document_pooling_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])
document_lstm_embeddings = DocumentLSTMEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])


def is_blacklisted(word):
    """True for boilerplate/stop tokens that should be ignored downstream."""
    return word in {
        'polizei', 'polizist', 'beamter', 'nr.', 'berlin', 'uhr',
        'polizeimeldung', 'nicht', 'jahr', 'jährige', 'jährig', 'jähriger',
        'polizeiliche', 'polizeilich', '2015', '2016', '2014', '2017', '2018',
        'polizeibeamter', '-', 'u.a.', 'z.b.', 'der', 'die', 'das', 'dem',
        'den', 'diese', 'dieser', 'diesen', 'diesem', 'um', 'für', 'eine',
        'ein', 'einer', 'einen', 'einem', 'anderer', 'andere', 'anderen',
        'anders'
    }


def is_empty(word):
    """True if the word contains nothing but whitespace."""
    return word.strip() == ''
# Script preamble: imports plus a BERT-backed LSTM document embedding used by
# the experiment runners imported below.
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, BertEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# multilingual BERT token embeddings feed the LSTM document embedding
word_embeddings = [BertEmbeddings('bert-base-multilingual-cased')]

from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentLSTMEmbeddings

# hidden_size of 768 -- presumably chosen to match BERT's base hidden width;
# confirm against the model config
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=768, )

import random
import numpy as np
from numpy import genfromtxt

# project-local experiment entry points
from run_fasttext import fasttext
from logisticRegression import lr
from run_bert import bert
from keras_SLP import SLP
from cosineSim import cosineSimilarity

#Logging configuration for logging data
from datetime import datetime
import logging

# timestamped log-file name component and a shared log-line format
now = datetime.now().strftime('%Y-%m-%d-%H-%M')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')