def build_train_sequence_tagger(corpus, tag_dictionary, params: Params, TAG_TYPE="ner"):
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings("glove"),
        FlairEmbeddings("news-forward"),
        FlairEmbeddings("news-backward"),
    ])
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=TAG_TYPE,
    )
    from flair.trainers import ModelTrainer
    corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        "flair_checkpoints",
        train_with_dev=False,
        max_epochs=params.max_epochs,
        save_final_model=False,
    )  # original
    return tagger
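# Hedged usage sketch (not from the original sources): one way build_train_sequence_tagger
# might be driven with a column-formatted NER corpus. The data folder, column format, and
# the Params object with a `max_epochs` field are illustrative assumptions only.
from flair.data import Corpus
from flair.datasets import ColumnCorpus

data_corpus: Corpus = ColumnCorpus(
    "data/ner",                              # hypothetical folder holding train/dev/test files
    column_format={0: "text", 1: "ner"},     # token in column 0, NER tag in column 1
)
ner_tag_dictionary = data_corpus.make_tag_dictionary(tag_type="ner")
# tagger = build_train_sequence_tagger(data_corpus, ner_tag_dictionary, params=Params(max_epochs=10))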
def _train(self, corpus: Corpus, params: dict, base_path: Path, max_epochs: int, optimization_value: str):
    label_dict = corpus.make_label_dictionary()

    for sent in corpus.get_all_sentences():
        sent.clear_embeddings()

    model = self._set_up_model(params, label_dict)

    training_parameters = {
        key: params[key] for key, value in params.items() if key in TRAINING_PARAMETERS
    }
    model_trainer_parameters = {
        key: params[key]
        for key, value in params.items()
        if key in MODEL_TRAINER_PARAMETERS and key != 'model'
    }

    trainer: ModelTrainer = ModelTrainer(model, corpus, **model_trainer_parameters)

    path = base_path
    results = trainer.train(
        path,
        max_epochs=max_epochs,
        param_selection_mode=True,
        **training_parameters,
    )

    if optimization_value == "score":
        result = results['test_score']
    else:
        result = results['dev_loss_history'][-1]

    return {'result': result, 'params': params}
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label('label', 'class_1')
    dev_sentence = Sentence("The sun is shining.", use_tokenizer=True).add_label('label', 'class_2')
    test_sentence = Sentence("Berlin is sunny.", use_tokenizer=True)
    test_sentence.add_label('label', 'class_1')
    test_sentence.add_label('label', 'class_2')

    class_to_count_dict = Corpus._count_sentence_labels(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 2 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence("I love Berlin.", labels=["class_1"], use_tokenizer=segtok_tokenizer)
    dev_sentence = Sentence("The sun is shining.", labels=["class_2"], use_tokenizer=segtok_tokenizer)
    test_sentence = Sentence(
        "Berlin is sunny.",
        labels=["class_1", "class_2"],
        use_tokenizer=segtok_tokenizer,
    )

    class_to_count_dict = Corpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence])

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 2 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence])

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
def score_flair_tagger(
        splits,
        data: Union[List[Sentence], Dataset],
):
    from flair.trainers import ModelTrainer, trainer
    logger = trainer.log
    logger.setLevel(logging.WARNING)

    data_splits = {split_name: [data[i] for i in split] for split_name, split in splits.items()}
    train_sentences, dev_sentences, test_sentences = (
        data_splits['train'], data_splits['dev'], data_splits['test'])

    corpus = Corpus(
        train=train_sentences,
        dev=dev_sentences,
        test=test_sentences,
        name='scierc')
    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.RMSprop)
    # print(tagger)
    # pprint([p_name for p_name, p in tagger.named_parameters()])

    save_path = 'flair_sequence_tagging/scierc-ner-%s' % multiprocessing.current_process()
    trainer.train('%s' % save_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=19,
                  patience=3,
                  save_final_model=False
                  )
    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    def flair_tagger_predict_bio(sentences: List[Sentence]):
        train_data = [[(token.text, token.tags[tagger.tag_type].value) for token in datum]
                      for datum in sentences]
        targets = [bilou2bio([tag for token, tag in datum]) for datum in train_data]
        pred_sentences = tagger.predict(sentences)
        pred_data = [bilou2bio([token.tags[tagger.tag_type].value for token in datum])
                     for datum in pred_sentences]
        return pred_data, targets

    return {
        'train': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.train),
        'test': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.test)
    }
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.", use_tokenizer=True)
    dev_sentence = Sentence("I'm a dev sentence.", use_tokenizer=True)
    test_sentence = Sentence('I will be only used for testing.', use_tokenizer=True)

    corpus = Corpus([train_sentence], [dev_sentence], [test_sentence])

    all_sentences = corpus.get_all_sentences()

    assert (3 == len(all_sentences))
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence('sentence 1', labels=['class_1'])
    sentence_2 = Sentence('sentence 2', labels=['class_2'])
    sentence_3 = Sentence('sentence 3', labels=['class_1'])

    corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert (2 == len(label_dict))
    assert ('<unk>' not in label_dict.get_items())
    assert ('class_1' in label_dict.get_items())
    assert ('class_2' in label_dict.get_items())
def test_tagged_corpus_downsample():
    sentence = Sentence('I love Berlin.', labels=[Label('class_1')], use_tokenizer=True)

    corpus = Corpus([
        sentence, sentence, sentence, sentence, sentence,
        sentence, sentence, sentence, sentence, sentence
    ], [], [])

    assert (10 == len(corpus.train))

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert (3 == len(corpus.train))
def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Trains the zero-shot learning model.

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English
    print("Zero shot")
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models", "tars-base.pt"))

    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )
    print("DONE TRAINING")

    tars = TARSClassifier.load('../../model/zero_shot/final-model.pt')

    val_tweets["pred"] = val_tweets.apply(predict_few_shot, args=(tars, ), axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1 if x == "positive" else -1)

    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)
    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
def fit(self, X, y):
    """
    Build feature vectors and train FLAIR model.

    Parameters
    ----------
    X : list(list(str))
        list of sentences. Sentences are tokenized into list of words.
    y : list(list(str))
        list of list of BIO tags.

    Returns
    -------
    self
    """
    log.info("Creating FLAIR corpus...")
    Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
    sents_train = self._convert_to_flair(Xtrain, ytrain)
    sents_val = self._convert_to_flair(Xval, yval)
    corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")
    tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")

    if self.embeddings is None:
        embedding_types = [
            WordEmbeddings("glove"),
            CharacterEmbeddings()
        ]
        self.embeddings = StackedEmbeddings(embeddings=embedding_types)

    log.info("Building FLAIR NER...")
    self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
                                 embeddings=self.embeddings,
                                 tag_dictionary=tag_dict,
                                 tag_type="ner",
                                 use_crf=self.use_crf,
                                 use_rnn=self.use_rnn,
                                 rnn_layers=self.num_rnn_layers,
                                 dropout=self.dropout,
                                 word_dropout=self.word_dropout,
                                 locked_dropout=self.locked_dropout)

    log.info("Training FLAIR NER...")
    opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
    trainer = ModelTrainer(self.model_, corpus_train, opt)
    trainer.train(base_path=self.basedir,
                  learning_rate=self.learning_rate,
                  mini_batch_size=self.batch_size,
                  max_epochs=self.max_iter)
    return self
def train(self):
    from flair.data import Corpus
    from flair.datasets import SentenceDataset
    from flair.data import Sentence

    self.classes = utils.read_class_titles(settings.CAT_DEPTH)
    self.classes['NOCAT'] = 'NOCAT'

    train = SentenceDataset([
        Sentence(row['titlen']).add_label('law_topic', self.classes[row['cat1']])
        for i, row in self.df_train.iterrows()
    ])

    # make a corpus with train and test split
    self.corpus = Corpus(train=train, dev=train)

    # 1. load base TARS
    tars = self._load_pretained_model()

    # 2. make the model aware of the desired set of labels from the new corpus
    tars.add_and_switch_to_new_task(
        "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

    # 3. initialize the text classifier trainer with your corpus
    from flair.trainers import ModelTrainer
    trainer = ModelTrainer(tars, self.corpus)

    # 4. train model
    path = settings.WORKING_DIR
    if 1:
        trainer.train(
            base_path=path,  # path to store the model artifacts
            learning_rate=5e-2,  # 5ep, 0.2 bad; 5ep with 0.1 looks ok.
            mini_batch_size=settings.MINIBATCH,
            # mini_batch_chunk_size=1,
            mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
            max_epochs=settings.EPOCHS,  # terminate after 10 epochs
            train_with_dev=False,
            save_final_model=False,
            param_selection_mode=True,  # True to avoid model saves
            shuffle=False,  # Already done
        )

    # from flair.models.text_classification_model import TARSClassifier
    # self.model = TARSClassifier.load(
    #     os.path.join(path, 'best-model.pt')
    # )
    self.model = tars
def train_seqtagger(train_data: Dataset,
                    dev_data: Dataset,
                    test_data: Dataset,
                    ):
    corpus = Corpus(
        train=train_data,
        dev=dev_data,
        test=test_data,
        name='scierc')
    pprint(Counter([tok.tags[TAG_TYPE].value for sent in corpus.train for tok in sent]))

    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove')]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)

    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.Adam)

    save_path = 'sequence_tagging/resources/taggers/scierc-ner'
    trainer.train('%s' % save_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=20)
    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    from sequence_tagging.evaluate_flair_tagger import evaluate_sequence_tagger
    pprint('train-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.train)['f1-macro'])
    pprint('dev-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.dev)['f1-macro'])
    pprint('test-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.test)['f1-macro'])
    return tagger
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label('label', 'class_1')

    corpus: Corpus = Corpus(
        [
            sentence, sentence, sentence, sentence, sentence,
            sentence, sentence, sentence, sentence, sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, downsample_dev=False, downsample_test=False)

    assert 3 == len(corpus.train)
def build_and_train_conll03en_flair_sequence_tagger(corpus, tag_type, tag_dictionary):
    '''
    do not change! same configuration as described in file:
    "flair/resources/docs/EXPERIMENTS.md"
    section: "CoNLL-03 Named Entity Recognition (English)"
    '''
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            WordEmbeddings("glove"),
            PooledFlairEmbeddings("news-forward", pooling="min"),
            PooledFlairEmbeddings("news-backward", pooling="min"),
        ]
    )
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )
    from flair.trainers import ModelTrainer
    corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # trainer.train("resources/taggers/example-ner", train_with_dev=True, max_epochs=150)  # original
    trainer.train("flair_checkpoints", train_with_dev=False, max_epochs=40, save_final_model=False)
    return tagger
def test_tagged_corpus_get_tag_statistic():
    train_sentence = Sentence("Zalando Research is located in Berlin .")
    train_sentence[0].add_tag("ner", "B-ORG")
    train_sentence[1].add_tag("ner", "E-ORG")
    train_sentence[5].add_tag("ner", "S-LOC")

    dev_sentence = Sentence(
        "Facebook, Inc. is a company, and Google is one as well.",
        use_tokenizer=segtok_tokenizer,
    )
    dev_sentence[0].add_tag("ner", "B-ORG")
    dev_sentence[1].add_tag("ner", "I-ORG")
    dev_sentence[2].add_tag("ner", "E-ORG")
    dev_sentence[8].add_tag("ner", "S-ORG")

    test_sentence = Sentence("Nothing to do with companies.")

    tag_to_count_dict = Corpus._get_tag_to_count(
        [train_sentence, dev_sentence, test_sentence], "ner")

    assert 1 == tag_to_count_dict["S-ORG"]
    assert 1 == tag_to_count_dict["S-LOC"]
    assert 2 == tag_to_count_dict["B-ORG"]
    assert 2 == tag_to_count_dict["E-ORG"]
    assert 1 == tag_to_count_dict["I-ORG"]
def class_distribution(self, multiclass: bool = False, nr_classes: int = 10, savefig_file=None, **kwargs):
    class_count = Corpus._get_class_to_count(self.sentences)
    class_count = pd.DataFrame.from_dict(
        class_count, orient='index', columns=['count']).sort_values('count', ascending=False)
    html_table = class_count.to_html()

    # plot distribution
    class_count_top = class_count[:nr_classes].copy()
    if not multiclass:
        if nr_classes < len(class_count):
            class_count_top.loc['others'] = class_count[nr_classes:].sum()
        # pie plot class_count
        class_count_top.plot.pie(y='count', **kwargs)
        plt.legend(labels=class_count_top.index, bbox_to_anchor=(1, 0, 0.1, 1), loc='center right')
    else:
        class_count_top.plot.bar(y='count', **kwargs)
        plt.gca().yaxis.grid(True, linestyle='--')

    plt.tight_layout()
    if savefig_file:
        plt.savefig(self.path / savefig_file, dpi=600)
    plt.show()
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence('used in training. training is cool.', use_tokenizer=True)

    corpus = Corpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert (7 == len(vocab))

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer)

    corpus: Corpus = Corpus(
        [
            sentence, sentence, sentence, sentence, sentence,
            sentence, sentence, sentence, sentence, sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert 3 == len(corpus.train)
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label("label", "class_1")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([
            sentence, sentence, sentence, sentence, sentence,
            sentence, sentence, sentence, sentence, sentence,
        ]),
        sample_missing_splits=False,
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, downsample_dev=False, downsample_test=False)

    assert 3 == len(corpus.train)
def make_relations_tag_dictionary(corpus: Corpus, tag_type='dependency', special_tags=[]) -> Dictionary:
    tag_dictionary: Dictionary = Dictionary(add_unk=False)
    # for tag in special_tags:
    #     tag_dictionary.add_item(tag)
    for sentence in corpus.get_all_sentences():
        for token in sentence.tokens:
            tag_dictionary.add_item(token.get_tag(tag_type).value)
    return tag_dictionary
def spelling_aug(corpus):
    aug = naw.SpellingAug()
    augmented_sentences = []
    # go through all train sentences and collect the augmented variants
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence, n=3)
        augmented_sentences.extend(augmented_texts)
    corpus = Corpus(train=SentenceDataset(augmented_sentences), dev=corpus.dev, test=corpus.test)
    return corpus
def obtain_statistics(self, tag_type: str = 'ner', save_as_json: bool = True):
    stats_splits = self.corpus.obtain_statistics(tag_type)
    stats_complete = json.dumps(
        Corpus._obtain_statistics_for(self.sentences, 'complete', tag_type), indent=4)
    if save_as_json:
        (self.path / 'stats_splits.json').write_text(stats_splits)
        (self.path / 'stats_complete.json').write_text(stats_complete)
    return (stats_splits, stats_complete)
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence('I love Berlin.', labels=['class_1'], use_tokenizer=True)
    dev_sentence = Sentence('The sun is shining.', labels=['class_2'], use_tokenizer=True)
    test_sentence = Sentence('Berlin is sunny.', labels=['class_1', 'class_2'], use_tokenizer=True)

    class_to_count_dict = Corpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence])

    assert ('class_1' in class_to_count_dict)
    assert ('class_2' in class_to_count_dict)
    assert (2 == class_to_count_dict['class_1'])
    assert (2 == class_to_count_dict['class_2'])

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence])

    assert (3 == len(tokens_in_sentences))
    assert (4 == tokens_in_sentences[0])
    assert (5 == tokens_in_sentences[1])
    assert (4 == tokens_in_sentences[2])
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence("sentence 1", labels=["class_1"])
    sentence_2 = Sentence("sentence 2", labels=["class_2"])
    sentence_3 = Sentence("sentence 3", labels=["class_1"])

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert 2 == len(label_dict)
    assert "<unk>" not in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
def train_dev_split(sentences, dev_ratio=0.25):
    dev_size = len(sentences) * dev_ratio
    train = []
    dev = []
    for count, idx in enumerate(np.random.permutation(len(sentences))):
        if count < dev_size:
            dev.append(sentences[idx])
        else:
            train.append(sentences[idx])
    return Corpus(train=train, dev=dev, test=[])
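# Minimal usage sketch (illustrative only, not from the original sources): splitting a small
# hypothetical list of flair Sentences with train_dev_split. Seeding numpy is an added
# assumption to make the random permutation reproducible.
import numpy as np
from flair.data import Sentence

np.random.seed(42)
example_sentences = [
    Sentence("I love Berlin ."),
    Sentence("The sun is shining ."),
    Sentence("Berlin is sunny ."),
    Sentence("Nothing to do with companies ."),
]
split_corpus = train_dev_split(example_sentences, dev_ratio=0.25)
print(len(split_corpus.train), len(split_corpus.dev))  # 3 sentences in train, 1 in dev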
def create_corpus(self, train_path, val_path, test_path, chunk_len):
    """
    *** This method is only needed when training your own models.
    It is not accessible from rwtagger_script and not documented in detail.
    Use at your own risk. ;-) ***
    :param train_path: path to the training data
    :param val_path: path to the validation data
    :param test_path: path to the test data
    :param chunk_len: maximum chunk length passed to create_sentlist_from_file_batchmax
    :return: flair Corpus built from the three splits
    """
    train_list = self.create_sentlist_from_file_batchmax(train_path, maxlen=chunk_len)
    val_list = self.create_sentlist_from_file_batchmax(val_path, maxlen=chunk_len)
    test_list = self.create_sentlist_from_file_batchmax(test_path, maxlen=chunk_len)
    corpus: Corpus = Corpus(train_list, val_list, test_list)
    return corpus
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.")
    dev_sentence = Sentence("I'm a dev sentence.")
    test_sentence = Sentence("I will be only used for testing.")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([train_sentence]),
        FlairDatapointDataset([dev_sentence]),
        FlairDatapointDataset([test_sentence]),
    )

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1").add_label('label', 'class_1')
    sentence_2 = Sentence("sentence 2").add_label('label', 'class_2')
    sentence_3 = Sentence("sentence 3").add_label('label', 'class_1')

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary('label')

    assert 2 == len(label_dict)
    assert "<unk>" not in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
def fit(self, corpus: Corpus, model_path: str):
    self.model = TARSClassifier(
        task_name="ChemicalUnderstanding",
        label_dictionary=corpus.make_label_dictionary(),
    )
    trainer = ModelTrainer(self.model, corpus)
    trainer.train(
        base_path=model_path,
        learning_rate=0.02,
        mini_batch_size=16,
        mini_batch_chunk_size=4,
        max_epochs=10,
    )
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1").add_label("label", "class_1")
    sentence_2 = Sentence("sentence 2").add_label("label", "class_2")
    sentence_3 = Sentence("sentence 3").add_label("label", "class_1")

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary("label")

    assert 3 == len(label_dict)
    assert "<unk>" in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()