Example #1
def test_train_load_use_classifier_flair(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    # `flair_embeddings` is defined elsewhere in the original test module; a
    # forward Flair LM embedding is a plausible stand-in (assumption):
    flair_embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast")

    # positional args: hidden_size=128, rnn_layers=1, reproject_words=False,
    # reproject_words_dimension=64, bidirectional=False, dropout=0
    flair_document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [flair_embeddings], 128, 1, False, 64, False, False)

    model: TextClassifier = TextClassifier(flair_document_embeddings,
                                           label_dict,
                                           multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

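    # in the older Flair API used here, predict() returns the annotated
    # sentence(s), each carrying a label with a confidence score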
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    del trainer, model, corpus, flair_document_embeddings
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
Example #2
def train_model(data_dir, max_epochs):
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    label_dict = corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    word_embeddings = [
        WordEmbeddings('glove'),
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward')
    ]
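    # combine the word embeddings into a single document vector with an RNN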
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256)
    st.write('Done')

    st.write('Preparing')
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train('model-saves',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=8,
                  max_epochs=max_epochs,
                  checkpoint=True)
    st.write('Model Training Finished!')
Example #3
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

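    # resume training from the serialized checkpoint with a fresh trainer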
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt",
                                           corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #4
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path /
                                                 "multi_class")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model: TextClassifier = TextClassifier(document_embeddings,
                                           label_dict,
                                           multi_label=True)

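    # mini_batch_size=1 and many epochs let the model fit the tiny corpus exactly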
    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):

        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #5
def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 32, 1, False, 64, False, False)

    model: TextClassifier = TextClassifier(document_embeddings, label_dict,
                                           False)

    trainer = ModelTrainer(model, corpus)
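    # the sampler over-samples under-represented labels so batches are roughly balanced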
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #6
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path /
                                                 "multi_class",
                                                 label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # `document_embeddings` is a pytest fixture in the original test suite;
    # a simple pooled embedding is a plausible stand-in (assumption):
    document_embeddings = DocumentPoolEmbeddings([WordEmbeddings("turian")])

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
        train_with_test=True,
        train_with_dev=True,
    )

    sentence = Sentence("apple tv")

    model.predict(sentence)

    for label in sentence.labels:
        print(label)
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    sentence = Sentence("apple tv")

    model.predict(sentence)

    assert "apple" in sentence.get_label_names()
    assert "tv" in sentence.get_label_names()

    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    del trainer, model, corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
Example #7
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):

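    # note: this snippet uses the pre-0.5 Flair API (NLPTaskDataFetcher,
    # DocumentLSTMEmbeddings, TextClassifier.load_from_file)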
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(embeddings=[glove_embedding],
                                                 hidden_size=32,
                                                 reproject_words=False,
                                                 bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100,
                  test_mode=True,
                  checkpoint=False)

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):

        assert ('apple' in sentence.get_label_names())
        assert ('tv' in sentence.get_label_names())

        for l in s.labels:
            print(l)
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file(results_base_path /
                                                 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #8
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We just split the training sample into half and instruct Flair to train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)

    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(test_sents, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
Example #9
def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    # `turian_embeddings` is a pytest fixture in the original test suite (assumption):
    turian_embeddings = WordEmbeddings("turian")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([turian_embeddings],
                                                   fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
    )

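    # train long enough to fit the trivial corpus exactly (asserted below)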
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  mini_batch_size=2,
                  max_epochs=50,
                  shuffle=True)

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if the loaded model predicts the correct labels
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")

    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values
    assert "pizza" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example #10
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    ids = df['id'].tolist()

    nSamples = len(ids)
    idx80 = int(nSamples * 0.7)  # note: a 70/20/10 split, despite the variable names
    idx90 = int(nSamples * 0.9)

    train_ids = ids[:idx80]
    test_ids = ids[idx80:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        # join paths explicitly; bare string concatenation would drop the separator
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

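        # write the three splits as tab-separated files (the given columns are
        # assumed to hold label and text, as Flair's classification loader expects)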
        df[df['id'].isin(train_ids)].to_csv(trainCsv,
                                            columns=columns,
                                            sep='\t',
                                            index=False,
                                            header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv,
                                           columns=columns,
                                           sep='\t',
                                           index=False,
                                           header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv,
                                          columns=columns,
                                          sep='\t',
                                          index=False,
                                          header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)

        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)
Example #11
def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Performs the training of the zero shot learning model

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English
    print("Zero shot")
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models",
                     "tars-base.pt"))

    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

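    # 2. register the task's labels with TARS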
    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

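    # 3. initialize the text classifier trainer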
    trainer = ModelTrainer(tars, corpus)

    # 4. train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )

    print("DONE TRAINING")
    tars = TARSClassifier.load('../../model/zero_shot/final-model.pt')

    val_tweets["pred"] = val_tweets.apply(predict_few_shot,
                                          args=(tars, ),
                                          axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1
                                                  if x == "positive" else -1)

    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)

    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
Example #12
def main(args):
    # note: the `args` parameter is immediately replaced by a fresh parse
    args = parser.parse_args()

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt')

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),

        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(model_out,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=100)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))
Example #13
    def train(self):
        corpus = NLPTaskDataFetcher.load_classification_corpus(Path(self.corpus_path),
                                                               test_file="test_clean_text.txt",
                                                               dev_file="dev_clean_text.txt",
                                                               train_file="train_clean_text.txt")
        embeddings = [WordEmbeddings(self.word_emb_path), FlairEmbeddings('polish-forward'),
                      FlairEmbeddings('polish-backward')]
        document_embeddings = DocumentRNNEmbeddings(embeddings, hidden_size=self.hidden_size, bidirectional=True)
        classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(),
                                    multi_label=False)
        trainer = ModelTrainer(classifier, corpus)
        trainer.train(self.model_path, evaluation_metric=EvaluationMetric.MACRO_F1_SCORE, max_epochs=self.epochs)
Example #14
    def fit(self, X, y):
        """ Build feature vectors and train FLAIR model.

            Parameters
            ----------
            X : list(list(str))
                list of sentences; each sentence is tokenized into a
                list of words.
            y : list(list(str))
                list of list of BIO tags.

            Returns
            -------
            self
        """
        log.info("Creating FLAIR corpus...")
        Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
        sents_train = self._convert_to_flair(Xtrain, ytrain)
        sents_val = self._convert_to_flair(Xval, yval)
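        # note: the test split is left empty; evaluation presumably happens outside fit()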
        corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")

        tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")

        if self.embeddings is None:
            embedding_types = [
                WordEmbeddings("glove"),
                CharacterEmbeddings()    
            ]
            self.embeddings = StackedEmbeddings(embeddings=embedding_types)

        log.info("Building FLAIR NER...")
        self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
            embeddings=self.embeddings,
            tag_dictionary=tag_dict,
            tag_type="ner",
            use_crf=self.use_crf,
            use_rnn=self.use_rnn,
            rnn_layers=self.num_rnn_layers,
            dropout=self.dropout,
            word_dropout=self.word_dropout,
            locked_dropout=self.locked_dropout)

        log.info("Training FLAIR NER...")
        opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
        trainer = ModelTrainer(self.model_, corpus_train, opt)
        trainer.train(base_path=self.basedir,
            learning_rate=self.learning_rate,
            mini_batch_size=self.batch_size,
            max_epochs=self.max_iter)

        return self
Example #15
    def train(self):
        from flair.data import Corpus
        from flair.datasets import SentenceDataset
        from flair.data import Sentence

        self.classes = utils.read_class_titles(settings.CAT_DEPTH)
        self.classes['NOCAT'] = 'NOCAT'

        train = SentenceDataset([
            Sentence(row['titlen']).add_label('law_topic',
                                              self.classes[row['cat1']])
            for i, row in self.df_train.iterrows()
        ])

        # make a corpus with train and test split
        self.corpus = Corpus(train=train, dev=train)

        # 1. load base TARS
        tars = self._load_pretained_model()

        # 2. make the model aware of the desired set of labels from the new corpus
        tars.add_and_switch_to_new_task(
            "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

        # 3. initialize the text classifier trainer with your corpus
        from flair.trainers import ModelTrainer
        trainer = ModelTrainer(tars, self.corpus)

        # 4. train model
        path = settings.WORKING_DIR
        if 1:
            trainer.train(
                base_path=path,
                # path to store the model artifacts
                learning_rate=5e-2,  # 5ep, 0.2 bad; 5ep with 0.1 looks ok.
                mini_batch_size=settings.MINIBATCH,
                # mini_batch_chunk_size=1, mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
                max_epochs=settings.EPOCHS,  # terminate after 10 epochs
                train_with_dev=False,
                save_final_model=False,
                param_selection_mode=True,  # True to avoid model saves
                shuffle=False,  # Already done
            )

        # from flair.models.text_classification_model import TARSClassifier
        # self.model = TARSClassifier.load(
        #     os.path.join(path, 'best-model.pt')
        # )

        self.model = tars
Example #16
def classify(data, labels, test, train, validation):
    train_data = [k for k in data.keys() if k in train]
    train_labels = [labels[k] for k in train_data]
    train_data = [data[k] for k in train_data]

    test_data = [k for k in data.keys() if k in test]
    test_labels = [labels[k] for k in test_data]
    test_data = [data[k] for k in test_data]

    validation_data = [k for k in data.keys() if k in validation]
    validation_labels = [labels[k] for k in validation_data]
    validation_data = [data[k] for k in validation_data]

    save_training_files(train_data, train_labels, test_data, test_labels,
                        validation_data, validation_labels)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='test.txt',
        dev_file='dev.txt',
        train_file='train.txt')
    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    doc_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                           hidden_size=512,
                                           reproject_words=True,
                                           reproject_words_dimension=256)
    classifier = TextClassifier(
        doc_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./', max_epochs=25)
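    # reload the best model checkpoint saved during training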
    classifier = TextClassifier.load_from_file('./best-model.pt')

    validation_data = [Sentence(x) for x in validation_data]
    for x in validation_data:
        classifier.predict(x)
    predicted = [int(x.labels[0].value) for x in validation_data]
    remove_training_files()
    precision, recall, f1, _ = precision_recall_fscore_support(
        validation_labels, predicted, average='binary')
    return {
        # note: the 'accuracy' key actually holds the precision score
        'accuracy': round(precision, 3),
        'recall': round(recall, 3),
        'f1': round(f1, 3)
    }
Example #17
File: flair.py Project: jbuccina/keter
    def fit(self, corpus: Corpus, model_path: str):
        self.model = TARSClassifier(
            task_name="ChemicalUnderstanding",
            label_dictionary=corpus.make_label_dictionary(),
        )

        trainer = ModelTrainer(self.model, corpus)

        trainer.train(
            base_path=model_path,
            learning_rate=0.02,
            mini_batch_size=16,
            mini_batch_chunk_size=4,
            max_epochs=10,
        )
Example #18
    def train_model(self,
                    corpus,
                    classifier,
                    step_num,
                    optimizer_state=None,
                    epoch=1,
                    lr=1e-3):
        trainer = ModelTrainer(classifier,
                               corpus,
                               optimizer=AdamW,
                               optimizer_state=optimizer_state)
        result = trainer.train(
            f'{self.experiment_name}/{step_num}/',
            learning_rate=lr,
            min_learning_rate=1e-8,
            mini_batch_size=32,
            anneal_factor=0.5,
            patience=5,
            max_epochs=epoch,
            embeddings_storage_mode=self.embeddings_storage_mode,
            weight_decay=1e-4,
        )
        # delete checkpoints from three steps back to keep disk usage bounded
        os.system(f'rm {self.experiment_name}/{step_num-3}/best-model.pt')
        os.system(f'rm {self.experiment_name}/{step_num-3}/final-model.pt')
        return classifier, result['optimizer_state_dict']
Example #19
File: flair.py Project: jbuccina/keter
    def train(self):
        tox_corpus = FlairTox21().to_corpus()

        self.model = TARSClassifier(
            task_name="Toxicity",
            label_dictionary=tox_corpus.make_label_dictionary(),
            document_embeddings="distilbert-base-uncased",
        )

        trainer = ModelTrainer(self.model, tox_corpus)

        trainer.train(
            base_path=get_path("model") / self.filename,
            learning_rate=0.02,
            mini_batch_size=1,
            max_epochs=10,
        )
Example #20
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #21
    def train(self, train_data):
        flair_logger = logging.getLogger("flair")
        handler = RequestsHandler()
        flair_logger.addHandler(handler)

        filter = ListenFilter("filter", self.args)
        flair_logger.addFilter(filter)

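        # route Flair's training log through the custom handler and filter set up above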
        trainer = ModelTrainer(self.model, self.corpus)

        trainer.train(self.model_base_path(),
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=self.args.iter if self.args.max_iter else 100,
                      train_with_dev=True,
                      monitor_test=True,
                      embeddings_storage_mode="gpu")
Example #22
def train_ner(device_category):
    """
    Training the sequence labeling model
    """
    columns = {0: 'text', 1: 'ner'}

    training_file = os.path.join(
        root_path, 'part_extraction/data/{}.conll'.format(device_category))
    data_folder = os.path.join(root_path, 'part_extraction/data')

    corpus = ColumnCorpus(data_folder, columns, train_file=training_file)

    print(len(corpus.train))
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    embedding_types = [
        WordEmbeddings('glove'),

        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]

    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    trainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train(ner_models,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    trainer.model.save('{}/{}.pt'.format(ner_models, device_category))
Example #23
    def handle(self, *args, **options):
        file = options.get('file') or 'annotated_sentences'
        model_folder = options.get('model_folder') or 'model-var'
        columns = {0: 'text', 1: 'var'}
        data_folder = 'data/txt'

        corpus = ColumnCorpus(data_folder, columns,
                              train_file=f'{file}.txt')
        
        tag_type = 'var'

        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        embedding_types = [
            WordEmbeddings('glove'),

            # comment in this line to use character embeddings
            # CharacterEmbeddings(),

            # comment in these lines to use flair embeddings
            # FlairEmbeddings('news-forward'),
            # FlairEmbeddings('news-backward'),
            TransformerWordEmbeddings('bert-base-uncased'),
        ]

        embeddings = StackedEmbeddings(embeddings=embedding_types)

        tagger = SequenceTagger(hidden_size=256,
                                embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type=tag_type,
                                use_crf=True)

        trainer = ModelTrainer(tagger, corpus)

        trainer.train(f'data/models/taggers/{model_folder}',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150)

        self.stdout.write(self.style.SUCCESS('Successfully trained model on dataset file.'))
Example #24
    def start(self) -> None:
        self.stacked_embeddings = self._get_stacked_embeddings()
        description = self.experiment.description.replace(" ", "_")
        batch_size = self.experiment.batch_size
        max_epochs = self.experiment.max_epochs
        embeddings_storage_mode = self.experiment.embeddings_storage_mode
        train_with_dev = self.experiment.train_with_dev

        tagger, corpus = self._get_sequence_tagger()

        trainer = ModelTrainer(tagger, corpus)

        trainer.train(
            f"resources/taggers/experiment_{description}_{self.number}",
            learning_rate=0.1,
            mini_batch_size=batch_size,
            max_epochs=max_epochs,
            embeddings_storage_mode=embeddings_storage_mode,
            train_with_dev=train_with_dev,
        )
Example #25
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora([NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    model: SequenceTagger = SequenceTagger(hidden_size=256,
                                           embeddings=embeddings,
                                           tag_dictionary=tag_dictionary,
                                           tag_type='ner',
                                           use_crf=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(results_base_path / 'checkpoint.pt', 'SequenceTagger', corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #26
def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()
    word_embedding = WordEmbeddings('turian')
    document_embeddings = DocumentRNNEmbeddings([word_embedding], 32, 1, False,
                                                64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  sampler=ImbalancedClassificationDatasetSampler)
    sentence = Sentence('Berlin is a really nice city.')
    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)
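    # loading the saved model verifies that training serialized it correctly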
    loaded_model = TextClassifier.load(results_base_path / 'final-model.pt')
    shutil.rmtree(results_base_path)
Example #27
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()
    embeddings = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentRNNEmbeddings([embeddings], 128, 1, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
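    # restore the trainer from the checkpoint and continue training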
    checkpoint = TextClassifier.load_checkpoint(results_base_path / 'checkpoint.pt')
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
    shutil.rmtree(results_base_path)
Example #28
    def fine_tune(self):
        if isinstance(self.document_embedding, TransformerDocumentEmbeddings):
            corpus = TREC_6()
            label_dict = corpus.make_label_dictionary()
            classifier = TextClassifier(self.document_embedding,
                                        label_dictionary=label_dict)
            trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

            # 6. start the training
            trainer.train(
                'resources/taggers/trec',
                learning_rate=3e-5,  # use very small learning rate
                mini_batch_size=16,
                mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
                max_epochs=5,  # terminate after 5 epochs
            )
        else:
            raise UserWarning(
                "No fine tuning for this embedding type implemented")
Example #29
def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):

    flair.device = torch.device(device)

    corpus = ClassificationCorpus(rootdir,
                                  train_file=train,
                                  dev_file=dev,
                                  test_file=test,
                                  in_memory=False)

    label_dict = corpus.make_label_dictionary()

    # init Flair embeddings
    flair_forward_embedding = FlairEmbeddings('multi-forward')
    flair_backward_embedding = FlairEmbeddings('multi-backward')
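    # (note: these two embeddings are instantiated but not used below)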

    optional_embedding = ELMoEmbeddings('original')

    word_embeddings = list(filter(None, [
        optional_embedding,
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    #
    # Note this will kick off model generation that will take a long time (several hours)
    # This will produce final-model.pt and best-model.pt files which represent a stored trained model.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(outputdir, max_epochs=num_epochs)
Example #30
def test_train_load_use_classifier_with_sampler(results_base_path,
                                                tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb",
                                                 label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # `document_embeddings` is a pytest fixture in the original test suite;
    # a simple pooled embedding is a plausible stand-in (assumption):
    document_embeddings = DocumentPoolEmbeddings([WordEmbeddings("turian")])

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")
    model.predict(sentence)

    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    del trainer, model, corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model