Example #1
def test_document_bidirectional_lstm_embeddings():
    sentence, glove, charlm = init_document_embeddings()
    embeddings = DocumentLSTMEmbeddings([glove, charlm],
                                        hidden_size=128,
                                        bidirectional=True)
    embeddings.embed(sentence)
    assert (len(sentence.get_embedding()) == 512)
    assert (len(sentence.get_embedding()) == embeddings.embedding_length)
    sentence.clear_embeddings()
    assert (len(sentence.get_embedding()) == 0)
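These tests call an `init_document_embeddings()` helper that is not shown on this page; below is a minimal sketch of what it plausibly looks like, reconstructed from how its return values are used (the sentence text and embedding names are assumptions):

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, CharLMEmbeddings

def init_document_embeddings():
    # one sentence to embed, plus a word-level and a character-LM embedding
    sentence = Sentence('I love Berlin. Berlin is a great place to live.')
    glove = WordEmbeddings('en-glove')
    charlm = CharLMEmbeddings('news-forward-fast')
    return sentence, glove, charlm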
Example #2
def run_splits(word_embeddings, embeddings_name):
    for i in range(1, 6):
        print('##########')
        print('Split', str(i))
        print('##########')

        data_folder = '<path_to_splits>/split_' + str(i) + '/'
        corpus = ClassificationCorpus(data_folder,
                                      test_file='test.csv',
                                      dev_file='dev.csv',
                                      train_file='train.csv')

        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(data_folder + embeddings_name, max_epochs=150)
Example #3

def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):

    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #4
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast',
                                                  use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #5
def test_train_charlm_load_use_classifier():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
Example #6
    def train():
        # load training data in FastText format
        corpus = NLPTaskDataFetcher.load_classification_corpus(
            Path('./'),
            test_file='./data/test.txt',
            train_file='./data/train.txt')

        # Combine different embeddings:
        # GloVe word embeddings + Flair contextual string embeddings
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        # use LSTM based method for combining the different embeddings
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train('./models', max_epochs=10)
Example #7
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1,
                                                 False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=2,
                  test_mode=True)
    sentence = Sentence('Berlin is a really nice city.')
    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)
    loaded_model = TextClassifier.load_from_file(
        results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    shutil.rmtree(results_base_path)
Example #8
def init(tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1,
                                                 False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return (corpus, label_dict, model)
Example #9
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):

    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(embeddings=[glove_embedding],
                                                 hidden_size=32,
                                                 reproject_words=False,
                                                 bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100,
                  test_mode=True,
                  checkpoint=False)

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):

        assert ('apple' in sentence.get_label_names())
        assert ('tv' in sentence.get_label_names())

        for l in s.labels:
            print(l)
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file(results_base_path /
                                                 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #10
def init() -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
Example #11
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    ids = df['id'].tolist()

    nSamples = len(ids)
    idx80 = int(nSamples * 0.8)  # 80/10/10 train/test/dev split
    idx90 = int(nSamples * 0.9)

    train_ids = ids[:idx80]
    test_ids = ids[idx80:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        trainCsv = os.path.join(temp_dir, trainNameCsv)  # temp_dir has no trailing separator
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        df[df['id'].isin(train_ids)].to_csv(trainCsv,
                                            columns=columns,
                                            sep='\t',
                                            index=False,
                                            header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv,
                                           columns=columns,
                                           sep='\t',
                                           index=False,
                                           header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv,
                                          columns=columns,
                                          sep='\t',
                                          index=False,
                                          header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)

        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)
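A hypothetical call to `trainFlairClassifier`, assuming a DataFrame with an `id` column plus label and text columns (all file names here are illustrative, and the label column comes first, as flair's classification format expects):

import pandas as pd

df = pd.read_csv('data.csv')  # assumed to contain 'id', 'label' and 'text' columns
trainFlairClassifier(df,
                     columns=['label', 'text'],
                     trainNameCsv='train.csv',
                     testNameCsv='test.csv',
                     devNameCsv='dev.csv',
                     classifierFileName='classifier.pt')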
Example #12
def main(args):
    args = parser.parse_args()

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt')

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),

        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(model_out,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=100)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))
Example #13
def test_document_bidirectional_lstm_embeddings_using_first_representation():
    sentence, glove, charlm = init_document_embeddings()

    embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove, charlm], hidden_size=128, bidirectional=True)

    embeddings.embed(sentence)

    assert (len(sentence.get_embedding()) != 0)
    assert (sentence.get_embedding().shape[1] == embeddings.embedding_length)

    sentence.clear_embeddings()

    assert (len(sentence.get_embedding()) == 0)
Example #14
def test_document_lstm_embeddings():
    sentence, glove, charlm = init_document_embeddings()

    embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove, charlm], hidden_states=128, bidirectional=False)

    embeddings.embed(sentence)

    assert (len(sentence.get_embedding()) != 0)
    assert (sentence.get_embedding().shape[1] == embeddings.embedding_length)

    sentence.clear_embeddings()

    assert (len(sentence.get_embedding()) == 0)
Example #15
    def _set_up_model(self, params):
        embedding_params = {
            key: params[key]
            for key in params if key in DOCUMENT_EMBEDDING_PARAMETERS
        }
        if self.document_embedding_type == 'lstm':
            document_embedding = DocumentLSTMEmbeddings(**embedding_params)
        else:
            document_embedding = DocumentPoolEmbeddings(**embedding_params)
        text_classifier = TextClassifier(
            label_dictionary=self.label_dictionary,
            multi_label=self.multi_label,
            document_embeddings=document_embedding)
        return text_classifier
Example #16
def test_text_classifier_single_label():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    # clean up results directory
    shutil.rmtree('./results')
Example #17
def text_classification():
    corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    corpus.train = [sentence for sentence in corpus.train if len(sentence) > 0]
    corpus.test = [sentence for sentence in corpus.test if len(sentence) > 0]
    corpus.dev = [sentence for sentence in corpus.dev if len(sentence) > 0]
    print("corpus created")
    #print(corpus.get_all_sentences())
    label_dict = corpus.make_label_dictionary()
    print("created label dict")
    #for sent in corpus.get_all_sentences():
    #    print(sent.labels)
    word_embeddings = [
        WordEmbeddings('glove'),
        CharLMEmbeddings('news-forward'),
        CharLMEmbeddings('news-backward')
    ]
    print("loaded word embeddings")
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_states=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    print("loaded document embeddings")

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    print("created classifier")

    # 6. initialize the text classifier trainer
    trainer = TextClassifierTrainer(classifier, corpus, label_dict)

    print("starting training")

    # 7. start the training
    trainer.train('results',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=50)

    print("training finished")
Example #18
def train_flair(dir_name):
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        os.path.join("flair", dir_name),
        test_file='test.csv',
        dev_file='validation.csv',
        train_file='train.csv')
    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=512,
                                                 reproject_words=True,
                                                 reproject_words_dimension=256)
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train(os.path.join("flair", dir_name), max_epochs=10)
Example #19
def test_text_classifier_single_label():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert(l.name is not None)
            assert(0.0 <= l.confidence <= 1.0)
            assert(type(l.confidence) is float)

    # clean up results directory
    shutil.rmtree('./results')
Example #20

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')

# word embeddings: vector representations for individual words
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
]

# document embeddings: combine the word-level embeddings into a single vector representation for the whole document
document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                             hidden_size=512,
                                             reproject_words=True,
                                             reproject_words_dimension=256)

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=True)
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)
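After training, the saved model can be loaded back for prediction, following the same load/predict pattern used in the test examples above (the path is assumed from the training output directory):

from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load_from_file('./final-model.pt')
sentence = Sentence('I love Berlin')
classifier.predict(sentence)  # predict() annotates the sentence in place
print(sentence.labels)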
Example #21
    def train(self, X, y):

        X_text = X[:, self.args.TEXT_COL]
        y = y.flatten()
        #corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)

        train: List[Sentence] = []

        for tweet, label in zip(X_text, y):
            if tweet == '':
                tweet = 'dummy'
            s: Sentence = Sentence(tweet)
            s.add_label(str(label))
            train.append(s)

        corpus: TaggedCorpus = TaggedCorpus(train, train, train)

        # 2. create the label dictionary
        label_dict = corpus.make_label_dictionary()

        # 3. make a list of word embeddings
        word_embeddings = [
            glove_embeddings,
            #twitter_embeddings,
            # comment in this line to use character embeddings
            #CharacterEmbeddings(),
            # comment in flair embeddings for state-of-the-art results
            # FlairEmbeddings('news-forward'),
            fflair,
            # FlairEmbeddings('news-backward'),
            bflair
        ]

        # 4. initialize document embedding by passing list of word embeddings
        document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
        # 5. create the text classifier
        classifier = TextClassifier(document_embeddings,
                                    label_dictionary=label_dict,
                                    multi_label=False)

        # 6. initialize the text classifier trainer
        trainer = ModelTrainer(classifier, corpus)

        self.model = trainer.model
        self.model.save = self.save
        self.model.save_checkpoint = self.save_checkpoint

        # 7. start the training
        trainer.train('../data/ecuador_earthquake_2016/models',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      anneal_factor=0.5,
                      patience=5,
                      max_epochs=5)

        self.clf = classifier
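`glove_embeddings`, `fflair` and `bflair` are not defined in this snippet; they are presumably module-level embeddings, along the lines of this sketch (the exact embedding names are assumptions):

from flair.embeddings import WordEmbeddings, FlairEmbeddings

glove_embeddings = WordEmbeddings('glove')
fflair = FlairEmbeddings('news-forward')
bflair = FlairEmbeddings('news-backward')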
Example #22
sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('training.preprocessed.txt')
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('dev.preprocessed.txt')
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('test.preprocessed.txt')

corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('de-fasttext'),
                   CharLMEmbeddings('german-forward'),
                   CharLMEmbeddings('german-backward')]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_states=32)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)

# 7. start the training
trainer.train('resources/germeval_2018/results',
              learning_rate=0.01,
              mini_batch_size=8,
              max_epochs=30,
              embeddings_in_memory=False)
Example #23

def main(args):
    args = parser.parse_args()

    # 0. Make a list of word embeddings
    if args.method == 'glove':
        word_embeddings = [WordEmbeddings('glove')]
    elif args.method == 'flair':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ]
    elif args.method == 'cui_svd':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings('resources/embeddings/cui2vec100.npy'))
        ]
    elif args.method == 'cui_proj':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_projected_100-100.gensim'))
        ]
    elif args.method == 'mimic':
        word_embeddings = [
            WordEmbeddings(
                'resources/embeddings/mimic3_mixed_embeddings100.gensim')
        ]
    elif args.method == 'cui2vec':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_combined_glove_100dim.gensim'
                ))
        ]
    elif args.method == 'mimic_lm':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('resources/taggers/mimic-forward/best-lm.pt'),
            FlairEmbeddings('resources/taggers/mimic-backward/best-lm.pt')
        ]
    else:
        raise Exception(
            "Received option for method %s that cannot be interpreted." %
            (args.method))

    if 'bg' in args.data_file:
        multi = True
        print(
            "Running in multiple label setting because 'bg' was in the data file name %s"
            % (args.data_file))
    else:
        multi = False

    # 1. get the corpus
    sents: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        args.data_file)
    corpus = TaggedCorpus(sents, None, None)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. split the training data into folds
    num_folds = args.num_folds
    seed = 719
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # random_state only takes effect when shuffle=True
    kf.get_n_splits()

    # 4. iterate over folds:
    total_acc = 0
    fold = 1
    for train_index, test_index in kf.split(corpus.train):
        # 4a. initialize the text classifier trainer
        split_traindev = np.array(corpus.train)[train_index].tolist()
        traindev_size = len(split_traindev)
        train_dev_splitpoint = int(0.9 * traindev_size)
        split_train = split_traindev[:train_dev_splitpoint]
        split_dev = split_traindev[train_dev_splitpoint:]

        split_test = np.array(corpus.train)[test_index].tolist()
        split_corpus = TaggedCorpus(split_train,
                                    dev=split_dev,
                                    test=split_test)

        print("After split, size of splits: train=%d, dev=%d, test=%d" %
              (len(split_train), len(split_dev), len(split_test)))

        # 4b. do training:
        with tempfile.TemporaryDirectory() as model_dir:
            # init document embedding by passing list of word embeddings
            document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
                word_embeddings,
                hidden_size=128,
                reproject_words=True,
                reproject_words_dimension=64,
            )
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict,
                                        multi_label=multi)
            trainer = ModelTrainer(classifier, split_corpus)

            results = trainer.train(model_dir,
                                    embeddings_in_memory=False,
                                    learning_rate=0.1,
                                    mini_batch_size=128,
                                    anneal_factor=0.5,
                                    patience=5,
                                    max_epochs=100)

        fold_acc = results['test_score']
        total_acc += fold_acc
        print(f"Finished fold {fold} with accuracy {fold_acc}")
        fold += 1
    total_acc /= num_folds

    print("Finished with total cross-fold accuracy of %f" % (total_acc))
Example #24

def main(data_folder, benchmark_classifier_folder, new_data_folder,
         finetuned_classifier_folder):
    from flair.embeddings import FlairEmbeddings, DocumentLSTMEmbeddings, BertEmbeddings, DocumentRNNEmbeddings, TransformerDocumentEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.datasets import CSVClassificationCorpus
    from flair.data import Corpus
    import pandas as pd
    import os

    ### First Stage (Train on benchmark dataset)
    benchmark = pd.read_csv(data_folder + "combined_benchmark.csv")
    benchmark = benchmark[['label', 'text']]

    #### Create train, dev and test set
    #benchmark = benchmark.sample(frac=1)  # without a fixed random state, every run gives different training results
    benchmark = benchmark.sample(frac=1, random_state=42)
    benchmark.iloc[0:int(len(benchmark) * 0.8)].to_csv(data_folder +
                                                       'train.csv',
                                                       sep='\t',
                                                       index=False,
                                                       header=False)
    benchmark.iloc[int(len(benchmark) * 0.8):int(len(benchmark) * 0.9)].to_csv(
        data_folder + 'test.csv', sep='\t', index=False, header=False)
    benchmark.iloc[int(len(benchmark) * 0.9):].to_csv(data_folder + 'dev.csv',
                                                      sep='\t',
                                                      index=False,
                                                      header=False)

    #### Build corpus
    column_name_map = {1: "text", 0: "label_topic"}

    corpus: Corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        skip_header=False,  # no header in kaggle data
        delimiter='\t',  # tab-separated rows
        #train_file='train.csv', ## passing in file names manually when it can't auto detect
        #dev_file='dev.csv',
        #test_file='test.csv'
    )

    #### Create word embeddings
    word_embeddings = [
        BertEmbeddings(),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]
    ## caveat: BertEmbeddings and DocumentLSTMEmbeddings existed in version 0.4.5 and became legacy (still available) embeddings in version 0.5

    #### First Stage Fine-tuning
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=512,
                                                 reproject_words=True,
                                                 reproject_words_dimension=256)
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    #trainer.train(benchmark_classifier_folder, max_epochs=1) #offline test use epoch=1
    trainer.train(benchmark_classifier_folder, max_epochs=10)

    ### every fine-tuning run yields slightly different scores
    ### first-stage accuracy matters less; second-stage scores matter more for biasing the model towards indicator-specific keywords

    ### Second Stage (train on hand annotated datasets)
    #### Build corpus

    ### this column_name_map must be updated to reflect which columns store the X (text features) and y (gold labels) used for training
    ### in the csv file contained in new_data_folder, the 2nd column is 'title_desc',
    ### and the 4th column is 'title_desc_sent_1' (where the agreed sentiment annotations are stored)
    new_column_name_map = {1: "text", 3: "label_topic"}
    print(new_column_name_map)

    corpus: Corpus = CSVClassificationCorpus(
        new_data_folder,
        new_column_name_map,
        skip_header=True,
        delimiter=','  # comma separated rows
    )

    #### Second Stage fine-tuning

    benchmark_classifier = TextClassifier.load(
        os.path.join(benchmark_classifier_folder, 'best-model.pt'))
    trainer = ModelTrainer(benchmark_classifier, corpus)
    #trainer.train(finetuned_classifier_folder, max_epochs=1) #offline test use
    trainer.train(finetuned_classifier_folder, max_epochs=10)
Example #25
import datetime
import spacy

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, DocumentLSTMEmbeddings, Sentence

nlp = spacy.load('de')
glove_embedding = WordEmbeddings('de')
flair_embedding_forward = FlairEmbeddings('german-forward')
flair_embedding_backward = FlairEmbeddings('german-backward')

document_pooling_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])

document_lstm_embeddings = DocumentLSTMEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])


def is_blacklisted(word):
    return word in [
        'polizei', 'polizist', 'beamter', 'nr.', 'berlin', 'uhr',
        'polizeimeldung', 'nicht', 'jahr', 'jährige', 'jährig', 'jähriger',
        'polizeiliche', 'polizeilich', '2015', '2016', '2014', '2017', '2018',
        'polizeibeamter', '-', 'u.a.', 'z.b.', 'der', 'die', 'das', 'dem',
        'den', 'diese', 'dieser', 'diesen', 'diesem', 'um', 'für', 'eine',
        'ein', 'einer', 'einen', 'einem', 'anderer', 'andere', 'anderen',
        'anders'
    ]


def is_empty(word):
    return word.strip() == ''
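A small usage sketch for the two filters above, applying them to spaCy lemmas via the `nlp` pipeline loaded earlier (this helper is hypothetical, not part of the original snippet):

def content_words(text):
    # keep lemmas that are neither blacklisted nor empty
    return [token.lemma_.lower() for token in nlp(text)
            if not is_blacklisted(token.lemma_.lower())
            and not is_empty(token.lemma_)]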
Example #26

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, BertEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

word_embeddings = [BertEmbeddings('bert-base-multilingual-cased')]

from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentLSTMEmbeddings

document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=768)

import random
import numpy as np
from numpy import genfromtxt
from run_fasttext import fasttext
from logisticRegression import lr
from run_bert import bert
from keras_SLP import SLP

from cosineSim import cosineSimilarity

# logging configuration
from datetime import datetime

import logging

now = datetime.now().strftime('%Y-%m-%d-%H-%M')

formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')