예제 #1
0
def test_train_tars(tasks_base_path, results_base_path):
    """Smoke-test one training epoch and one prediction with a TARS classifier.

    Args:
        tasks_base_path: Directory containing the ``imdb_underscore`` corpus.
        results_base_path: Location handed to the trainer as ``base_path``.
    """
    label_type = "class"
    imdb_corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    classifier = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # TARS can hold several tasks, so one has to be registered and activated
    # before training.
    class_labels = imdb_corpus.make_label_dictionary(label_type=label_type)
    classifier.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=class_labels,
        label_type=label_type,
    )

    training_options = {
        "base_path": results_base_path,
        "learning_rate": 0.02,
        "mini_batch_size": 1,
        "max_epochs": 1,
    }
    ModelTrainer(classifier, imdb_corpus).train(**training_options)

    # The prediction only has to run without raising; nothing is asserted on it.
    classifier.predict(Sentence("This is great!"))
예제 #2
0
def test_init_tars_and_switch(tasks_base_path):
    """Check the active label dictionary size after each TARS task switch.

    Labels are supplied in every form exercised here: a label dictionary at
    construction time, a single string, a list, a set and a label dictionary
    again.

    Args:
        tasks_base_path: Directory containing the ``imdb`` corpus.
    """
    imdb = ClassificationCorpus(tasks_base_path / "imdb")

    classifier = TARSClassifier(
        task_name='2_CLASS',
        label_dictionary=imdb.make_label_dictionary(label_type='class'),
        label_type='class',
    )
    # the task given at construction time is expected to hold two labels
    assert len(classifier.get_current_label_dictionary()) == 2

    # (task name, labels, expected dictionary size) for labels given as a
    # single string, as a list and as a set
    literal_tasks = (
        ('1_CLASS', 'one class', 1),
        ('3_CLASS', ['list 1', 'list 2', 'list 3'], 3),
        ('4_CLASS', {'set 1', 'set 2', 'set 3', 'set 4'}, 4),
    )
    for task_name, labels, expected_size in literal_tasks:
        classifier.add_and_switch_to_new_task(task_name, labels, "testlabel")
        assert len(classifier.get_current_label_dictionary()) == expected_size

    # finally, labels given as a label dictionary built from the corpus
    corpus_labels = imdb.make_label_dictionary(label_type='class')
    classifier.add_and_switch_to_new_task('2_CLASS_AGAIN', corpus_labels, "testlabel")
    assert len(classifier.get_current_label_dictionary()) == 2
예제 #3
0
def run_splits(word_embeddings, embeddings_name):
    """Train one single-label text classifier per data split (splits 1 to 5).

    Args:
        word_embeddings: Passed as the first argument to
            ``DocumentLSTMEmbeddings``.
        embeddings_name: Appended to each split folder to form the location
            given to the trainer.
    """
    banner = '#' * 10
    for split_no in range(1, 6):
        print(banner)
        print('Split', split_no)
        print(banner)

        split_dir = f'<path_to_splits>/split_{split_no}/'
        split_corpus = ClassificationCorpus(
            split_dir,
            train_file='train.csv',
            dev_file='dev.csv',
            test_file='test.csv',
        )

        # a fresh document embedding and classifier are built for every split
        doc_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
        labels = split_corpus.make_label_dictionary()
        model = TextClassifier(doc_embeddings,
                               label_dictionary=labels,
                               multi_label=False)

        output_dir = split_dir + '/' + embeddings_name
        ModelTrainer(model, split_corpus).train(output_dir, max_epochs=150)
예제 #4
0
def train_model(data_dir, max_epochs):
    """Train a GloVe + RNN text classifier, reporting progress via ``st.write``.

    Args:
        data_dir: Location handed to ``ClassificationCorpus``.
        max_epochs: Forwarded to the trainer as ``max_epochs``.
    """
    st.write('Creating word corpus for training...')
    training_corpus = ClassificationCorpus(data_dir)
    labels = training_corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    # only GloVe word embeddings are stacked
    glove_only = [WordEmbeddings('glove')]
    rnn_options = dict(hidden_size=512,
                       reproject_words=True,
                       reproject_words_dimension=256)
    doc_embeddings = DocumentRNNEmbeddings(glove_only, **rnn_options)
    st.write('Done')

    st.write('Preparing')
    model = TextClassifier(doc_embeddings, label_dictionary=labels)
    ModelTrainer(model, training_corpus).train(
        'model-saves',
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=8,
        max_epochs=max_epochs,
        checkpoint=True,
    )
    st.write('Model Training Finished!')
예제 #5
0
    def train(self,
              learning_rate: float = 0.1,
              mini_batch_size: int = 16,
              anneal_factor: float = 0.5,
              patience: int = 5,
              max_epochs: int = 10):
        """Create the corpus files, then train a GloVe pooled-embedding classifier.

        ``self.make_corpus()`` is called first; the corpus is then read from
        ``self.output_data_path`` (``train.txt``, ``dev.txt``, ``test.txt``)
        and ``self.model_path`` is passed to the trainer as its first argument.

        :param learning_rate: forwarded to the trainer under the same keyword
        :param mini_batch_size: forwarded to the trainer under the same keyword
        :param anneal_factor: forwarded to the trainer under the same keyword
        :param patience: forwarded to the trainer under the same keyword
        :param max_epochs: forwarded to the trainer under the same keyword
        :return: None
        """
        self.make_corpus()

        split_files = {
            'train_file': 'train.txt',
            'dev_file': 'dev.txt',
            'test_file': 'test.txt',
        }
        text_corpus = ClassificationCorpus(self.output_data_path, **split_files)
        labels = text_corpus.make_label_dictionary()

        pooled = DocumentPoolEmbeddings([WordEmbeddings('glove')])
        model = TextClassifier(pooled, label_dictionary=labels)

        hyper_parameters = dict(
            learning_rate=learning_rate,
            mini_batch_size=mini_batch_size,
            anneal_factor=anneal_factor,
            patience=patience,
            max_epochs=max_epochs,
        )
        ModelTrainer(model, text_corpus).train(self.model_path, **hyper_parameters)
예제 #6
0
def test_train_tars(tasks_base_path):
    """Run a single TARS training epoch on ``imdb_underscore`` and predict once.

    Args:
        tasks_base_path: Directory containing the ``imdb_underscore`` corpus.
    """
    corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")
    tars = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # TARS supports multiple tasks, so the one to train must be registered
    # and made active first
    task_labels = corpus.make_label_dictionary(label_type='class')
    tars.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=task_labels,
        label_type='class',
    )

    trainer = ModelTrainer(tars, corpus)
    # mini_batch_chunk_size may additionally be set if the transformer is too
    # much for the machine
    trainer.train(
        base_path='resources/taggers/trec',  # where the model artifacts are stored
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=1,  # a single epoch is enough for this smoke test
    )

    # prediction is only required to run; no assertion is made on its result
    probe = Sentence("This is great!")
    tars.predict(probe)
예제 #7
0
def test_text_classifier_multi(results_base_path, tasks_base_path):
    """Train a multi-label "city" classifier, reload it and verify predictions.

    Args:
        results_base_path: Location given to the trainer; ``final-model.pt``
            is loaded back from it afterwards.
        tasks_base_path: Directory containing
            ``trivial/trivial_text_classification_multi``.
    """
    flair.set_seed(123)

    label_type = "city"
    data_dir = tasks_base_path / "trivial" / "trivial_text_classification_multi"
    corpus = ClassificationCorpus(data_dir, label_type=label_type)
    city_labels = corpus.make_label_dictionary(label_type=label_type)

    pooled_embeddings = DocumentPoolEmbeddings([turian_embeddings],
                                               fine_tune_mode="linear")
    model = TextClassifier(
        document_embeddings=pooled_embeddings,
        label_dictionary=city_labels,
        label_type=label_type,
        multi_label=True,
    )

    ModelTrainer(model, corpus).train(
        results_base_path,
        mini_batch_size=2,
        max_epochs=50,
        shuffle=True,
    )

    # the trained model must cope with a regular sentence, a mixed batch and
    # a batch holding only a whitespace sentence
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")
    for batch in (sentence, [sentence, sentence_empty], [sentence_empty]):
        model.predict(batch)

    # reload the model from the results location
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check that the reloaded model predicts the correct labels
    single_label = Sentence("this is Berlin")
    double_label = Sentence("this is Berlin and pizza")
    loaded_model.predict([single_label, double_label])

    predicted_values = []
    for prediction in double_label.labels:
        assert prediction.value is not None
        assert 0.0 <= prediction.score <= 1.0
        assert type(prediction.score) is float
        predicted_values.append(prediction.value)

    for expected in ("Berlin", "pizza"):
        assert expected in predicted_values

    # the reloaded model is expected to fit the test split perfectly
    result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
예제 #8
0
def train():
    """Train a single-label classifier on GloVe plus fast Flair embeddings.

    The corpus is read from the externally defined ``sst_folder`` and
    ``model_path`` is passed to the trainer as its first argument.
    """
    # NOTE(review): the train split is read from 'sst_dev.csv' - confirm this
    # file name is intended.
    sst_corpus = ClassificationCorpus(
        sst_folder,
        train_file='sst_dev.csv',
        dev_file='dev.csv',
        test_file='test.csv',
    )
    labels = sst_corpus.make_label_dictionary()

    # candidate embeddings in stacking order; falsy entries are dropped
    candidates = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    stacked = [embedding for embedding in candidates if embedding]

    # document embedding built on top of the stacked word embeddings
    rnn_embeddings = DocumentRNNEmbeddings(
        stacked,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    model = TextClassifier(rnn_embeddings,
                           label_dictionary=labels,
                           multi_label=False)

    ModelTrainer(model, sst_corpus).train(model_path,
                                          max_epochs=10,
                                          train_with_dev=False)
예제 #9
0
파일: classifier.py 프로젝트: yyht/daga
def predict(args):
    """Predict a label for every test sentence and write the top one to a file.

    The model is loaded from ``args.model_dir``/``args.model_file``. The test
    corpus is read with ``ClassificationCorpus`` when ``args.one_per_line`` is
    truthy and with ``FlyClassificationCorpus`` otherwise. For each sentence
    that received at least one label, a line ``"<value> <score>"`` (score with
    four decimals) is written to ``args.output_file``; sentences without
    labels produce no line.

    Args:
        args: Namespace providing ``model_dir``, ``model_file``,
            ``one_per_line``, ``data_dir``, ``test_file``, ``output_file``
            and, when ``one_per_line`` is falsy, ``comment_symbol`` and
            ``label_symbol``.

    Raises:
        AssertionError: If ``one_per_line`` is falsy and ``label_symbol`` is
            ``None``.
    """
    model = TextClassifier.load(os.path.join(args.model_dir, args.model_file))
    logger.info(f'Model: "{model}"')

    if args.one_per_line:
        corpus: Corpus = ClassificationCorpus(
            args.data_dir,
            test_file=args.test_file,
        )
    else:
        assert args.label_symbol is not None
        corpus: Corpus = FlyClassificationCorpus(
            args.data_dir,
            test_file=args.test_file,
            comment_symbol=args.comment_symbol,
            label_symbol=args.label_symbol,
        )

    # The context manager guarantees the output file is closed, also when
    # prediction raises part-way through.
    with io.open(args.output_file, "w", encoding="utf-8",
                 errors="ignore") as fout:
        logger.info("Saving to %s", args.output_file)

        start_time = time.time()
        for i in range(len(corpus.test)):
            sentence = corpus.test[i]
            model.predict(sentence)
            if sentence.labels:
                top = sentence.labels[0]
                fout.write(f"{top.value} {top.score:.4f}\n")
                # flush per sentence so already written results reach the
                # file even if a later prediction fails
                fout.flush()

    logger.info("End of prediction: time %.1f min",
                (time.time() - start_time) / 60)
예제 #10
0
class ClassificationCorpusAnalysis(CorpusAnalysis):
    """Analysis helpers for a text-classification corpus.

    The corpus is either supplied directly or loaded from ``path``; all of
    its sentences are collected once at construction time.
    """

    def __init__(self,
                 path: Union[Path, str],
                 column_name_map: dict = None,
                 corpus: Corpus = None,
                 **corpus_params):
        """Remember the corpus location and load (or adopt) the corpus.

        :param path: existing file-system location of the corpus; also the
            base directory for figures saved by :meth:`class_distribution`
        :param column_name_map: when truthy, the corpus is loaded with
            ``CSVClassificationCorpus`` using this mapping
        :param corpus: ready-made corpus; when truthy, nothing is loaded
        :param corpus_params: extra keyword arguments for the corpus
            constructor
        :raises AssertionError: if ``path`` does not exist
        """
        if isinstance(path, str):
            path = Path(path)
        assert path.exists()

        self.path = path
        if corpus:
            self.corpus = corpus
        else:
            if column_name_map:
                self.corpus = CSVClassificationCorpus(self.path,
                                                      column_name_map,
                                                      **corpus_params)
            else:
                self.corpus = ClassificationCorpus(self.path, **corpus_params)
        self.sentences = self.corpus.get_all_sentences()
        print(self.corpus)

    def class_distribution(self,
                           multiclass: bool = False,
                           nr_classes: int = 10,
                           savefig_file=None,
                           **kwargs):
        """Plot the number of sentences per class and show the figure.

        :param multiclass: ``False`` draws a pie chart, ``True`` a bar chart
        :param nr_classes: number of most frequent classes plotted
            individually
        :param savefig_file: optional file name; when given, the figure is
            saved as ``self.path / savefig_file`` at 600 dpi
        :param kwargs: forwarded to the pandas plotting call
        """
        class_count = Corpus._get_class_to_count(self.sentences)
        # one row per class, most frequent class first
        class_count = pd.DataFrame.from_dict(class_count,
                                             orient='index',
                                             columns=['count']).sort_values(
                                                 'count', ascending=False)

        # plot distribution
        class_count_top = class_count[:nr_classes].copy()
        if not multiclass:
            if nr_classes < len(class_count):
                # everything beyond the top classes is summed into one slice
                class_count_top.loc['others'] = class_count[nr_classes:].sum()
            # pie plot class_count
            class_count_top.plot.pie(y='count', **kwargs)
            plt.legend(labels=class_count_top.index,
                       bbox_to_anchor=(1, 0, 0.1, 1),
                       loc='center right')
        else:
            class_count_top.plot.bar(y='count', **kwargs)
            plt.gca().yaxis.grid(True, linestyle='--')

        plt.tight_layout()
        if savefig_file:
            plt.savefig(self.path / savefig_file, dpi=600)
        plt.show()

    def example_document_for_classes(self, ):
        """Placeholder; not implemented yet."""
        # Todo!
        pass
def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):
    """Train a single-label RNN text classifier on ELMo and Flair news embeddings.

    Args:
        rootdir: Corpus folder passed to ``ClassificationCorpus``.
        train: Name of the training file inside ``rootdir``.
        dev: Name of the development file inside ``rootdir``.
        test: Name of the test file inside ``rootdir``.
        num_epochs: Forwarded to the trainer as ``max_epochs``.
        device: Passed to ``torch.device``; the result is installed as
            ``flair.device``.
        outputdir: Passed to the trainer as its first argument.
    """
    flair.device = torch.device(device)

    corpus = ClassificationCorpus(rootdir,
                                  train_file=train,
                                  dev_file=dev,
                                  test_file=test,
                                  in_memory=False)

    label_dict = corpus.make_label_dictionary()

    # Only the embeddings that are actually stacked are instantiated, so no
    # embedding is constructed just to be thrown away.
    # filter(None, ...) drops falsy entries from the stack.
    word_embeddings = list(filter(None, [
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # Note this will kick off model generation that will take a long time (several hours)
    # This will produce final-model.pt and best-model.pt files which represent a stored trained model.
    trainer = ModelTrainer(classifier, corpus)
    trainer.train(outputdir, max_epochs=num_epochs)
예제 #12
0
def train(args):
    """Train a text classifier configured entirely from ``args``.

    The corpus is read with ``FlyClassificationCorpus`` unless
    ``args.one_per_line`` is truthy, in which case ``ClassificationCorpus``
    is used. Embeddings, model initialisation and the optimizer come from the
    project's ``utils`` helpers. The elapsed time is logged at the end.

    Args:
        args: Namespace carrying the data, model and training settings read
            below.

    Raises:
        AssertionError: If ``one_per_line`` is falsy and ``label_symbol`` is
            ``None``.
    """
    began = time.time()

    if not args.one_per_line:
        assert args.label_symbol is not None
        corpus: Corpus = FlyClassificationCorpus(
            args.data_dir,
            train_file=args.train_file,
            dev_file=args.dev_file,
            comment_symbol=args.comment_symbol,
            label_symbol=args.label_symbol,
        )
    else:
        corpus: Corpus = ClassificationCorpus(
            args.data_dir,
            train_file=args.train_file,
            dev_file=args.dev_file,
        )

    labels = corpus.make_label_dictionary()
    vocabulary = corpus.make_vocab_dictionary().get_items()
    token_embeddings = utils.init_embeddings(vocabulary, args)

    rnn_settings = dict(
        hidden_size=args.hidden_size,
        use_attn=args.use_attn,
        num_heads=args.num_heads,
        scaling=args.scaling,
        pooling_operation=args.pooling_operation,
        use_sent_query=args.use_sent_query,
    )
    doc_embeddings = DocumentRNNEmbeddings([token_embeddings], **rnn_settings)

    classifier = TextClassifier(doc_embeddings, label_dictionary=labels)
    utils.init_model(classifier, args)

    optimizer = utils.optim_method(args.optim)
    trainer: ModelTrainer = ModelTrainer(classifier, corpus, optimizer)

    training_settings = dict(
        mini_batch_size=args.mini_batch_size,
        max_epochs=args.max_epochs,
        anneal_factor=args.anneal_factor,
        learning_rate=args.learning_rate,
        patience=args.patience,
        min_learning_rate=args.min_learning_rate,
        embeddings_storage_mode=args.embeddings_storage_mode,
    )
    trainer.train(args.model_dir, **training_settings)

    elapsed_minutes = (time.time() - began) / 60
    logger.info("End of training: time %.1f min", elapsed_minutes)
예제 #13
0
    def __init__(self,
                 path: Union[Path, str],
                 column_name_map: dict = None,
                 corpus: Corpus = None,
                 **corpus_params):
        """Remember the corpus location and load (or adopt) the corpus.

        :param path: existing file-system location of the corpus
        :param column_name_map: when truthy, the corpus is loaded with
            ``CSVClassificationCorpus`` using this mapping
        :param corpus: ready-made corpus; when truthy, nothing is loaded
        :param corpus_params: extra keyword arguments for the corpus
            constructor
        :raises AssertionError: if ``path`` does not exist
        """
        location = Path(path) if isinstance(path, str) else path
        assert location.exists()
        self.path = location

        # precedence: explicit corpus, then CSV loader, then default loader
        if corpus:
            self.corpus = corpus
        elif column_name_map:
            self.corpus = CSVClassificationCorpus(self.path,
                                                  column_name_map,
                                                  **corpus_params)
        else:
            self.corpus = ClassificationCorpus(self.path, **corpus_params)

        self.sentences = self.corpus.get_all_sentences()
        print(self.corpus)
예제 #14
0
 def _train_model(self):
     # type: () -> None
     """Train a single-label classifier on ELMo plus fast Flair embeddings.

     The corpus lives in ``__path_to_base__``; only the base names of
     ``self.path_to_test``, ``self.path_to_dev`` and ``self.path_to_train``
     are used to locate the split files. ``__path_to_base__`` is also passed
     to the trainer as its first argument.
     """
     base_dir = Path(__path_to_base__)
     split_files = {
         'test_file': os.path.basename(self.path_to_test),
         'dev_file': os.path.basename(self.path_to_dev),
         'train_file': os.path.basename(self.path_to_train),
     }
     text_corpus = ClassificationCorpus(base_dir, **split_files)

     stacked = [
         ELMoEmbeddings('original'),
         FlairEmbeddings('news-forward-fast'),
         FlairEmbeddings('news-backward-fast'),
     ]
     rnn_embeddings = DocumentRNNEmbeddings(
         stacked,
         hidden_size=512,
         reproject_words=True,
         reproject_words_dimension=256,
     )

     labels = text_corpus.make_label_dictionary()
     model = TextClassifier(rnn_embeddings,
                            label_dictionary=labels,
                            multi_label=False)

     ModelTrainer(model, text_corpus).train(__path_to_base__, max_epochs=10)
예제 #15
0
def test_init_tars_and_switch(tasks_base_path):
    """Verify the active label count of a TARS model across task switches.

    Args:
        tasks_base_path: Directory containing the ``imdb`` corpus.
    """
    corpus = ClassificationCorpus(tasks_base_path / "imdb")

    def corpus_labels():
        """Build a fresh "class" label dictionary from the corpus."""
        return corpus.make_label_dictionary(label_type="class")

    tars = TARSClassifier(
        task_name="2_CLASS",
        label_dictionary=corpus_labels(),
        label_type="class",
    )

    def switch_to(task_name, labels):
        """Register ``labels`` under ``task_name`` and make the task active."""
        tars.add_and_switch_to_new_task(task_name, labels, "testlabel")

    def active_label_count():
        """Return the size of the currently active label dictionary."""
        return len(tars.get_current_label_dictionary())

    # task defined at construction time
    assert active_label_count() == 2

    # a single label given as a plain string
    switch_to("1_CLASS", "one class")
    assert active_label_count() == 1

    # three labels given as a list
    switch_to("3_CLASS", ["list 1", "list 2", "list 3"])
    assert active_label_count() == 3

    # four labels given as a set
    switch_to("4_CLASS", {"set 1", "set 2", "set 3", "set 4"})
    assert active_label_count() == 4

    # two labels given as a label dictionary
    switch_to("2_CLASS_AGAIN", corpus_labels())
    assert active_label_count() == 2
예제 #16
0
def load_corpus():
    """Load the k16 corpus together with a hand-built two-label dictionary.

    The label dictionary is not derived from the corpus: it is created with
    ``add_unk=False``, marked as single-label and filled with the items
    ``'0'`` and ``'1'``.

    Returns:
        A ``(corpus, label_dictionary)`` tuple.
    """
    labels: Dictionary = Dictionary(add_unk=False)
    labels.multi_label = False
    for item in ('0', '1'):
        labels.add_item(item)

    # folder holding the train and dev files named below
    corpus_dir = 'datasets/constrained_classification/k16'
    loaded: Corpus = ClassificationCorpus(
        corpus_dir,
        train_file='fasttext.train',
        dev_file='fasttext.valid',
    )

    return loaded, labels
예제 #17
0
# Export the test split to '<data_folder>/test.txt' as space-separated
# "label text" rows, without index column or header line.
# NOTE(review): read_mongo presumably returns a pandas DataFrame (it is used
# via .to_csv) - confirm against utils.mgdb.
test_data = utils.mgdb.read_mongo('raw_data_test')
test_data.to_csv(path.join(data_folder, 'test.txt'),
                 sep=' ',
                 index=False,
                 header=False,
                 columns=['label', 'text'])

# Same export for the dev split, written to '<data_folder>/dev.txt'.
dev_data = utils.mgdb.read_mongo('raw_data_dev')
dev_data.to_csv(path.join(data_folder, 'dev.txt'),
                sep=' ',
                index=False,
                header=False,
                columns=['label', 'text'])

#%%
# Load the corpus and fail fast when the train or test split is empty.
# NOTE(review): the corpus is read from the literal 'data/splitted_data'
# while the files above are written to data_folder - confirm both point to
# the same directory.
corpus: Corpus = ClassificationCorpus('data/splitted_data')
if len(corpus.train) == 0 or len(corpus.test) == 0:
    raise Exception('Creating corpus failed')

#%%
# GloVe word embeddings feeding an RNN document embedding.
word_embeddings = [WordEmbeddings('glove')]

document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Label dictionary built from the loaded corpus.
label_dict = corpus.make_label_dictionary()
예제 #18
0
# Script: load a classification corpus from data_folder and train a
# GloVe + RNN text classifier on it.
data_folder = 'content_folder'
from flair.data import Corpus
from flair.datasets import ClassificationCorpus

# Corpus is loaded from data_folder with the loader's default file names.
corpus: Corpus = ClassificationCorpus(data_folder)

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.trainers import ModelTrainer
from flair.models import TextClassifier

# Label dictionary derived from the corpus; the classifier uses a document
# RNN (hidden size 256) over GloVe word embeddings.
label_dict = corpus.make_label_dictionary()
word_embeddings = [WordEmbeddings('glove')]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)
# '/content/data' is passed as the trainer's first argument; training runs
# for at most 150 epochs.
trainer.train('/content/data',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)
예제 #19
0
                                               articles_train,
                                               train=True)
    tokens_valid, labels_valid = return_annotated_articles(
        params["dev_labels_file"], articles_valid, train=True)
    tokens_test, labels_test = return_annotated_articles(
        params["test_labels_file"], articles_test, train=True)

    write_to_file_flair_corpus(params['data_bert_format_dir'] + 'train.txt',
                               tokens, labels)
    write_to_file_flair_corpus(params['data_bert_format_dir'] + 'dev.txt',
                               tokens_valid, labels_valid)
    write_to_file_flair_corpus(params['data_bert_format_dir'] + 'test.txt',
                               tokens_test, labels_test)
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus = ClassificationCorpus(params['data_bert_format_dir'],
                                  train_file='train.txt',
                                  test_file='test.txt',
                                  dev_file='dev.txt')

    corpus.filter_empty_sentences()
    print(corpus)

    label_dictionary = corpus.make_label_dictionary()

    print(label_dictionary)

    flat_labels = [item for sublist in labels for item in sublist]
    class_weights = compute_class_weight('balanced', np.unique(flat_labels),
                                         flat_labels)
    unique_labels = np.unique(flat_labels)
    weights = {}
    for i in range(len(unique_labels)):
예제 #20
0
        dev_data['preprocessed'] = tweet_preprocessing.preprocess_data(
            dev_data['content'], 'embedding')
        if B_TEST_PHASE is True:
            test_data['preprocessed'] = tweet_preprocessing.preprocess_data(
                test_data['content'], 'embedding')

        utils.csv2ftx(train_data.content, train_data.sentiment, S_DATASET,
                      'train', 'flair')
        utils.csv2ftx(dev_data.content, dev_data.sentiment, S_DATASET, 'dev',
                      'flair')
        utils.csv2ftx(test_data.content, test_data.sentiment, S_DATASET,
                      'test', 'flair')

        # Bind only `corpus`: a chained `corpus = Corpus = ...` would also
        # rebind the name `Corpus` to this corpus instance.
        corpus = ClassificationCorpus(
            '../dataset/flair/',
            train_file='intertass_{}_train.txt'.format(S_DATASET),
            dev_file='intertass_{}_dev.txt'.format(S_DATASET),
            test_file='intertass_{}_test.txt'.format(S_DATASET))

        # class_weights = compute_class_weight('balanced', [0, 1, 2, 3], y=train_data.sentiment)
        # dict_weights = dict()
        # for i, label in enumerate(class_weights):
        #     dict_weights.update({str(label): class_weights[i]})

        # word_embeddings = [BertEmbeddings('bert-base-multilingual-cased')]
        word_embeddings = [
            BertEmbeddings('dccuchile/bert-base-spanish-wwm-cased')
        ]

        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,