import shutil

import flair.datasets
from flair.data import MultiCorpus
from flair.embeddings import DocumentRNNEmbeddings
from flair.models import SequenceTagger, TextClassifier
from flair.trainers import ModelTrainer


def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    # `flair_embeddings` is assumed to be a module-level fixture, as in the
    # flair test suite
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [flair_embeddings], 128, 1, False
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    del trainer, model
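    # restore the trainer (model plus training state) from the checkpoint
    # and continue training where the first run stopped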
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt",
                                           corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer


def test_train_resume_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb",
                                                 label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # same assumed module-level `flair_embeddings` fixture as above
    document_embeddings = DocumentRNNEmbeddings([flair_embeddings], 128, 1,
                                                False)
    model = TextClassifier(document_embeddings=document_embeddings,
                           label_dictionary=label_dict,
                           multi_label=False,
                           label_type="topic")

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    del trainer, model
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt",
                                           corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer


def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(data_folder=tasks_base_path /
                                           "fashion",
                                           column_format={
                                               0: "text",
                                               2: "ner"
                                           })
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

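    # `turian_embeddings` is assumed to be a module-level fixture, as in the
    # flair test suite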
    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    del trainer, model
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt",
                                           corpus)

    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer


from typing import List

from flair.data import Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import (CharacterEmbeddings, FlairEmbeddings,
                              StackedEmbeddings, TokenEmbeddings,
                              WordEmbeddings)


def train(args, tag_type):
    '''
    Training script for the NER model.

    Parameters
    ----------
    args : argparse.Namespace
        arguments passed to the parser on the CLI
    tag_type : str
        tag type to train on, e.g. 'ner'
    '''
    data_dir = args.input_dir + '/data'
    corpus = ColumnCorpus(data_folder=data_dir,
                          column_format={
                              0: 'text',
                              1: 'ner'
                          },
                          train_file=args.train_file,
                          test_file=args.test_file,
                          dev_file=args.dev_file)

    # print(corpus.train[0])
    # print(corpus)

    # tag_type = 'ner'

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        FlairEmbeddings(args.flair_model_name_or_path_forward),
        FlairEmbeddings(args.flair_model_name_or_path_backward),
    ]
    if args.character_embeddings:
        embedding_types.insert(1, CharacterEmbeddings())

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # initialize sequence tagger

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.train_or_predict == "continue_train":
        print("continue training")
        # NOTE: the checkpoint path is hardcoded
        checkpoint = '/Users/titashneogi/workspace/NLP/NER/data/flair/cumulative_model/checkpoint.pt'
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # start training
    trainer.train(args.model_dir,
                  learning_rate=args.train_learning_rate,
                  mini_batch_size=args.per_gpu_batch_size,
                  max_epochs=args.num_train_epochs,
                  embeddings_storage_mode=args.embeddings_storage_mode)

    model = SequenceTagger.load(args.model_dir + '/final-model.pt')
    if args.predict_file:
        with open(data_dir + '/' + args.predict_file, 'r') as f:
            str_file = f.read()

        sentence = Sentence(str_file)

        model.predict(sentence)
        print(sentence.to_tagged_string())
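

# A minimal sketch (assumption, not part of the original script) of the
# argparse wiring `train` expects; the flag names mirror the `args`
# attributes referenced above, while the defaults are illustrative.
def build_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', required=True)
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--train_file', default='train.txt')
    parser.add_argument('--dev_file', default='dev.txt')
    parser.add_argument('--test_file', default='test.txt')
    parser.add_argument('--predict_file', default=None)
    parser.add_argument('--character_embeddings', action='store_true')
    parser.add_argument('--flair_model_name_or_path_forward',
                        default='news-forward')
    parser.add_argument('--flair_model_name_or_path_backward',
                        default='news-backward')
    parser.add_argument('--train_or_predict', default='train')
    parser.add_argument('--train_learning_rate', type=float, default=0.1)
    parser.add_argument('--per_gpu_batch_size', type=int, default=32)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--embeddings_storage_mode', default='cpu')
    return parser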

Example #5

from torch.optim import Adam

from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import (BertEmbeddings, CharacterEmbeddings,
                              DocumentRNNEmbeddings, FlairEmbeddings,
                              TransformerDocumentEmbeddings, WordEmbeddings)
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


def train(
    review_category,
    params,
    update_model=False,
    learning_rate=0.01,
    embeddings_storage_mode='gpu',
    checkpoint=True,
    batch_growth_annealing=True,
    weight_decay=1e-4,
    shuffle=True,
    train_with_dev=True,
    mini_batch_size=2,
    maxi_batch_size=128,
    anneal_factor=0.5,
    patience=2,
    max_epochs=150,
):
    review_category = str(review_category)
    print('loading training corpus from %s' % params.data_folder)
    corpus: Corpus = ClassificationCorpus(params.data_folder,
                                          train_file=review_category + '_train.txt',
                                          test_file=review_category + '_test.txt',
                                          dev_file=review_category + '_dev.txt')
    label_dict = corpus.make_label_dictionary()
    print('labels: ', label_dict)
    if eval(params.transformer):
        print('initializing transformer document embeddings using %s ...'
              % params.transformer_pretrain_lm)
        # initialize transformer document embeddings (many models are available)
        document_embeddings = TransformerDocumentEmbeddings(
            params.transformer_pretrain_lm, fine_tune=True)
    else:
        print('initializing document embeddings')
        word_embeddings = [
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
            # TransformerXLEmbeddings(),
            # RoBERTaEmbeddings(),
            # XLNetEmbeddings(),
        ]
        # DocumentRNNEmbeddings supports several RNN types (GRU by default;
        # change with the rnn_type parameter)
        document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
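        # e.g. (illustrative, not from the original) an LSTM variant:
        #   document_embeddings = DocumentRNNEmbeddings(word_embeddings,
        #                                               hidden_size=512,
        #                                               rnn_type='LSTM')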
    if not update_model:
        print('building review_analysis classifier ...')
        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
        # initialize the text classifier trainer
        print("initializing review_analysis classifier's trainer")
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    else:
        # resume training from an earlier checkpoint
        checkpoint_path = params.checkpoint_dir + '/%s/checkpoint.pt' % review_category
        print('loading checkpoint from %s' % checkpoint_path)
        trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus)
    ####### training the model
    print("training the review_category: %s model ..." % review_category)
    try:
        trainer.train(params.checkpoint_dir + '/%s' % review_category,
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs)
    except Exception:  # typically a GPU out-of-memory error
        print('chunking batch ... by %d' % params.mini_batch_chunk_size)
        trainer.train(params.checkpoint_dir + '/%s' % review_category,
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      mini_batch_chunk_size=params.mini_batch_chunk_size)
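
# A minimal sketch (assumption, not from the original source) of the `params`
# object this function expects; the attribute names are taken from the
# references above, the values are illustrative:
#
#   from types import SimpleNamespace
#   params = SimpleNamespace(
#       data_folder='data/reviews',
#       transformer='False',  # parsed with eval() above, so pass a string
#       transformer_pretrain_lm='bert-base-uncased',
#       checkpoint_dir='checkpoints',
#       mini_batch_chunk_size=8,
#   )
#   train('electronics', params)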

Example #6

        pre_process(down_sample=1.0, equal_sets=True)

    corpus = CSVClassificationCorpus(data_folder='data',
                                     column_name_map={
                                         0: "label",
                                         1: "text"
                                     },
                                     delimiter='\t',
                                     skip_header=True,
                                     test_file='test.tsv',
                                     dev_file='dev.tsv',
                                     train_file='train.tsv')

    if path.isfile('results/checkpoint.pt'):
        print("Starting from checkpoint")
        trainer = ModelTrainer.load_checkpoint('results/checkpoint.pt', corpus)
    else:
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ]
        document_embeddings = DocumentRNNEmbeddings(
            embeddings=word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
            rnn_type="LSTM")
        weights = load_weights()
        # plausible completion of the truncated call (assumption): use the
        # loaded weights as class loss weights and build a fresh trainer
        classifier = TextClassifier(
            document_embeddings=document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            loss_weights=weights)
        trainer = ModelTrainer(classifier, corpus)

Example #7

def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    #document_embeddings = DocumentPoolEmbeddings([
    #    stacked_embedding,
    #    FlairEmbeddings('it-forward'),
    #    FlairEmbeddings('it-backward')],pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        #checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
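
# example invocation (illustrative paths and values, not from the original):
#   trainer(Path('data/sentiment'), ('train.txt', 'dev.txt', 'test.txt'),
#           checkpoint='', stack='bert', n_epochs=10)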

Example #8

    document_embeddings = DocumentPoolEmbeddings(
        word_embeddings,
        pooling='mean',
        fine_tune_mode='nonlinear',
    )

    # define the model: a text classifier over the pooled document embeddings
    tagger: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dictionary,
        multi_label=False)

    checkpoint = os.path.join(tagger_output_directory, 'checkpoint.pt')
    if os.path.isfile(checkpoint):
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)
    else:
        trainer = ModelTrainer(tagger, corpus)

    # train model
    trainer.train(
        tagger_output_directory,
        checkpoint=True,
        learning_rate=0.7,
        dropout=0.25,
        mini_batch_size=64,  # decrease to prevent GPU memory errors; increase to improve learning speed
        monitor_test=False,
        monitor_train=False,
        patience=2,  # number of unsuccessful epochs before the learning rate is annealed
    )

Example #9

                                 column_name_map={
                                     0: 'label',
                                     1: 'text'
                                 },
                                 skip_header=False,
                                 delimiter='\t',
                                 in_memory=False,
                                 max_tokens_per_doc=1000 * 10)

import os

import torch
import flair

flair.device = torch.device('cuda:0')

from flair.datasets import DataLoader
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# `corpus` and `logs_path` are assumed to be defined by the truncated code
# above; restore the trainer from the checkpoint and score the test split
# with the restored model.
mt = ModelTrainer.load_checkpoint(os.path.join(logs_path, "checkpoint.pt"),
                                  corpus)

test_results, test_loss = mt.model.evaluate(
    DataLoader(
        corpus.test,
        batch_size=4,
        num_workers=4,
    ),
    out_path=os.path.join(logs_path, "test.tsv"),
    embedding_storage_mode="none",
)

with open(os.path.join(logs_path, "test.txt"), "w") as f:
    f.write(str(test_results.main_score) + "\n\n")
    f.write(str(test_results.log_header) + "\n")
    f.write(str(test_results.log_line) + "\n\n")