def _train(self, corpus: Corpus, params: dict, base_path: Path,
               max_epochs: int, optimization_value: str):
        label_dict = corpus.make_label_dictionary()
        for sent in corpus.get_all_sentences():
            sent.clear_embeddings()
        model = self._set_up_model(params, label_dict)
        training_parameters = {
            key: value
            for key, value in params.items() if key in TRAINING_PARAMETERS
        }
        model_trainer_parameters = {
            key: value
            for key, value in params.items()
            if key in MODEL_TRAINER_PARAMETERS and key != 'model'
        }
        trainer: ModelTrainer = ModelTrainer(model, corpus,
                                             **model_trainer_parameters)
        results = trainer.train(base_path,
                                max_epochs=max_epochs,
                                param_selection_mode=True,
                                **training_parameters)

        if optimization_value == "score":
            result = results['test_score']
        else:
            result = results['dev_loss_history'][-1]

        return {'result': result, 'params': params}
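
For context, a hypothetical call to this objective; the params keys and the selector instance are illustrative, and only keys listed in TRAINING_PARAMETERS / MODEL_TRAINER_PARAMETERS are forwarded to trainer.train() / ModelTrainer respectively.

params = {
    'learning_rate': 0.05,   # forwarded to trainer.train()
    'mini_batch_size': 16,   # forwarded to trainer.train()
    'hidden_size': 128,      # consumed by _set_up_model()
}
outcome = selector._train(corpus, params, Path('resources/param_selection'),
                          max_epochs=5, optimization_value='score')
print(outcome['result'])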
Example #2
from flair.data import Corpus, Sentence


def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence('sentence 1', labels=['class_1'])
    sentence_2 = Sentence('sentence 2', labels=['class_2'])
    sentence_3 = Sentence('sentence 3', labels=['class_1'])
    corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])
    label_dict = corpus.make_label_dictionary()
    assert len(label_dict) == 2
    assert '<unk>' not in label_dict.get_items()
    assert 'class_1' in label_dict.get_items()
    assert 'class_2' in label_dict.get_items()
Example #3
import os

import pandas as pd
from flair.data import Corpus
from flair.datasets import SentenceDataset
from flair.models.text_classification_model import TARSClassifier
from flair.trainers import ModelTrainer


def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Performs the training of the zero shot learning model

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English
    print("Zero shot")
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models",
                     "tars-base.pt"))

    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    # 3. make a corpus and register the new task's labels
    corpus = Corpus(train=train, test=val)

    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )

    print("DONE TRAINING")
    # reload the final model written to base_path above
    tars = TARSClassifier.load('../../data/zero_shot/final-model.pt')

    val_tweets["pred"] = val_tweets.apply(predict_few_shot,
                                          args=(tars, ),
                                          axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1
                                                  if x == "positive" else -1)

    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)

    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
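
Note that create_sentences and predict_few_shot are project helpers not shown here; a minimal sketch of what create_sentences presumably does, assuming the tweet text lives in a 'tweet' column (the column name and label-type name are assumptions):

def create_sentences(row):
    # Hypothetical helper: wrap the tweet text in a flair Sentence and attach
    # its gold label; the 'tweet' column and label-type name are assumptions.
    sentence = Sentence(row["tweet"])
    sentence.add_label("POSITIVE_NEGATIVE", row["output"])
    return sentence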
Example #4
File: flair.py  Project: jbuccina/keter
    def fit(self, corpus: Corpus, model_path: str):
        self.model = TARSClassifier(
            task_name="ChemicalUnderstanding",
            label_dictionary=corpus.make_label_dictionary(),
        )

        trainer = ModelTrainer(self.model, corpus)

        trainer.train(
            base_path=model_path,
            learning_rate=0.02,
            mini_batch_size=16,
            mini_batch_chunk_size=4,
            max_epochs=10,
        )
Example #5
    def train_classifier_model(self,
                               corpus: Corpus,
                               document_embeddings: DocumentRNNEmbeddings,
                               model_params: dict = None):
        try:

            label_dict = corpus.make_label_dictionary()
            # create the text classifier
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict)
            # initialize the text classifier trainer
            trainer = ModelTrainer(classifier, corpus)

            if model_params is None:
                learning_rate = gv.learning_rate
                mini_batch_size = gv.mini_batch_size
                anneal_factor = gv.anneal_factor
                patience = gv.patience
                max_epochs = gv.max_epochs
            else:
                learning_rate = model_params["learning_rate"]
                mini_batch_size = model_params["mini_batch_size"]
                anneal_factor = model_params["anneal_factor"]
                patience = model_params["patience"]
                max_epochs = model_params["max_epochs"]

            # start the training
            self.select_training_device()

            trainer.train(self.model_filepath,
                          learning_rate=learning_rate,
                          mini_batch_size=mini_batch_size,
                          anneal_factor=anneal_factor,
                          patience=patience,
                          max_epochs=max_epochs)
        except Exception as e:
            gv.logger.error(e)
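
For reference, a model_params dict matching the keys read in the else branch (values are illustrative):

model_params = {
    "learning_rate": 0.1,    # initial SGD learning rate
    "mini_batch_size": 32,
    "anneal_factor": 0.5,    # factor applied to the LR on plateau
    "patience": 3,           # epochs without improvement before annealing
    "max_epochs": 20,
}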
Example #6
        # k-shot sampling (the snippet is truncated above): keep the sentence
        # while this tag still has budget left; the `if` head is reconstructed
        # from the decrement below.
        if tag_countdown[
                tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] > 0:
            corpus_sents.append(sent)
            tag_countdown[
                tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] -= 1
            sent_picked = True

print("sents for training: " + str(len(corpus_sents)))
print("amount of items in dict: " + str(len(tag_dictionary.item2idx)))

training_dataset = SentenceDataset(corpus_sents)
training_corpus = Corpus(train=training_dataset,
                         dev=corpus_small.dev,
                         test=corpus_small.test,
                         sample_missing_splits=False)
trainer = ModelTrainer(tagger, training_corpus, optimizer=torch.optim.AdamW)
tag_dictionary = training_corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("fewshot-moviecomplex-simple-to-conll3",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)
trainer.train(
    base_path='resources/v3/fewshot-moviecomplex-simple-to-conll3-k' + str(k),
    learning_rate=5.0e-5,
    mini_batch_size=32,
    mini_batch_chunk_size=None,
    max_epochs=10,
    weight_decay=0.,
    embeddings_storage_mode="none",
    scheduler=OneCycleLR,
)

# evaluation
Example #7
train = SentenceDataset([  # opening reconstructed; earlier lines of this snippet are truncated
        Sentence('We had a 1 hour call with Denise').add_label('contact_type', 'call'),
        Sentence('Had a quick call to discuss the offer').add_label('contact_type', 'call'),
        Sentence('I was on skype with Paul all day').add_label('contact_type', 'call'),
        Sentence('I have set up a meeting tomorrow').add_label('contact_type', 'meeting'),
        Sentence('I emailed the latest report').add_label('contact_type', 'email'),
        Sentence('I sent an email to Mark').add_label('contact_type', 'email'),
        Sentence('I will email those files to you').add_label('contact_type', 'email'),
        Sentence('phoned Jeremy').add_label('contact_type', 'call'),
        Sentence('emailed Jeremy').add_label('contact_type', 'email'),
        Sentence('called Jeremy').add_label('contact_type', 'call'),
        Sentence('meet with Jeremy').add_label('contact_type', 'meeting'),
        Sentence('meeting w Deborah').add_label('contact_type', 'meeting'),
        Sentence('the client scheduled a meeting at their office').add_label('contact_type', 'meeting'),
        Sentence('had a meeting with client and agreed to continue over email').add_label('contact_type', 'meeting'),
        Sentence('had a call with client and to arrange a meeting tomorrow').add_label('contact_type', 'call'),
    ])

corpus = Corpus(train=train, test=test)

tars = TARSClassifier.load('tars-base')

tars.add_and_switch_to_new_task("contact_type", label_dictionary=corpus.make_label_dictionary())

trainer = ModelTrainer(tars, corpus)

trainer.train(base_path='../pretrained/contact_type_model/',
              learning_rate=0.02,
              mini_batch_size=1,
              max_epochs=15,
              train_with_dev=True,
              )
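
Once training finishes, the saved model can classify unseen notes; a minimal usage sketch, assuming the default save settings write final-model.pt under base_path:

# Load the trained model back and classify a new note (output is illustrative).
tars = TARSClassifier.load('../pretrained/contact_type_model/final-model.pt')
sentence = Sentence('pinged Sarah for a quick chat on zoom')
tars.predict(sentence)
print(sentence.labels)  # e.g. [call (0.9)]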
Example #8
class FlairTARS(Classifier):
    '''Flair TARS few-shot training.
    It makes use of the meaningful category labels found in classes.txt.
    Slow: the base pretrained model (tars-base) is heavier than we need.
    Embeddings: 30522 x 768, 24 layers, 12 heads, 110M params.
    Flair works on top of PyTorch.
    In principle this classifier should perform very well with ~3 samples
    per class (see the TARS paper).
    '''

    pretrained_model_name = 'tars-base'

    def prepare_resources(self):
        # turn off INFO and DEBUG logging
        import flair  # KEEP THIS IMPORT HERE! (it initialises 'flair' logger)
        import logging
        logger = logging.getLogger('flair')
        logger.setLevel(logging.WARNING)
        if self.seed:
            flair.set_seed(self.seed)

    def train(self):
        from flair.data import Corpus
        from flair.datasets import SentenceDataset
        from flair.data import Sentence

        self.classes = utils.read_class_titles(settings.CAT_DEPTH)
        self.classes['NOCAT'] = 'NOCAT'

        train = SentenceDataset([
            Sentence(row['titlen']).add_label('law_topic',
                                              self.classes[row['cat1']])
            for i, row in self.df_train.iterrows()
        ])

        # make a corpus; the dev split simply reuses the training data
        self.corpus = Corpus(train=train, dev=train)

        # 1. load base TARS
        tars = self._load_pretrained_model()

        # 2. make the model aware of the desired set of labels from the new corpus
        tars.add_and_switch_to_new_task(
            "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

        # 3. initialize the text classifier trainer with your corpus
        from flair.trainers import ModelTrainer
        trainer = ModelTrainer(tars, self.corpus)

        # 4. train model
        path = settings.WORKING_DIR
        trainer.train(
            base_path=path,  # path to store the model artifacts
            learning_rate=5e-2,  # 5 epochs with 0.2 was bad; 0.1 looks ok
            mini_batch_size=settings.MINIBATCH,
            # mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
            max_epochs=settings.EPOCHS,
            train_with_dev=False,
            save_final_model=False,
            param_selection_mode=True,  # True to avoid model saves
            shuffle=False,  # data is already shuffled upstream
        )

        # from flair.models.text_classification_model import TARSClassifier
        # self.model = TARSClassifier.load(
        #     os.path.join(path, 'best-model.pt')
        # )

        self.model = tars

    def predict(self, string):
        from flair.data import Sentence

        # prepare the input sentence
        sentence = Sentence(string)

        ret = ['NOCAT', 1.0]

        # predict with the fine-tuned model
        self.model.predict(sentence)

        if len(sentence.labels):
            label = sentence.labels[0]
            ret = [self.classes.get_key_from_val(label.value), label.score]

        return str(ret[0]), ret[1]

    def _predict_zero(self, string):
        '''Abandoned; 0-shot predictions were too poor.'''
        from flair.models.text_classification_model import TARSClassifier
        from flair.data import Sentence

        # 2. Prepare a test sentence
        sentence = Sentence(string)

        # 3. Define some classes that you want to predict using descriptive names
        ret = [len(self.classes) - 1, 1.0]

        # 4. Predict for these classes
        self._get_tars().predict_zero_shot(sentence, self.classes)

        if len(sentence.labels):
            label = sentence.labels[0]
            ret = [self.classes.get_key_from_val(label.value), label.score]

        return str(ret[0]), ret[1]

    def get_internal_dimension(self):
        # return self.model.document_embeddings.embedding_length
        return None

    def _load_pretrained_model(self):
        from flair.models.text_classification_model import TARSClassifier

        # 1. Load our pre-trained TARS model for English.
        # Note that it must be reloaded before each training run, as it is
        # modified during training.
        return TARSClassifier.load(self.get_pretrained_model_name())
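
A hypothetical end-to-end use of this class; the constructor and the df_train attribute come from the unseen Classifier base, so the driver below is a sketch:

clf = FlairTARS()         # hypothetical: base-class constructor not shown
clf.prepare_resources()   # silences flair logging and sets the seed
clf.train()               # few-shot fine-tuning on self.df_train
label, score = clf.predict('an act to amend the education code')
print(label, score)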
Example #9
    Sentence('How much do data engineers make').add_label(label_name, money),
    Sentence('Canadian tech salaries').add_label(label_name, money),
    Sentence('Vulnerability found in a popular crypto wallet').add_label(
        label_name, crypto),
    Sentence('Ethereum smart contracts are useless').add_label(
        label_name, crypto),
])

# make a corpus with train and test split
corpus = Corpus(train=train, test=test)

from flair.trainers import ModelTrainer

# 2. make the model aware of the desired set of labels from the new corpus
tars.add_and_switch_to_new_task(
    "HN_MONEYTALK", label_dictionary=corpus.make_label_dictionary())

# 3. initialize the text classifier trainer with your corpus
trainer = ModelTrainer(tars, corpus)

# 4. train model
trainer.train(
    base_path='resources/taggers/hn_moneytalk',  # path to store the model artifacts
    learning_rate=0.02,  # use very small learning rate
    mini_batch_size=1,  # small mini-batch size since corpus is tiny
    max_epochs=10,  # terminate after 10 epochs
    train_with_dev=True,
)

# 1. Load few-shot TARS model
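
The example is cut off here; the continuation presumably reloads the trained model and predicts, roughly as follows (a sketch, assuming the default final-model.pt under base_path):

tars = TARSClassifier.load('resources/taggers/hn_moneytalk/final-model.pt')
sentence = Sentence('Ask HN: what does a staff engineer earn?')
tars.predict(sentence)
print(sentence.labels)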