Example #1
def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
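    # build a MultiCorpus from a column-format fashion corpus plus GermEval NER data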
    corpus_1 = flair.datasets.ColumnCorpus(data_folder=(tasks_base_path /
                                                        'fashion'),
                                           column_format={
                                               0: 'text',
                                               2: 'ner',
                                           })
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('turian')
    model = SequenceTagger(hidden_size=64,
                           embeddings=embeddings,
                           tag_dictionary=tag_dictionary,
                           tag_type='ner',
                           use_crf=False)
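    # first run: train two epochs with checkpointing enabled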
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
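    # resume: restore trainer state from the checkpoint and train again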
    checkpoint = SequenceTagger.load_checkpoint(
        results_base_path / 'checkpoint.pt')
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
    shutil.rmtree(results_base_path)
Example #2
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
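    # combine the fashion and GermEval corpora into a single MultiCorpus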
    corpus_1 = flair.datasets.ColumnCorpus(data_folder=(tasks_base_path /
                                                        'fashion'),
                                           column_format={
                                               0: 'text',
                                               2: 'ner',
                                           })
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
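    # one tag dictionary covering the labels from both corpora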
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('turian')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False)
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  shuffle=False)
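    # reload the saved model and predict, including the empty-sentence edge case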
    loaded_model = SequenceTagger.load(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    shutil.rmtree(results_base_path)
Example #3
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #4
def run_evaluator(model_name: str, dataset_names: str):
    corpus_list = []

    # Dataset-related
    for dataset in dataset_names.split(","):
        dataset_name, language = dataset.split("/")

        preproc_fn = None

        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          preproc_fn=preproc_fn,
                          add_document_separator=True))

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

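    # load the fine-tuned tagger and evaluate it on the combined dev split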
    model = SequenceTagger.load(model_name)

    dev_result = model.evaluate(corpora.dev,
                                gold_label_type="ner",
                                mini_batch_size=8)

    print(dev_result)
Example #5
def multi_corpus(self,
                 output: Union[str, Path],
                 first_corpus: str = "germeval"):
    data_dir = Path(Path(self.directory).parent, first_corpus)
    first = self._load_corpus(data_dir)
    second = self._load_corpus()
    corpus = MultiCorpus([first, second])
    tagger = self._train(output, corpus)
Example #6
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(data_folder=tasks_base_path /
                                           "fashion",
                                           column_format={
                                               0: "text",
                                               3: "ner"
                                           })
    corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(
        base_path=tasks_base_path).downsample(0.1)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_label_dictionary("ner")

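    # turian_embeddings is a shared fixture defined elsewhere in the test module
    # (presumably WordEmbeddings('turian'))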
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        allow_unk_predictions=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    del trainer, tagger, corpus
    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    del loaded_model
Example #7
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(data_folder=tasks_base_path /
                                           "fashion",
                                           column_format={
                                               0: "text",
                                               2: "ner"
                                           })
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

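    # load the final model written by the trainer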
    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #8
def test_multi_corpus(tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(tasks_base_path / "germeval_14", column_format={0: "text", 2: "ner"})

    corpus_2 = flair.datasets.ColumnCorpus(tasks_base_path / "fashion", column_format={0: "text", 2: "ner"})
    # get two corpora as one
    corpus = MultiCorpus([corpus_1, corpus_2])

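    # split sizes are the sums of the two corpora's splits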
    assert len(corpus.train) == 8
    assert len(corpus.dev) == 2
    assert len(corpus.test) == 2
Example #9
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    word_embedding = json_config["word_embedding"]
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")
        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          add_document_separator=True))

    print("Use CRF:", use_crf)

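    # sample_missing_splits=False: do not carve dev/test splits out of train
    # for corpora that lack them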
    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # FastText Embeddings
    embeddings = FastTextEmbeddings(embeddings=word_embedding)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    trainer.train(
        f"hipe2022-flert-we-trainer-{datasets}-{word_embedding}-bs{batch_size}-wsFalse-e{epoch}-lr{learning_rate}-crf{use_crf}-{seed}",
        mini_batch_size=batch_size,
        mini_batch_chunk_size=2,
        patience=3,
        max_epochs=epoch,
        shuffle=True,
        learning_rate=learning_rate,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #10
def test_train_resume_tagger(results_base_path, tasks_base_path):

    corpus_1 = flair.datasets.ColumnCorpus(data_folder=tasks_base_path /
                                           "fashion",
                                           column_format={
                                               0: "text",
                                               3: "ner"
                                           })
    corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(
        base_path=tasks_base_path).downsample(0.1)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_label_dictionary("ner")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # train model for 2 epochs
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    del model

    # load the checkpoint model and train until epoch 4
    checkpoint_model = SequenceTagger.load(results_base_path / "checkpoint.pt")
    trainer.resume(model=checkpoint_model, max_epochs=4)

    # clean up
    del trainer
Example #11
# 1. combine the individual corpora into one corpus
# (english_corpus, german_corpus, dutch_corpus and max_epochs are defined
#  earlier in the original script)
corpus = MultiCorpus([english_corpus, german_corpus, dutch_corpus])

# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('extvec'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

result = trainer.train('resources/taggers/example-pos',
                       train_with_dev=True,
                       embeddings_storage_mode='cpu',
                       mini_batch_size=256,
                       max_epochs=max_epochs)
Example #12
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")

        current_corpus = NER_HIPE_2022(dataset_name=dataset_name,
                                       language=language,
                                       add_document_separator=True)

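        # attach knowledge-base contexts (one JSON line per sentence) to the
        # train and dev splits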
        for split in ["train", "dev"]:
            kb_data = []

            print(f"Loading KB contexts for {dataset}...")

            with open(f"kb_data/ajmc/{language}/{language}_{split}.jsonl",
                      "rt") as f_p:
                for line in f_p:
                    kb_data.append(line)

            corpus_split = current_corpus.train if split == "train" else current_corpus.dev

            for index, sent in enumerate(corpus_split):
                jsonl = json.loads(kb_data[index])

                kb_context = " ".join(jsonl["contexts"]).split(" ")

                sent.kb_context = kb_context

        corpus_list.append(current_corpus)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # Embeddings
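    # KBTransformerEmbeddings is presumably a project-specific class (not core
    # flair) that consumes the sent.kb_context attribute attached above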
    embeddings = KBTransformerEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-kb-{datasets}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #13
# 2. initialize embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
embedding_types = [
    WordEmbeddings("pubmed"),
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. Initialize corpus
# We also train on the test portions of the corpora, because we evaluate on held-out corpora
from flair.data import MultiCorpus
from torch.utils.data import ConcatDataset
corpus = MultiCorpus(GENE_CORPORA)
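# note: _train is a private attribute; mutating it works, but there is no
# public API for augmenting the train split like this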
corpus._train = ConcatDataset([corpus._train, corpus._test])

# 4. Initialize sequence tagger
from flair.models import SequenceTagger
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    use_crf=True,
    locked_dropout=0.5
)
Example #14
def load_corpora(tasks: List[Union[NLPTask, str]],
                 base_path: Path = None) -> MultiCorpus:
    return MultiCorpus([
        NLPTaskDataFetcher.load_corpus(task, base_path) for task in tasks
    ])
Example #15
    print('-' * 150, flush=True)
    print(" - Domain: ", domain, flush=True)
    print(" - Corpus Train Size: ", len(corpus.train), flush=True)
    print(" - Corpus Dev Size: ", len(corpus.dev), flush=True)
    print(" - Corpus Test Size: ", len(corpus.test), flush=True)
    print('-' * 150, flush=True)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('extvec'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

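# pool all per-domain POS corpora into a single MultiCorpus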
multi_all_corpus = MultiCorpus(list(ALL_POS_ALL_CORPUS.values()))
# 3. make the tag dictionary from the corpus
tag_dictionary = multi_all_corpus.make_tag_dictionary(tag_type=tag_type)
print('-' * 50,
      '\nTag_dictionary size: ',
      len(tag_dictionary),
      '\n',
      '-' * 50,
      flush=True)
# initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

for target_domain in ALL_POS_DOMAINS:
Example #16
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)
    additional_hipe_datasets = json_config.get("additional_hipe_datasets")
    label_name_map = json_config.get("label_name_map")

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")

        preproc_fn = None

        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          preproc_fn=preproc_fn,
                          add_document_separator=True))

    if additional_hipe_datasets and label_name_map:
        # Special case: do not use dev data from the additional datasets;
        # this makes evaluation and comparison much easier!

        for dataset in additional_hipe_datasets:
            dataset_name, language = dataset.split("/")

            preproc_fn = None

            if dataset_name == "hipe2020":
                print("Using own HIPE-2020 Preprocessing function.")
                print(
                    "Please make sure that Flair Datasets folder was cleaned before!"
                )
                preproc_fn = prepare_clef_2020_corpus

            additional_corpus = NER_HIPE_2022(dataset_name=dataset_name,
                                              label_name_map=label_name_map,
                                              language=language,
                                              add_document_separator=True,
                                              preproc_fn=preproc_fn)
            additional_corpus._dev = []
            corpus_list.append(additional_corpus)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # Embeddings
    embeddings = TransformerWordEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-{datasets}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #17
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    best_model = json_config["best_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")
        preproc_fn = None

        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          preproc_fn=preproc_fn,
                          add_document_separator=True))

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    print("Loading model from stage 1:", best_model)
    tagger: SequenceTagger = SequenceTagger.load(best_model)

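    # second stage: continue fine-tuning the stage-1 model on the combined corpora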
    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    best_model_name = best_model.replace("/", "_")

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-multistage-{datasets}-{best_model_name}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #18
import flair
from flair.data import Corpus, MultiCorpus, Sentence
#from flair.datasets import TREC_6
from flair.datasets import UD_ENGLISH, UD_GERMAN
from flair.embeddings import WordEmbeddings
from flair.models import SimpleSequenceTagger
from flair.trainers import ModelTrainer

# 1. get the corpora - English and German UD
corpus: MultiCorpus = MultiCorpus([UD_ENGLISH(), UD_GERMAN()]).downsample(0.1)

# 2. what tag do we want to predict?
tag_type = 'upos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings
glove_embedding = WordEmbeddings('glove')

# 5. initialize the sequence tagger
# (the argument names below are assumptions: SimpleSequenceTagger comes from a
#  custom module, so its signature is taken to mirror flair's SequenceTagger)
tagger = SimpleSequenceTagger(
    embeddings=glove_embedding,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
)
Example #19
columns = {0: 'text', 1: 'ner'}
data_folder = '../'

corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")
corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")
corpus = MultiCorpus([corpus1, corpus2])

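# stack the custom GloVe vectors with flair's pretrained Turkish embeddings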
custom_embedding = WordEmbeddings(
    '../../glove/GloVe/vectors_converted_to_gensim.gensim')
#bert_embedding = BertEmbeddings('bert-embedding-files/')
word_embeddings = StackedEmbeddings([custom_embedding, WordEmbeddings('tr')])

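# hyperparameter search space for flair's hyperopt-based parameter selection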
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[word_embeddings])
#search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[2])
#search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE,
                 hp.choice,
                 options=[0.05, 0.1, 0.15, 0.2, 0.25])
Example #20
from flair.data import TaggedCorpus, MultiCorpus
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
from flair.data import Dictionary
import flair, torch
flair.device = torch.device('cpu')

columns = {0: 'text', 1: 'ner'}
data_folder = '../'
corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")
corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")
corpus = MultiCorpus([corpus1, corpus2])
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
#tag_dictionary: Dictionary = Dictionary.load('../vocab/m.model')

glove_embedding = WordEmbeddings('../../glove/GLOVE/GloVe/vectors.gensim')
word2vec_embedding = WordEmbeddings('../../huawei_w2v/vector.gensim')

#bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('tr'),
    glove_embedding,
    word2vec_embedding,
]
#embedding_types: List[TokenEmbeddings] = [custom_embedding]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True,
                                        use_rnn=True,
                                        rnn_layers=2)

from flair.trainers import ModelTrainer