Example #1
from hyperopt import hp
from flair.embeddings import (CharacterEmbeddings, ELMoEmbeddings, FlairEmbeddings,
                              StackedEmbeddings, WordEmbeddings)
from flair.hyperparameter.param_selection import (OptimizationValue, Parameter,
                                                  SearchSpace, SequenceTaggerParamSelector)


def hyper_opt(corpus):
    print("hyper_opt started")
    # define your search space
    search_space = SearchSpace()

    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([
                             WordEmbeddings('en'),
                             WordEmbeddings('glove'),
                             CharacterEmbeddings(),
                             FlairEmbeddings('news-forward'),
                             FlairEmbeddings('news-backward'),
                             ELMoEmbeddings()
                         ])
                     ])

    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    #search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    #search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.1])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[32, 64])

    # create the parameter selector
    param_selector = SequenceTaggerParamSelector(
        corpus,
        'ner',
        #'/content/gdrive/My Drive/resume_ner_data/hyperparam_selection',
        model_path,  # assumed to be defined elsewhere in the original script
        max_epochs=50,
        training_runs=2,
        optimization_value=OptimizationValue.DEV_SCORE)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
Example #2
from typing import List

from flair.embeddings import (CharLMEmbeddings, StackedEmbeddings, TokenEmbeddings,
                              WordEmbeddings)

tag_type = 'pos'

# corpus is assumed to be loaded earlier in the original script
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    # CharLMEmbeddings is the legacy name for FlairEmbeddings in early Flair releases
    CharLMEmbeddings('german-forward'),
    CharLMEmbeddings('german-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 24, 32])

from pathlib import Path
from flair.hyperparameter.param_selection import SequenceTaggerParamSelector, OptimizationValue

param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type,
    Path('resources/results'),
    max_epochs=150,
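The call is cut off here. A minimal hedged completion, assuming the remaining arguments and the optimization run follow the Flair hyperparameter tutorial this example mirrors:

    training_runs=3,                                 # assumed; cut off in the original
    optimization_value=OptimizationValue.DEV_SCORE)

# start the optimization (assumed continuation)
param_selector.optimize(search_space, max_evals=100)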
Example #3
from hyperopt import hp
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import (BertEmbeddings, CharacterEmbeddings, ELMoEmbeddings,
                              FlairEmbeddings, StackedEmbeddings)
from flair.hyperparameter.param_selection import Parameter, SearchSpace

# Set up the Corpus
columns = {0: 'text', 1: 'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder,
                              columns,
                              train_file="train.txt",
                              dev_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[
                     StackedEmbeddings([
                         ELMoEmbeddings('original'),
                         FlairEmbeddings('news-forward'),
                         FlairEmbeddings('news-backward'),
                         BertEmbeddings('bert-large-cased')
                     ]),
                     StackedEmbeddings([
                         ELMoEmbeddings('original'),
                         FlairEmbeddings('news-forward'),
                         FlairEmbeddings('news-backward'),
                         BertEmbeddings('bert-large-cased'),
                         CharacterEmbeddings()
                     ])
                 ])
Example #4
if __name__ == "__main__":
    data_folder = Path("..", "classification", "data", "downsampled", "flair")
    for c in ["dramen", "romane", "zeitung", "wikipedia"]:
        test_file = f"{c}-downsampled-test-flair.txt"
        dev_file = f"{c}-downsampled-val-flair.txt"
        train_file = f"{c}-downsampled-train-flair.txt"

        corpus = ClassificationCorpus(data_folder,
                                      test_file=test_file,
                                      dev_file=dev_file,
                                      train_file=train_file)

        label_dict = corpus.make_label_dictionary()

        search_space = SearchSpace()
        search_space.add(
            Parameter.EMBEDDINGS,
            hp.choice,
            options=[[BertEmbeddings("bert-base-german-cased")]],
        )
        search_space.add(Parameter.HIDDEN_SIZE,
                         hp.choice,
                         options=[32, 64, 128])
        search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
        search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
        search_space.add(Parameter.LEARNING_RATE,
                         hp.choice,
                         options=[0.05, 0.1, 0.15, 0.2])
        search_space.add(Parameter.MINI_BATCH_SIZE,
                         hp.choice,
                         options=[8, 16, 32])  # assumed options; the original snippet is cut off here

Example #5

# Set up the Corpus
columns = {0: 'text', 1: 'ner'}

data_folder = './data/'

corpus: Corpus = ColumnCorpus(data_folder,
                              columns,
                              train_file="train.txt",
                              dev_file="dev.txt",
                              test_file="test.txt")
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[
                     StackedEmbeddings([
                         FlairEmbeddings('hi-forward'),
                         FlairEmbeddings('hi-backward')
                     ]),
                     StackedEmbeddings([
                         WordEmbeddings('hi'),
                         FlairEmbeddings('hi-forward'),
                         FlairEmbeddings('hi-backward')
                     ]),
                 ])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2, 4])
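The example is cut off after the RNN_LAYERS line. Judging by the neighbouring examples, the remaining steps would plausibly fill in the rest of the search space and run the selector; a hedged sketch, not the original code:

# assumed continuation, mirroring the neighbouring examples
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])

param_selector = SequenceTaggerParamSelector(corpus, 'ner', 'resources/results',
                                             max_epochs=50, training_runs=3,
                                             optimization_value=OptimizationValue.DEV_SCORE)
param_selector.optimize(search_space, max_evals=100)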
Example #6
from hyperopt import hp
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import CharacterEmbeddings, ELMoEmbeddings, StackedEmbeddings
from flair.hyperparameter.param_selection import (OptimizationValue, Parameter,
                                                  SearchSpace, SequenceTaggerParamSelector)
from flair.models import SequenceTagger

# Set up the Corpus
columns = {0: 'text', 1: 'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder, columns, train_file="train.txt", dev_file="dev.txt", test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ELMoEmbeddings('original')]),
    StackedEmbeddings([ELMoEmbeddings('original'), CharacterEmbeddings()])
])
# hp.randint samples an integer from [0, 400), so very small hidden sizes are possible
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])


# initialise embeddings
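The snippet is cut off after this stray comment. With the search space already defined, the remaining step in the other examples is the selector creation and the optimization run; a hedged sketch (all values below are assumptions):

# assumed continuation, following the pattern of the other examples
param_selector = SequenceTaggerParamSelector(corpus, tag_type, 'resources/results',
                                             max_epochs=50, training_runs=3,
                                             optimization_value=OptimizationValue.DEV_SCORE)
param_selector.optimize(search_space, max_evals=100)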
Example #7
]  # the snippet opens mid-definition; this closes a list from earlier in the script (likely bad_skills, used below)

listings_raw = mongo_extract()
good_descs, all_skills = find_job_listings(listings_raw)
unique_skills = find_unique(all_skills, bad_skills)
all_sentences = create_sentences(good_descs)
matcher, matcher_lm = create_matchers(unique_skills)
raw_descs, all_annos = match_sentences(all_sentences, matcher, matcher_lm)

print(len(all_annos))

train_data = all_annos[:4500]
test_data = all_annos[4500:6200]
dev_data = all_annos[6200:]

search_space = SearchSpace()

# Create our embedding stacks
# Flair recommends adding GloVe to their character-level embeddings

flair_normal = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('mix-forward'),
    FlairEmbeddings('mix-backward')
])

bert = BertEmbeddings()
elmo = ELMoEmbeddings('original')
flair_pooled = StackedEmbeddings([
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('mix-forward'),
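The pooled stack is cut off mid-list. A minimal completion, assuming it mirrors flair_normal above:

    PooledFlairEmbeddings('mix-backward')  # assumed; the original is cut off here
])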
Example #8
def optimize(directory):
    """Hyperparameter optimization.
    """
    # 1. Load corpus:
    train = Path(directory, "train.tsv")
    dev = Path(directory, "dev.tsv")
    test = Path(directory, "test.tsv")
    data = corpus.load(train, dev, test)

    # 2. Define search space:
    space = SearchSpace()

    # 3. Collect embeddings:
    fasttext = list(trainer.utils.collect_features(["fasttext"]))
    bert = list(trainer.utils.collect_features(["bert"]))
    #flair = list(trainer.utils.collect_features(["flair-forward", "flair-backward"]))

    # 4. Add to search space:
    space.add(Parameter.EMBEDDINGS, hp.choice, options=[fasttext, bert])

    # 5. Add other parameter search spaces:
    space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
    space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    space.add(Parameter.LEARNING_RATE,
              hp.choice,
              options=[0.05, 0.1, 0.15, 0.2])
    space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

    # 6. Create parameter selector:
    selector = SequenceTaggerParamSelector(corpus=data,
                                           tag_type="ner",
                                           base_path=Path(
                                               "figur-recognition",
                                               "optimization"),
                                           max_epochs=3,
                                           training_runs=3)

    # 7. Start the optimization:
    selector.optimize(space, max_evals=100)
Example #9
def main():
    my_parser = argparse.ArgumentParser(add_help=True)
    my_parser.add_argument('-ft',
                           '--finetuning',
                           action='store',
                           type=int,
                           required=False)

    args = my_parser.parse_args()

    # df = import_data()
    # print(df)

    data_folder = './adato/data/'
    column_name_map = {0: 'label_topic', 2: 'text'}

    corpus: Corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        train_file='cleaned_train.csv',
        test_file='cleaned_test.csv',
        skip_header=True,
        delimiter=',',
    )

    print(corpus)
    print(corpus.train[0])

    word_embeddings = [WordEmbeddings('glove')]
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
    )

    if args.finetuning:
        search_space = SearchSpace()
        search_space.add(Parameter.EMBEDDINGS,
                         hp.choice,
                         options=[
                             [WordEmbeddings('en')],
                             [
                                 FlairEmbeddings('news-forward'),
                                 FlairEmbeddings('news-backward'),
                             ],
                             # note: the selector builds its own document embeddings from
                             # token embeddings, so passing DocumentRNNEmbeddings here is questionable
                             [document_embeddings],
                         ])
        search_space.add(Parameter.HIDDEN_SIZE,
                         hp.choice,
                         options=[32, 64, 128])
        search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
        search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
        search_space.add(Parameter.LEARNING_RATE,
                         hp.choice,
                         options=[0.05, 0.1, 0.15, 0.2])
        search_space.add(Parameter.MINI_BATCH_SIZE,
                         hp.choice,
                         options=[8, 16, 32])

        param_selector = TextClassifierParamSelector(
            corpus,
            False,                                 # multi_label
            'adato/model/classifiers/hyperopt/',   # base_path
            'lstm',                                # document_embedding_type
            max_epochs=40,
            training_runs=3,
            optimization_value=OptimizationValue.DEV_SCORE)

        param_selector.optimize(search_space, max_evals=2)

    else:
        label_dict = corpus.make_label_dictionary()

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=label_dict,
        )

        trainer = ModelTrainer(classifier, corpus)

        trainer.train('adato/model/classifiers/flair/',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      anneal_factor=0.5,
                      patience=5,
                      max_epochs=40)

    call(["python", "app/app.py"])
Example #10
# the snippet opens inside a commented-out embeddings list:
#                    FlairEmbeddings('spanish-forward-fast'),
#                    FlairEmbeddings('spanish-backward-fast')]

# word_embeddings = [WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec'),
#                    WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec'),
#                    ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/')]

# word_embeddings = [FlairEmbeddings('spanish-forward-fast'), FlairEmbeddings('spanish-backward-fast')]


# document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
# classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)
# trainer = ModelTrainer(classifier, corpus)
# trainer.train('./', max_epochs=10)

search_space = SearchSpace()
# search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[word_embeddings])
# search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[8, 16, 32, 64, 128])
# search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
# search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
# search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.025, 0.05, 0.1])
# search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])


# search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
#     [WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec')],
#     [WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec')],
#     [ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/')],
#     [BytePairEmbeddings('es')],
# ])
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[[WordEmbeddings('../../../../Data/Models/Chars/lemma_lowercased_estenten11_freeling_v4_virt.gensim.vec')]])
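The example stops after a single embeddings choice. A hedged sketch of how such a search space is typically consumed in the other text-classification examples (the selector arguments below are assumptions, not the original code):

# assumed continuation; corpus is assumed to be loaded earlier in the original script
param_selector = TextClassifierParamSelector(corpus, False, './hyperopt_results', 'lstm',
                                             max_epochs=10, training_runs=1,
                                             optimization_value=OptimizationValue.DEV_SCORE)
param_selector.optimize(search_space, max_evals=10)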
Example #11
    test_file="de-da-te-ta.10E-4percent.conll.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")
corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")
corpus = MultiCorpus([corpus1, corpus2])

custom_embedding = WordEmbeddings(
    '../../glove/GloVe/vectors_converted_to_gensim.gensim')
#bert_embedding = BertEmbeddings('bert-embedding-files/')
word_embeddings = StackedEmbeddings([custom_embedding, WordEmbeddings('tr')])

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[word_embeddings])
#search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[2])
#search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE,
                 hp.choice,
                 options=[0.05, 0.1, 0.15, 0.2, 0.25])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16])

param_selector = SequenceTaggerParamSelector(
    corpus=corpus,
    tag_type='ner',
    base_path='./results_tr_glove_embedding_learning_rate',
    max_epochs=10,
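The selector call is cut off. A minimal hedged completion consistent with the other examples (the remaining arguments and the optimize call are assumptions):

    training_runs=1,                                 # assumed; the original is cut off here
    optimization_value=OptimizationValue.DEV_SCORE)

param_selector.optimize(search_space, max_evals=100)  # assumed budget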
Example #12
if __name__ == "__main__":
    """
    python param_search.py config_file
    """
    with open(sys.argv[1], 'rb') as f:
        config = json.load(f)

    # get the corpus
    column_name_map = {0: config["label_name"], 1: "text"}
    corpus: Corpus = CSVClassificationCorpus(config["data_folder"],
                                             column_name_map,
                                             skip_header=True,
                                             delimiter='\t',  # tab-separated files
                                             )
    # utils is a project-local helper module providing pre-built embedding stacks
    word_embeddings = [utils.get_general_embeddings(), utils.get_mixed_bio_embeddings(), utils.get_bio_embeddings()]
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=word_embeddings)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[128, 256])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.BIDIRECTIONAL, hp.choice, options=[False, True])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
    param_selector = TextClassifierParamSelector(
        corpus=corpus,
        multi_label=False,
        base_path='resources/results'+config['task'],
        document_embedding_type='lstm',
        max_epochs=10,
        training_runs=1,
        optimization_value=OptimizationValue.DEV_SCORE
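The snippet ends inside the selector call. A minimal hedged completion:

    )  # assumed end of the argument list; the original is cut off here

    # start the optimization (assumed continuation)
    param_selector.optimize(search_space, max_evals=100)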
Example #13
    columns = {0: 'text', 1: 'ner'}
    # this is the folder in which train, test and dev files reside
    data_folder = args.ner_folder
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus = ColumnCorpus(data_folder, columns, train_file='train.txt', test_file='test.txt', dev_file='dev.txt')
    print(corpus)
    # 2. what tag do we want to predict?
    tag_type = 'ner'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Language models
    lm_fwd = args.embedding + "_fwd/best-lm.pt"
    lm_bwd = args.embedding + "_bwd/best-lm.pt"

    # define your search space
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[StackedEmbeddings([FlairEmbeddings(lm_fwd), FlairEmbeddings(lm_bwd)])])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[512, 1024])
    search_space.add(Parameter.ANNEAL_FACTOR, hp.choice, options=[0.5, 0.75])
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.1, 0.5, 1.0])
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5, 7])
    search_space.add(Parameter.DROPOUT, hp.choice, options=[0.15])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8])

    # create the parameter selector
    param_selector = SequenceTaggerParamSelector(corpus, 'ner', args.dst, max_epochs=42, training_runs=1, optimization_value=OptimizationValue.DEV_SCORE)

    # start the optimization
    param_selector.optimize(search_space, max_evals=40)
Example #14
def optimize():
    corpus, label_dictionary = load_corpus()
    corpus.downsample(0.01)
    # define your search space
    search_space = SearchSpace()
    #embeddigns[  RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-base", layers="0,1,2,3,4,5,6,7,8,9,10,11,12",
    #pooling_operation="first", use_scalar_mix=True) ]
    embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
    search_space.add(Parameter.HIDDEN_SIZE,
                     hp.choice,
                     options=[32, 64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE,
                     hp.choice,
                     options=[0.05, 0.1, 0.15, 0.2])
    search_space.add(Parameter.MINI_BATCH_SIZE,
                     hp.choice,
                     options=[16, 32, 64])

    # create the parameter selector
    param_selector = TextClassifierParamSelector(
        corpus,
        False,                  # multi_label
        'resources/results',    # base_path
        'lstm',                 # document_embedding_type
        max_epochs=10,
        training_runs=3,
        optimization_value=OptimizationValue.DEV_SCORE,
        label_dictionary=label_dictionary)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)