from hyperopt import hp
from flair.embeddings import (WordEmbeddings, CharacterEmbeddings,
                              FlairEmbeddings, ELMoEmbeddings, StackedEmbeddings)
from flair.hyperparameter.param_selection import (SearchSpace, Parameter,
                                                  SequenceTaggerParamSelector,
                                                  OptimizationValue)


def hyper_opt(corpus):
    print("hyper_opt started")

    # define your search space
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
        StackedEmbeddings([
            WordEmbeddings('en'),
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            ELMoEmbeddings()
        ])
    ])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    # search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    # search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.1])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[32, 64])

    # create the parameter selector (model_path must be defined at module level)
    param_selector = SequenceTaggerParamSelector(
        corpus,
        'ner',
        # '/content/gdrive/My Drive/resume_ner_data/hyperparam_selection',
        model_path,
        max_epochs=50,
        training_runs=2,
        optimization_value=OptimizationValue.DEV_SCORE)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
from typing import List
from flair.embeddings import (TokenEmbeddings, WordEmbeddings,
                              CharLMEmbeddings, StackedEmbeddings)

# assumes a corpus object was loaded earlier (e.g. via flair.datasets)
tag_type = 'pos'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# CharLMEmbeddings is the pre-0.4 Flair name for what later became FlairEmbeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharLMEmbeddings('german-forward'),
    CharLMEmbeddings('german-backward'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 24, 32])

from pathlib import Path
from flair.hyperparameter.param_selection import SequenceTaggerParamSelector, OptimizationValue

param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type,
    Path('resources/results'),
    max_epochs=150,
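    # cut off in the source; a plausible completion, mirroring the defaults in
    # Flair's model-tuning tutorial (training_runs and max_evals are assumptions):
    training_runs=3,
    optimization_value=OptimizationValue.DEV_SCORE)

param_selector.optimize(search_space, max_evals=100)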
# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/IOBES'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased')
    ]),
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased'),
        CharacterEmbeddings()
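        # the options list is cut off in the source; closing the second stack
        # and the list so the snippet parses (further combinations, if any,
        # are unknown)
    ])
])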
if __name__ == "__main__": data_folder = Path("..", "classification", "data", "downsampled", "flair") for c in ["dramen", "romane", "zeitung", "wikipedia"]: test_file = f"{c}-downsampled-test-flair.txt" dev_file = f"{c}-downsampled-val-flair.txt" train_file = f"{c}-downsampled-train-flair.txt" corpus = ClassificationCorpus(data_folder, test_file=test_file, dev_file=dev_file, train_file=train_file) label_dict = corpus.make_label_dictionary() search_space = SearchSpace() search_space.add( Parameter.EMBEDDINGS, hp.choice, options=[[BertEmbeddings("bert-base-german-cased")]], ) search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128]) search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2]) search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5) search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2]) search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice,
# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="dev.txt",
                              test_file="test.txt")
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        FlairEmbeddings('hi-forward'),
        FlairEmbeddings('hi-backward')
    ]),
    StackedEmbeddings([
        WordEmbeddings('hi'),
        FlairEmbeddings('hi-forward'),
        FlairEmbeddings('hi-backward')
    ]),
])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2, 4])
from flair.hyperparameter.param_selection import OptimizationValue
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
# additional imports needed to run this snippet standalone:
from hyperopt import hp
from flair.data import Corpus
from flair.embeddings import ELMoEmbeddings, CharacterEmbeddings, StackedEmbeddings
from flair.hyperparameter.param_selection import SearchSpace, Parameter, SequenceTaggerParamSelector

# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/IOBES'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="dev.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        ELMoEmbeddings('original')
    ]),
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise embeddings
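# The snippet ends at the comment above. A minimal sketch of the usual next
# step, assuming tutorial-style values (the base path, epochs, runs and
# max_evals below are assumptions, not the original author's):
param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type,
    'resources/results',
    max_epochs=50,
    training_runs=3,
    optimization_value=OptimizationValue.DEV_SCORE)
param_selector.optimize(search_space, max_evals=100)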
]  # closes a list (likely bad_skills) truncated from this snippet

listings_raw = mongo_extract()
good_descs, all_skills = find_job_listings(listings_raw)
unique_skills = find_unique(all_skills, bad_skills)
all_sentences = create_sentences(good_descs)
matcher, matcher_lm = create_matchers(unique_skills)
raw_descs, all_annos = match_sentences(all_sentences, matcher, matcher_lm)
print(len(all_annos))

train_data = all_annos[:4500]
test_data = all_annos[4500:6200]
dev_data = all_annos[6200:]

search_space = SearchSpace()

# Create our embedding stacks
# Flair recommends adding GloVe to their character-level embeddings
flair_normal = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('mix-forward'),
    FlairEmbeddings('mix-backward')
])
bert = BertEmbeddings()
elmo = ELMoEmbeddings('original')
flair_pooled = StackedEmbeddings([
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('mix-forward'),
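    # cut off in the source; the pooled stack presumably mirrors flair_normal,
    # so (as an assumption) it would close with the backward model:
    PooledFlairEmbeddings('mix-backward')
])

# A hedged sketch of how these stacks would then enter the search space; the
# original snippet ends before this step, so the option set is an assumption:
search_space.add(Parameter.EMBEDDINGS, hp.choice,
                 options=[flair_normal, bert, elmo, flair_pooled])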
def optimize(directory):
    """Hyperparameter optimization."""
    # 1. Load corpus (corpus and trainer are project-local modules):
    train = Path(directory, "train.tsv")
    dev = Path(directory, "dev.tsv")
    test = Path(directory, "test.tsv")
    data = corpus.load(train, dev, test)

    # 2. Define search space:
    space = SearchSpace()

    # 3. Collect embeddings:
    fasttext = list(trainer.utils.collect_features(["fasttext"]))
    bert = list(trainer.utils.collect_features(["bert"]))
    # flair = list(trainer.utils.collect_features(["flair-forward", "flair-backward"]))

    # 4. Add to search space:
    space.add(Parameter.EMBEDDINGS, hp.choice, options=[fasttext, bert])

    # 5. Add other parameter search spaces:
    space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
    space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
    space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

    # 6. Create parameter selector:
    selector = SequenceTaggerParamSelector(corpus=data,
                                           tag_type="ner",
                                           base_path=Path("figur-recognition", "optimization"),
                                           max_epochs=3,
                                           training_runs=3)

    # 7. Start the optimization:
    selector.optimize(space, max_evals=100)
def main():
    my_parser = argparse.ArgumentParser(add_help=True)
    my_parser.add_argument('-ft', '--finetuning', action='store', type=int, required=False)
    args = my_parser.parse_args()

    # df = import_data()
    # print(df)

    data_folder = './adato/data/'
    column_name_map = {0: 'label_topic', 2: 'text'}
    corpus: Corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        train_file='cleaned_train.csv',
        test_file='cleaned_test.csv',
        skip_header=True,
        delimiter=',',
    )
    print(corpus)
    print(corpus.train[0])

    word_embeddings = [WordEmbeddings('glove')]
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
    )

    if args.finetuning:
        search_space = SearchSpace()
        search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
            [WordEmbeddings('en')],
            [
                FlairEmbeddings('news-forward'),
                FlairEmbeddings('news-backward'),
            ],
            [document_embeddings],
        ])
        search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
        search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
        search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
        search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
        search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

        param_selector = TextClassifierParamSelector(
            corpus,
            False,
            'adato/model/classifiers/hyperopt/',
            'lstm',
            max_epochs=40,
            training_runs=3,
            optimization_value=OptimizationValue.DEV_SCORE)
        param_selector.optimize(search_space, max_evals=2)
    else:
        label_dict = corpus.make_label_dictionary()
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=label_dict,
        )
        trainer = ModelTrainer(classifier, corpus)
        trainer.train('adato/model/classifiers/flair/',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      anneal_factor=0.5,
                      patience=5,
                      max_epochs=40)
        call(["python", "app/app.py"])
#                    FlairEmbeddings('spanish-forward-fast'),
#                    FlairEmbeddings('spanish-backward-fast')]
# word_embeddings = [WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec'),
#                    WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec'),
#                    ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/')]
# word_embeddings = [FlairEmbeddings('spanish-forward-fast'), FlairEmbeddings('spanish-backward-fast')]
# document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512,
#                                              reproject_words=True, reproject_words_dimension=256)
# classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(),
#                             multi_label=False)
# trainer = ModelTrainer(classifier, corpus)
# trainer.train('./', max_epochs=10)

search_space = SearchSpace()
# search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[word_embeddings])
# search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[8, 16, 32, 64, 128])
# search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
# search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
# search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.025, 0.05, 0.1])
# search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])
# search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
#     [WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec')],
#     [WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec')],
#     [ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/')],
#     [BytePairEmbeddings('es')],
# ])
search_space.add(Parameter.EMBEDDINGS, hp.choice,
                 options=[[WordEmbeddings('../../../../Data/Models/Chars/lemma_lowercased_estenten11_freeling_v4_virt.gensim.vec')]])
test_file="de-da-te-ta.10E-4percent.conll.test.txt", dev_file="de-da-te-ta.10E-4percent.conll.dev.txt") corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus( data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt", test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt", dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt") corpus = MultiCorpus([corpus1, corpus2]) custom_embedding = WordEmbeddings( '../../glove/GloVe/vectors_converted_to_gensim.gensim') #bert_embedding = BertEmbeddings('bert-embedding-files/') word_embeddings = StackedEmbeddings([custom_embedding, WordEmbeddings('tr')]) search_space = SearchSpace() search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[word_embeddings]) #search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512]) search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256]) search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[2]) #search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5) search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2, 0.25]) search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16]) param_selector = SequenceTaggerParamSelector( corpus=corpus, tag_type='ner', base_path='./results_tr_glove_embedding_learning_rate', max_epochs=10,
if __name__ == "__main__": """ python param_search.py config_file """ with open(sys.argv[1], 'rb') as f: config = json.load(f) # get the corpus column_name_map = {0: config["label_name"], 1: "text"} corpus: Corpus = CSVClassificationCorpus(config["data_folder"], column_name_map, skip_header=True, delimiter='\t', # tab-separated files ) word_embeddings = [utils.get_general_embeddings(), utils.get_mixed_bio_embeddings(), utils.get_bio_embeddings()] search_space = SearchSpace() search_space.add(Parameter.EMBEDDINGS, hp.choice, options=word_embeddings) search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[128, 256]) search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2]) search_space.add(Parameter.BIDIRECTIONAL, hp.choice, options=[False, True]) search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5) search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2]) search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32]) param_selector = TextClassifierParamSelector( corpus=corpus, multi_label=False, base_path='resources/results'+config['task'], document_embedding_type='lstm', max_epochs=10, training_runs=1, optimization_value=OptimizationValue.DEV_SCORE
columns = {0: 'text', 1: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = args.ner_folder

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus = ColumnCorpus(data_folder, columns,
                      train_file='train.txt',
                      test_file='test.txt',
                      dev_file='dev.txt')
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Language models
lm_fwd = args.embedding + "_fwd/best-lm.pt"
lm_bwd = args.embedding + "_bwd/best-lm.pt"

# define your search space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice,
                 options=[StackedEmbeddings([FlairEmbeddings(lm_fwd),
                                             FlairEmbeddings(lm_bwd)])])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[512, 1024])
search_space.add(Parameter.ANNEAL_FACTOR, hp.choice, options=[0.5, 0.75])
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.1, 0.5, 1.0])
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5, 7])
search_space.add(Parameter.DROPOUT, hp.choice, options=[0.15])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8])

# create the parameter selector
param_selector = SequenceTaggerParamSelector(corpus, 'ner', args.dst,
                                             max_epochs=42,
                                             training_runs=1,
                                             optimization_value=OptimizationValue.DEV_SCORE)

# start the optimization
param_selector.optimize(search_space, max_evals=40)
def optimize():
    corpus, label_dictionary = load_corpus()
    corpus.downsample(0.01)

    # define your search space
    search_space = SearchSpace()

    # embeddings = [RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-base",
    #                                 layers="0,1,2,3,4,5,6,7,8,9,10,11,12",
    #                                 pooling_operation="first", use_scalar_mix=True)]
    embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32, 64])

    # create the parameter selector
    param_selector = TextClassifierParamSelector(
        corpus,
        False,
        'resources/results',
        'lstm',
        max_epochs=10,
        training_runs=3,
        optimization_value=OptimizationValue.DEV_SCORE,
        label_dictionary=label_dictionary)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
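# Once any of these searches finishes, Flair's ParamSelector writes its
# evaluation log to the base path. A minimal sketch for inspecting it,
# assuming the default file name param_selection.txt described in Flair's
# model-tuning tutorial (the base path below matches the snippet above):
from pathlib import Path

results_file = Path('resources/results') / 'param_selection.txt'
if results_file.exists():
    # each evaluation run is logged with its parameter combination and score
    print(results_file.read_text())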