def run_task(task):
    """Train and evaluate a tagger ``task.number_of_repeat`` times.

    Hydra launches each job inside an auto-generated working directory; that
    directory is kept as the result path, while the process chdirs back to the
    original cwd so relative data paths from the config still resolve.
    Per-repeat scores are dumped to ``<result_path>/<i>/score.json``.
    """
    result_path = os.getcwd()  # hydra's auto-generated job directory
    os.chdir(hydra.utils.get_original_cwd())

    corpus = load_corpus(task.data.data_folder,
                         task.data.tag_column,
                         task.data.pathology,
                         task.data.downsample_perc)

    for repeat_idx in range(task.number_of_repeat):
        print(f'Repeating: #{repeat_idx}')
        model_folder = f'{result_path}/{repeat_idx}'
        tagger, scores = train_eval_tagger(corpus, model_folder,
                                           cfg_model=task.model)
        dump_file(scores, model_folder, 'score.json')
        # Free GPU memory between repeats so successive trainings don't OOM.
        print('GC collect')
        gc.collect()
        torch.cuda.empty_cache()
def save(keys, value):
    """Persist ``value`` at the location addressed by ``keys``.

    Every key is stringified and appended to the module-level ``path`` tuple;
    the combined components are forwarded to ``dump_file``.
    """
    # NOTE(review): relies on a module-level `path` tuple being defined before
    # this is called — confirm against the surrounding file.
    return dump_file(value, *(path + tuple(map(str, keys))))
def run_experiment(config):
    """Run an active-learning emulation for the task/model/strategy in ``config``.

    For each of ``config.n_repeats`` repeats: builds the seed labelling
    (either a fixed 2% fraction or an explicit sample budget), constructs the
    tagger backend selected by ``config.model.model_type`` ('flair*', 'crf' or
    'bert'), runs ``emulate_active_learning`` and dumps per-repeat statistics
    to ``<models_path>/statistics<repeat>.json``. Repeats whose statistics
    file already exists are skipped.
    """
    print('Active learning strategy:', config.al.strat_name)
    print('Loading task...', config.data.task)
    # CRF models need preprocessed (feature-extracted) input; neural taggers don't.
    preprocess = (config.model.model_type == 'crf')
    print(config.data.data_folder)
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(
        config.data.data_folder, config.data.task,
        config.data.tag_column, preprocess)
    print('Done.')

    # NOTE(review): a pre-loop copy of the strat/y_seed/max_samples_number
    # setup was removed — every value it produced was unconditionally
    # recomputed at the top of each repeat below (the removed copy even called
    # y_train2y_seed_percent with a different arity), so it was dead code.

    for repeat in range(config.n_repeats):
        print(f'######################==Repeat {repeat} ==#####################')
        strat = strategies_to_try(config.al.strat_name)
        model_name = config.model.model_type

        if config.al.percent:
            # Seed the learner with a fixed fraction of the training data.
            percent = 0.02
            print('FULL:', len(y_train))
            y_seed = y_train2y_seed_percent(y_train)
            # Keep only the examples that actually received a seed label.
            # (An `elif all(e is None for e in y_seed)` branch from the
            # original was unreachable — it could only be tested when
            # `answ is not None`, contradicting *all* entries being None —
            # and was O(n^2); the direct test below is equivalent.)
            selector = [answ is not None for answ in y_seed]
            y_nonempty = np.array(y_seed)[selector]
            print('2PERCENT:', len(y_nonempty))
            max_samples_number = int(len(y_seed) * percent)
        else:
            y_seed = y_train2y_seed(y_train)
            max_samples_number = config.al.max_samples_number
        print('MAX_SAMPLES:', max_samples_number)

        if 'flair' in config.model.model_type:
            print(config.model.model_type)
            bayes_type = config.model.bayes_type if config.model.bayes else 'no_bayes'
            models_path = os.path.join(
                config.exp_path,
                f'{model_name}_{config.model.emb_name}_{bayes_type}/{config.al.strat_name}')
            os.makedirs(models_path, exist_ok=True)
            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue
            print('Embeddings', config.model.emb_name)
            emb = get_embeddings(config.model.emb_name)
            tagger = SequenceTagger(hidden_size=config.model.hidden_size,
                                    embeddings=emb(),
                                    tag_dictionary=tag_dictionary,
                                    tag_type=config.data.task,
                                    use_crf=True)
            print(config.model.bayes)
            if config.model.bayes:
                print('BAYES CHOSEN')
                # Keep dropout active at inference for MC-dropout uncertainty.
                convert_to_mc_dropout(
                    tagger,
                    (nn.Dropout, flair.nn.WordDropout, flair.nn.LockedDropout),
                    option='flair')
                active_tagger = LibActFlairBayes(
                    tagger, base_path=models_path,
                    reset_model_before_train=True,
                    mini_batch_size=config.model.bs,
                    eval_mini_batch_size=config.model.ebs,
                    checkpoint=False,
                    learning_rate=config.model.lr,
                    index_subset=False,
                    save_all_models=False,
                    max_epochs=config.model.n_epochs,
                    min_learning_rate=config.model.min_lr)
                print(active_tagger)
            else:
                active_tagger = LibActFlair(
                    tagger, base_path=models_path,
                    reset_model_before_train=True,
                    mini_batch_size=config.model.bs,
                    eval_mini_batch_size=config.model.ebs,
                    checkpoint=False,
                    learning_rate=config.model.lr,
                    index_subset=False,
                    save_all_models=False,
                    max_epochs=config.model.n_epochs,
                    min_learning_rate=config.model.min_lr)
            fit_model = False
        elif config.model.model_type == 'crf':
            models_path = os.path.join(config.exp_path, model_name)
            os.makedirs(models_path, exist_ok=True)
            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue
            active_tagger = LibActCrf(algorithm="lbfgs", c1=0.1, c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)
            fit_model = True
        elif config.model.model_type == 'bert':
            if config.model.bayes:
                libactnn = LibActNNBayes
                bayes_type = config.model.bayes_type
            else:
                libactnn = LibActNN
                bayes_type = 'no_bayes'
            models_path = os.path.join(config.exp_path, f'{model_name}_{bayes_type}')
            os.makedirs(models_path, exist_ok=True)
            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue
            # '[PAD]' takes index 0 so padding never collides with a real tag.
            index2tag = ['[PAD]'] + tag_dictionary.get_items()
            tag2index = {e: i for i, e in enumerate(index2tag)}
            active_tagger = create_libact_adaptor_bert(
                tag2index, index2tag, libactnn, config.model, config.cache_dir)
            fit_model = False

        # BUGFIX: the original passed config.al.max_samples_number here, which
        # silently discarded the percent-based budget computed (and printed)
        # above whenever config.al.percent was set.
        active_learn_alg_ctor = make_libact_strategy_ctor(
            lambda tr_ds: strat(tr_ds, active_tagger),
            max_samples_number=max_samples_number)
        active_learner = ActiveLearner(
            active_learn_alg_ctor=active_learn_alg_ctor,
            y_dtype='str',
            X_full_dataset=X_train,
            y_full_dataset=y_seed,
            X_test_dataset=X_test,
            y_test_dataset=y_test,
            model_evaluate=active_tagger,
            eval_metrics=[f1_score],
            rnd_start_steps=0)
        statistics = emulate_active_learning(
            y_train, active_learner,
            max_iterations=config.al.n_iterations,
            fit_model=fit_model)
        dump_file(statistics, models_path, f'statistics{repeat}.json')