Example #1
0
def run_task(task):
    """Train and evaluate a tagger ``task.number_of_repeat`` times.

    Each repeat trains in its own numbered sub-folder of the Hydra run
    directory and dumps its evaluation scores to ``score.json`` there.
    """
    # Hydra chdir's into an auto-generated run dir; remember it as the
    # output root, then hop back to the project root for data loading.
    result_path = os.getcwd()
    os.chdir(hydra.utils.get_original_cwd())

    corpus = load_corpus(task.data.data_folder,
                         task.data.tag_column,
                         task.data.pathology,
                         task.data.downsample_perc)

    for repeat_idx in range(task.number_of_repeat):
        print(f'Repeating: #{repeat_idx}')

        model_folder = f'{result_path}/{repeat_idx}'
        tagger, scores = train_eval_tagger(
            corpus, model_folder, cfg_model=task.model)
        dump_file(scores, model_folder, 'score.json')

        # Free GPU memory between repeats so runs don't accumulate state.
        print('GC collect')
        gc.collect()
        torch.cuda.empty_cache()
Example #2
0
 def save(keys, value):
     """Dump *value* under the enclosing ``path`` extended by *keys*.

     Every key is stringified before being appended to the path tuple;
     returns whatever ``dump_file`` returns.
     """
     components = tuple(str(key) for key in keys)
     return dump_file(value, *(path + components))
Example #3
0
def run_experiment(config):
    """Run an active-learning NER experiment described by *config*.

    For each of ``config.n_repeats`` repeats: build the query strategy and
    the tagger (flair / CRF / BERT, optionally Bayesian), emulate active
    learning on the training pool, and dump per-repeat statistics to
    ``statistics<repeat>.json``. Repeats whose statistics file already
    exists are skipped, so the experiment is resumable.

    Raises:
        ValueError: if ``config.model.model_type`` matches no known model.
    """
    print('Active learning strategy:', config.al.strat_name)

    print('Loading task...', config.data.task)
    # Only the CRF consumes pre-extracted features; neural taggers get raw text.
    preprocess = (config.model.model_type == 'crf')
    print(config.data.data_folder)
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(config.data.data_folder,
                                                                 config.data.task,
                                                                 config.data.tag_column,
                                                                 preprocess)
    print('Done.')

    for repeat in range(config.n_repeats):
        print(f'######################==Repeat {repeat} ==#####################')

        strat = strategies_to_try(config.al.strat_name)
        model_name = config.model.model_type

        if config.al.percent:
            # Seed the learner with a fixed fraction of the training labels
            # and budget the AL loop to that same fraction.
            percent = 0.02
            print('FULL:', len(y_train))
            y_seed = y_train2y_seed_percent(y_train, percent)
            # Keep only seed entries that actually carry a label.
            selector = [answ is not None for answ in y_seed]
            y_nonempty = np.array(y_seed)[selector]
            print('2PERCENT:', len(y_nonempty))
            max_samples_number = int(len(y_seed) * percent)
        else:
            y_seed = y_train2y_seed(y_train)
            max_samples_number = config.al.max_samples_number

        print('MAX_SAMPLES:', max_samples_number)

        if 'flair' in config.model.model_type:
            print(config.model.model_type)

            bayes_type = config.model.bayes_type if config.model.bayes else 'no_bayes'
            models_path = os.path.join(config.exp_path, f'{model_name}_{config.model.emb_name}_{bayes_type}/{config.al.strat_name}')
            os.makedirs(models_path, exist_ok=True)

            # Resumability: skip repeats that already produced statistics.
            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            print('Embeddings', config.model.emb_name)
            emb = get_embeddings(config.model.emb_name)

            tagger = SequenceTagger(hidden_size=config.model.hidden_size,
                                    embeddings=emb(),
                                    tag_dictionary=tag_dictionary,
                                    tag_type=config.data.task,
                                    use_crf=True)
            print(config.model.bayes)
            if config.model.bayes:
                print('BAYES CHOSEN')
                # MC-dropout: keep dropout active at inference for uncertainty.
                convert_to_mc_dropout(tagger, (nn.Dropout, flair.nn.WordDropout, flair.nn.LockedDropout), option='flair')
                active_tagger = LibActFlairBayes(tagger,
                                                 base_path=models_path,
                                                 reset_model_before_train=True,
                                                 mini_batch_size=config.model.bs,
                                                 eval_mini_batch_size=config.model.ebs,
                                                 checkpoint=False,
                                                 learning_rate=config.model.lr,
                                                 index_subset=False,
                                                 save_all_models=False,
                                                 max_epochs=config.model.n_epochs,
                                                 min_learning_rate=config.model.min_lr)
                print(active_tagger)
            else:
                active_tagger = LibActFlair(tagger,
                                            base_path=models_path,
                                            reset_model_before_train=True,
                                            mini_batch_size=config.model.bs,
                                            eval_mini_batch_size=config.model.ebs,
                                            checkpoint=False,
                                            learning_rate=config.model.lr,
                                            index_subset=False,
                                            save_all_models=False,
                                            max_epochs=config.model.n_epochs,
                                            min_learning_rate=config.model.min_lr)
            # The flair adaptor handles its own fitting inside the AL loop.
            fit_model = False

        elif config.model.model_type == 'crf':
            models_path = os.path.join(config.exp_path, model_name)
            os.makedirs(models_path, exist_ok=True)

            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            active_tagger = LibActCrf(algorithm="lbfgs",
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)
            fit_model = True

        elif config.model.model_type == 'bert':
            if config.model.bayes:
                libactnn = LibActNNBayes
                bayes_type = config.model.bayes_type
            else:
                libactnn = LibActNN
                bayes_type = 'no_bayes'

            models_path = os.path.join(config.exp_path, f'{model_name}_{bayes_type}')
            os.makedirs(models_path, exist_ok=True)

            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            # Reserve index 0 for padding so real tags start at 1.
            index2tag = ['[PAD]'] + tag_dictionary.get_items()
            tag2index = {e: i for i, e in enumerate(index2tag)}
            active_tagger = create_libact_adaptor_bert(tag2index, index2tag, libactnn, config.model, config.cache_dir)
            fit_model = False

        else:
            # Fail fast: otherwise fit_model/models_path would be unbound below.
            raise ValueError(f'Unknown model type: {config.model.model_type}')

        # BUG FIX: honour the locally computed budget (percent-derived in
        # percent mode) instead of always reading config.al.max_samples_number.
        active_learn_alg_ctor = make_libact_strategy_ctor(lambda tr_ds: strat(
            tr_ds, active_tagger), max_samples_number=max_samples_number)

        active_learner = ActiveLearner(active_learn_alg_ctor=active_learn_alg_ctor,
                                       y_dtype='str',
                                       X_full_dataset=X_train,
                                       y_full_dataset=y_seed,
                                       X_test_dataset=X_test,
                                       y_test_dataset=y_test,
                                       model_evaluate=active_tagger,
                                       eval_metrics=[f1_score],
                                       rnd_start_steps=0)

        statistics = emulate_active_learning(y_train, active_learner,
                                             max_iterations=config.al.n_iterations,
                                             fit_model=fit_model)
        dump_file(statistics, models_path, f'statistics{repeat}.json')