Example #1
def evaluate(gridsearch=True, gen_error=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    ----------
    gridsearch : boolean, if True the gridsearch is performed.

    gen_error : boolean, if True an estimate for the generalization
        error is computed.

    Returns
    -------
    Nothing, but saves the results of the performed computations.
    """
    # since there are no hyperparameters to be optimized, we only need
    # the generalization error estimate
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(
            results=MODEL.grid_search_.cv_results_,
            folder='lda')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores,
                               folder='lda',
                               name='gen_error.npy')
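
The nested_cv call above follows the standard nested cross-validation
pattern. A minimal sketch of that pattern in plain scikit-learn, with a toy
dataset and a hypothetical parameter grid (none of these names come from the
SMSGuru code):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Inner loop: the gridsearch selects hyperparameters on each training split.
inner = GridSearchCV(SVC(), param_grid={'C': [0.1, 1, 10]},
                     cv=5, scoring='f1_macro')

# Outer loop: scores the tuned model on held-out folds; the mean of these
# scores is the kind of generalization error estimate saved above.
nested_scores = cross_val_score(inner, X, y, cv=5, scoring='f1_macro')
print(nested_scores.mean())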
Example #2
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    ----------
    gridsearch : boolean, if True the gridsearch is performed.

    gen_error : boolean, if True an estimate for the generalization
        error is computed.

    memory : boolean, if True the memory option is used.

    Returns
    -------
    Nothing, but saves the results of the performed computations.
    """
    MODEL = model.SMSGuruModel(classifier=CLASSIFIER, reduction=None,
                               memory=memory)
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID, n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(
            results=MODEL.grid_search_.cv_results_,
            folder='logreg')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores,
                               folder='logreg',
                               name='gen_error.npy')
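
The memory flag presumably toggles transformer caching inside SMSGuruModel;
assuming it is forwarded to a scikit-learn Pipeline, a minimal sketch of that
mechanism (the pipeline steps here are illustrative, not SMSGuru's):

from tempfile import mkdtemp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# With memory set, fitted transformers are cached on disk, so they are not
# refit when only downstream hyperparameters change during a gridsearch.
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('clf', LogisticRegression())],
                memory=mkdtemp())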
Example #3
def evaluate(gridsearch=True, gen_error=True):
    # since there are no hyperparameters to be optimized, we only need
    # the generalization error estimate
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(results=MODEL.grid_search_.cv_results_,
                               folder='multinb')

    if gen_error:
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores,
                               folder='multinb',
                               name='gen_error.npy')
Example #4
def evaluate(gridsearch=True, gen_error=True, memory=True):
    """Evaluate model

    Compute either an estimate for the generalization error for
    f1_macro with a nested gridsearch or evaluate the parameter
    grid in a simple gridsearch.

    Parameters
    ----------
    gridsearch : boolean, if True the gridsearch is performed.

    gen_error : boolean, if True an estimate for the generalization
        error is computed.

    memory : boolean, if True the memory option is used.

    Returns
    -------
    Nothing, but saves the results of the performed computations.
    """

    MODEL = model.SMSGuruModel(classifier=CLASSIFIER,
                               pre_reduction=PRE_REDUCTION,
                               reduction=LDA(),
                               memory=memory)
    MODEL.set_question_loader(subcats=shared.SUBCATS)
    if gridsearch:
        MODEL.gridsearch(param_grid=PARAM_GRID_DIM,
                         n_jobs=shared.N_JOBS,
                         CV=shared.CV)
        shared.save_and_report(results=MODEL.grid_search_.cv_results_,
                               folder='lda_svm')

    if gen_error:
        # since in this case the higher the dimension, the better the
        # estimator, we do not include the lower dimensions in this search
        nested_scores = MODEL.nested_cv(param_grid=PARAM_GRID, CV=shared.CV)
        shared.save_and_report(results=nested_scores,
                               folder='lda_svm',
                               name='gen_error.npy')
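
Assuming LDA() here is scikit-learn's LinearDiscriminantAnalysis used as a
supervised reduction step, a minimal sketch of the reduction-then-classifier
chain SMSGuruModel appears to assemble (PRE_REDUCTION omitted, names
illustrative):

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# LDA can project onto at most n_classes - 1 dimensions; more retained
# dimensions tend to help here, as the comment in the gen_error branch notes.
pipe = Pipeline([('reduction', LinearDiscriminantAnalysis(n_components=2)),
                 ('classifier', SVC())])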
Example #5
import numpy as np

corr_micro = []
corr_macro = []
for train_index, test_index in skf.split(questions, categoryids):
    q_train, q_test = questions[train_index], questions[test_index]
    cat_train, cat_test = categoryids[train_index], categoryids[test_index]

    # fit all classifiers
    for clf in clfs:
        clf.fit(q_train, cat_train)
    # predict_proba for all classifiers with the best param config
    probas = [clf.predict_proba(q_test) for clf in clfs]

    # micro-averaged corrcoef
    probas_micro = np.asarray([prob.reshape(-1) for prob in probas])
    corr_micro.append(np.corrcoef(probas_micro))

    # macro-averaged corrcoef
    probas_macro = np.asarray(probas)
    corr_macro_class = [
        np.corrcoef(probs) for probs in np.rollaxis(probas_macro, 2)
    ]
    corr_macro.append(np.mean(np.asarray(corr_macro_class), 0))

corr_micro = np.mean(np.asarray(corr_micro), axis=0)
corr_macro = np.mean(np.asarray(corr_macro), axis=0)
shared.save_and_report(results={'corr_micro': corr_micro,
                                'corr_macro': corr_macro},
                       folder='ensemble',
                       name='corr')
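
The loop assumes skf, clfs, questions and categoryids are defined elsewhere.
A hypothetical setup consistent with the indexing used above (the classifier
choices are placeholders):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

# Any classifiers exposing predict_proba work here.
clfs = [LogisticRegression(max_iter=1000), MultinomialNB()]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# questions and categoryids must support fancy indexing, e.g. numpy arrays:
# questions[train_index], categoryids[train_index]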
Example #6
def evaluate(subcats=False,
             comb_method='avg',
             gen_error=False,
             gridsearch=False,
             save_avg_path='./results/gridsearch/ensemble/raw/'):
    """
    Run an ensemble method.

    A voting classifier is used with three inner classifiers (SVM, mNB, LDA).
    The ensemble method is then either run in a gridsearch to evaluate the
    associated parameter grid or in a nested gridsearch to get an estimate
    for the generalization error. There is also the beginning of a bagging
    classifier implementation, but it is not working so far.

    Parameters
    ----------
    subcats : boolean, if True subcategories are used as labels, else
        parent categories.

    comb_method : string, determines the method used to combine the
        classifiers in the voting classifier. Can be either 'mult' or 'avg';
        the classifiers are then combined by multiplying or averaging,
        respectively.

    gen_error : boolean, if True a nested gridsearch is performed to estimate
        the generalization error.

    gridsearch : boolean, if True a gridsearch is performed to find the best
        parameter combination from the associated grid.

    save_avg_path : string, determines where the probabilities from the
        voting classifier are saved during the nested gridsearch.
    """
    print('subcats: {}, comb_method: {}'
          ', save_avg_path: {}'.format(subcats, comb_method, save_avg_path))

    if not os.path.exists(save_avg_path):
        print('create directory: {}'.format(save_avg_path))
        os.makedirs(save_avg_path)

    question_loader = ql.QuestionLoader(qfile=shared.QFILE,
                                        catfile=shared.CATFILE,
                                        subcats=subcats,
                                        metadata=True,
                                        verbose=True)

    cv = 5
    verbose = 100

    if comb_method != 'bagging':
        # If a classifier is changed, the grid might have to be changed, too.
        # Put the estimator with the best expected performance in the first
        # position! Then its probability output will be saved!
        SVM = shared.SVM_subcats if subcats else shared.SVM_parentcats
        MNB = shared.MNB_subcats if subcats else shared.MNB_parentcats
        ensemble = VotingClassifierB(estimators=[('svm', SVM), ('mnb', MNB),
                                                 ('lda', shared.LDA)],
                                     voting='soft',
                                     comb_method=comb_method,
                                     save_avg_path=save_avg_path)

        # ##################### without gridsearch ############################
        # scores = cross_val_score(
        #     ensemble, question_loader.questions,
        # question_loader.categoryids, cv=cv,
        #     scoring='f1_macro', n_jobs=-1, verbose=verbose)
        #
        # shared.save_and_report(
        #     results=scores, folder='ensemble', name='gen_error.npy')

        # ##################### with gridsearch ###############################
        # svm param
        C_RANGE = np.logspace(-5, 5, 11)

        # grid
        PARAM_GRID_l = {
            'svm__classifier__base_estimator__C': C_RANGE,
            'svm__union__bow__vectorize__min_df': shared.MIN_DF,
            'svm__union__bow__tfidf': [None, TfidfTransformer()],
            'mnb__union__bow__vectorize__min_df': shared.MIN_DF,
            'mnb__union__bow__tfidf': [None, TfidfTransformer()],
            'lda__union__bow__vectorize__min_df': shared.MIN_DF,
            'lda__union__bow__tfidf': [None, TfidfTransformer()]
        }

        PARAM_GRID_s = {'svm__classifier__base_estimator__C': C_RANGE}

        PARAM_GRID_m = {
            'svm__classifier__base_estimator__C': C_RANGE,
            'svm__union__bow__vectorize__min_df': shared.MIN_DF,
            'mnb__union__bow__vectorize__min_df': shared.MIN_DF,
            'lda__union__bow__vectorize__min_df': shared.MIN_DF
        }

        PARAM_GRID = PARAM_GRID_m
        if gridsearch:
            grid = GridSearchCV(estimator=ensemble,
                                cv=cv,
                                param_grid=PARAM_GRID,
                                refit=False,
                                error_score=-1,
                                n_jobs=-1,
                                verbose=verbose,
                                scoring='f1_macro')

            grid.fit(question_loader.questions, question_loader.categoryids)
            if subcats:
                name = comb_method + 'subcats' + 'grid.npy'
            else:
                name = comb_method + 'grid.npy'
            shared.save_and_report(results=grid.cv_results_,
                                   folder='ensemble',
                                   name=name)

        if gen_error:
            clf = GridSearchCVB(estimator=ensemble,
                                param_grid=PARAM_GRID,
                                cv=cv,
                                n_jobs=-1,
                                scoring='f1_macro',
                                verbose=verbose)

            nested_cv_scores = cross_val_score(clf,
                                               X=question_loader.questions,
                                               y=question_loader.categoryids,
                                               cv=cv,
                                               scoring=f1_macroB,
                                               verbose=verbose)

    if comb_method == 'bagging':
        base_estimator = shared.SVM
        base_estimator.set_params(question_created_at=None,
                                  union__bow__selector=None)

        clf = BaggingClassifier(base_estimator,
                                n_estimators=50,
                                max_samples=1.0)

        X = [pair['question'] for pair in question_loader.questions]
        # X = np.asarray(X).reshape((-1, 1))
        nested_cv_scores = cross_val_score(clf,
                                           X=X,
                                           y=question_loader.categoryids,
                                           cv=cv,
                                           scoring=f1_macroB,
                                           verbose=verbose)

    if gen_error:
        if subcats:
            name = comb_method + 'subcats' + 'gen.npy'
        else:
            name = comb_method + 'gen.npy'
        shared.save_and_report(results=nested_cv_scores,
                               folder='ensemble',
                               name=name)
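
A minimal sketch of the two comb_method rules, with made-up probabilities for
three classifiers on one two-class sample: 'avg' averages the per-class
probabilities, while 'mult' takes their product (renormalized here so each
row stays a probability distribution, an assumption about VotingClassifierB):

import numpy as np

# shape: (n_classifiers, n_samples, n_classes)
probas = np.array([[[0.7, 0.3]],
                   [[0.6, 0.4]],
                   [[0.9, 0.1]]])

avg = probas.mean(axis=0)                # 'avg': arithmetic mean
mult = probas.prod(axis=0)
mult /= mult.sum(axis=1, keepdims=True)  # 'mult': renormalized product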