Exemplo n.º 1
def crossValidate(data, labels, chunks, dataCont=None):
    ''' Perform n-fold cross validation with naive Bayes and logistic regression.

    Each entry of `chunks` holds the sample indices of one fold. Every fold
    serves once as the validation set while all remaining folds are
    concatenated into the training set, so the procedure is repeated
    len(chunks) times (e.g. with 5 chunks, 4 are used for training and 1 for
    validation on each pass). On every pass both a naive Bayes model and a
    logistic regression model are fitted and scored on the held-out fold.

    Parameters
    ----------
    data : array indexed along its first axis by the fold indices; used for
        naive Bayes, and for logistic regression unless `dataCont` is given.
    labels : array indexed like `data`. If it has more than one dimension the
        validation labels are collapsed with argmax over axis 1 and the
        multinomial logistic routines are used (presumably one-hot encoded
        classes -- confirm against callers).
    chunks : sequence of index arrays, one per fold.
    dataCont : optional array indexed like `data`. When not None it replaces
        `data` for the logistic regression fit/prediction only.

    Returns
    -------
    errLG : array of len(chunks) validation errors for logistic regression.
    errNB : array of len(chunks) validation errors for naive Bayes.
    nIters : int array of len(chunks) with the second value returned by the
        logistic regression fitting routine (the iteration count, per the
        original author's comment).

    Notes
    -----
    Relies on `NB`, `LG` and `errRate`, which are defined outside this block.
    At least two folds are required for the training set to be non-empty.
    '''
    nFolds = len(chunks)
    errLG = np.empty(nFolds)  # pred error, logistic regression
    nIters = np.empty(nFolds, int)  # number of iterations for logistic reg
    errNB = np.empty(nFolds)  # pred error, naive Bayes

    # Loop-invariant: decides label collapsing and which logistic model to use
    multiDimLabels = labels.ndim > 1

    for ck in range(nFolds):
        # get index and dataset for current fold of cross-validation
        trnIdx = np.hstack([x for n, x in enumerate(chunks) if n != ck])
        vldIdx = np.hstack([chunks[ck]])
        dataTrain, labelTrain = data[trnIdx], labels[trnIdx]  # training
        dataTest, labelTest = data[vldIdx], labels[vldIdx]  # validation

        if multiDimLabels:
            # collapse validation labels to 1D class indices; training labels
            # are passed on unchanged to the fitting routines
            labelTest = labelTest.argmax(axis=1)

        ## Fit and predict with naive Bayes
        prb = NB.NB_Train(dataTrain, labelTrain)
        predNB = NB.NB_Pred(dataTest, prb)
        errNB[ck] = errRate(predNB, labelTest)  # error with naive Bayes

        ## Fit and predict with logistic regression
        if dataCont is not None:  # if given non-discretized data
            dataTrain = dataCont[trnIdx]
            dataTest = dataCont[vldIdx]
        # NOTE(review): the meaning of the 0.5 argument is defined by the LG
        # module (not visible here) -- confirm before changing it.
        if multiDimLabels:  # multi-dimensional labels: multinomial logistic
            wts, nIters[ck] = LG.fitLogisticNK(dataTrain, labelTrain, 0.5)
            predLG = LG.predLogisticNK(dataTest, wts)
        else:  # 1D response var, use regular logistic regression
            wt, nIters[ck] = LG.fitLogisticReg(dataTrain, labelTrain, 0.5)
            predLG = LG.predLogistic(dataTest, wt)
        errLG[ck] = errRate(predLG, labelTest)  # error with logistic reg

    return errLG, errNB, nIters