Пример #1
0
def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    arr = asanyarray(a)

    rcount = _count_reduce_items(arr, axis)
    # Make this warning show up on top.
    if ddof >= rcount:
        warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning,
                      stacklevel=2)

    # Cast bool, unsigned int, and int to float64 by default
    if dtype is None and issubclass(arr.dtype.type, (nt.integer, nt.bool_)):
        dtype = mu.dtype('f8')

    # Compute the mean.
    # Note that if dtype is not of inexact type then arraymean will
    # not be either.
    arrmean = umr_sum(arr, axis, dtype, keepdims=True)
    if isinstance(arrmean, mu.ndarray):
        arrmean = um.true_divide(
                arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
    else:
        arrmean = arrmean.dtype.type(arrmean / rcount)

    # Compute sum of squared deviations from mean
    # Note that x may not be inexact and that we need it to be an array,
    # not a scalar.
    x = asanyarray(arr - arrmean)
    if issubclass(arr.dtype.type, (nt.floating, nt.integer)):
        x = um.multiply(x, x, out=x)
    else:
        x = um.multiply(x, um.conjugate(x), out=x).real

    ret = umr_sum(x, axis, dtype, out, keepdims)

    # Compute degrees of freedom and make sure it is not negative.
    rcount = max([rcount - ddof, 0])

    # divide by degrees of freedom
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(
                ret, rcount, out=ret, casting='unsafe', subok=False)
    elif hasattr(ret, 'dtype'):
        ret = ret.dtype.type(ret / rcount)
    else:
        ret = ret / rcount

    return ret
Пример #2
0
def _nanvar(a, axis=None, dtype=None, out=None, ddof=0,
                            keepdims=False):
    # Using array() instead of asanyarray() because the former always
    # makes a copy, which is important due to the copyto() action later
    arr = array(a, subok=True)
    mask = isnan(arr)

    # First compute the mean, saving 'rcount' for reuse later
    if dtype is None and (issubdtype(arr.dtype, nt.integer) or
                          issubdtype(arr.dtype, nt.bool_)):
        arrmean = um.add.reduce(arr, axis=axis, dtype='f8', keepdims=True)
    else:
        mu.copyto(arr, 0.0, where=mask)
        arrmean = um.add.reduce(arr, axis=axis, dtype=dtype,
                                keepdims=True)
    rcount = (~mask).sum(axis=axis, keepdims=True)
    if isinstance(arrmean, mu.ndarray):
        arrmean = um.true_divide(arrmean, rcount,
                            out=arrmean, casting='unsafe', subok=False)
    else:
        arrmean = arrmean / float(rcount)

    # arr - arrmean
    x = arr - arrmean
    mu.copyto(x, 0.0, where=mask)

    # (arr - arrmean) ** 2
    if issubdtype(arr.dtype, nt.complex_):
        x = um.multiply(x, um.conjugate(x), out=x).real
    else:
        x = um.multiply(x, x, out=x)

    # add.reduce((arr - arrmean) ** 2, axis)
    ret = um.add.reduce(x, axis=axis, dtype=dtype, out=out,
                        keepdims=keepdims)

    # add.reduce((arr - arrmean) ** 2, axis) / (n - ddof)
    if not keepdims and isinstance(rcount, mu.ndarray):
        rcount = rcount.squeeze(axis=axis)
    rcount -= ddof
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(ret, rcount,
                        out=ret, casting='unsafe', subok=False)
    else:
        ret = ret / float(rcount)

    return ret
Пример #3
0
def _var(a, axis=None, dtype=None, out=None, ddof=0,
                            skipna=False, keepdims=False):
    arr = asanyarray(a)

    # First compute the mean, saving 'rcount' for reuse later
    if dtype is None and arr.dtype.kind in ['b','u','i']:
        arrmean = um.add.reduce(arr, axis=axis, dtype='f8',
                            skipna=skipna, keepdims=True)
    else:
        arrmean = um.add.reduce(arr, axis=axis, dtype=dtype,
                            skipna=skipna, keepdims=True)
    rcount = mu.count_reduce_items(arr, axis=axis,
                            skipna=skipna, keepdims=True)
    if isinstance(arrmean, mu.ndarray):
        arrmean = um.true_divide(arrmean, rcount,
                            out=arrmean, casting='unsafe', subok=False)
    else:
        arrmean = arrmean / float(rcount)

    # arr - arrmean
    x = arr - arrmean

    # (arr - arrmean) ** 2
    if arr.dtype.kind == 'c':
        x = um.multiply(x, um.conjugate(x), out=x).real
    else:
        x = um.multiply(x, x, out=x)

    # add.reduce((arr - arrmean) ** 2, axis)
    ret = um.add.reduce(x, axis=axis, dtype=dtype, out=out,
                                skipna=skipna, keepdims=keepdims)

    # add.reduce((arr - arrmean) ** 2, axis) / (n - ddof)
    if not keepdims and isinstance(rcount, mu.ndarray):
        rcount = rcount.squeeze(axis=axis)
    rcount -= ddof
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(ret, rcount,
                        out=ret, casting='unsafe', subok=False)
    else:
        ret = ret / float(rcount)

    return ret
Пример #4
0
def calcEk(oS, k):
    '''
    对于给定的 alpha, 计算 E值
    :param oS:
    :param k:
    :return:
    '''
    fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek
Пример #5
0
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    '''
    基于单层决策树的AdaBoost训练过程
    :param dataArr: 数据集
    :param classLabels: 类标签
    :param numIt: 迭代次数, 用户自定义指定
    :return: weakClassArr, 弱分类器集合;aggClassEst,每个数据点的类别估计累计值
    '''
    # 初始化
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # 初始化概率分布向量,其元素之和为 1
    aggClassEst = mat(zeros((m, 1)))

    for i in range(numIt):
        # 构建单层决策树
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print "D:", D.T

        # alpha每个分类器配备的权重值, 计算公式:alpha = (1/2) * ln[(1-e) / e]
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)  # 存储最佳决策树
        print "classEst: ", classEst.T

        # 更新权重向量D
        # 若正确分类,D[t + 1] = [D[t]*exp(-a) / sum(D)]
        # 若错误分类,D[t + 1] = [D[t]*exp(+a) / sum(D)]
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))  # Calc New D for next iteration
        D = D / D.sum()

        aggClassEst += alpha * classEst  # 更新累计类别估计值
        print "aggClassEst: ", aggClassEst.T
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m  # 计算错误率
        print "total error: ", errorRate

        if errorRate == 0.0:
            break  # 为0, 退出循环

    return weakClassArr, aggClassEst
Пример #6
0
def testDigits(kTup=('rbf', 10)):
    '''
    测试基于SVM的手写数字识别系统
    :param kTup: 输入参数,元组类型
    :return:
    '''
    dataArr, labelArr = loadImages(os.path.dirname(os.getcwd()) +
                                   '\\datas\\digits\\trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]

    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0

    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b

        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the training error rate is: %f" % (float(errorCount) / m)

    dataArr, labelArr = loadImages(os.path.dirname(os.getcwd()) +
                                   '\\datas\\digits\\testDigits')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)

    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b

        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the test error rate is: %f" % (float(errorCount) / m)
Пример #7
0
def testRbf(k1=1.3):
    '''
    利用核函数进行分类的径向基测试函数, 数据集表现为圆形分类,如figure_3.png
    :param k1: 输入参数,高斯径向基中的用户自定义变量
    :return:
    '''
    dataArr, labelArr = loadDataSet(os.path.dirname(os.getcwd()) +
                                    '\\datas\\testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]  # 获取唯一的支持向量
    labelSV = labelMat[svInd]

    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0

    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b

        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the training error rate is: %f" % (float(errorCount) / m)

    dataArr, labelArr = loadDataSet(os.path.dirname(os.getcwd()) +
                                    '\\datas\\testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)

    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b

        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the test error rate is: %f" % (float(errorCount) / m)
Пример #8
0
 def test_wrap_with_iterable(self):
     # test fix for bug #1026:
     class with_wrap(np.ndarray):
         __array_priority__ = 10
         def __new__(cls):
             return np.asarray(1).view(cls).copy()
         def __array_wrap__(self, arr, context):
             return arr.view(type(self))
     a = with_wrap()
     x = ncu.multiply(a, (1, 2, 3))
     self.assertTrue(isinstance(x, with_wrap))
     assert_array_equal(x, np.array((1, 2, 3)))
Пример #9
0
def crossValidation(xArr, yArr, numVal=10):
    '''
    交叉验证测试岭回归
    :param xArr:
    :param yArr:
    :param numVal:
    :return:
    '''
    m = len(yArr)
    indexList = range(m)
    errorMat = zeros((numVal, 30))

    for i in range(numVal):
        trainX = []
        trainY = []
        testX = []
        testY = []
        random.shuffle(indexList)

        for j in range(m):
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])

        wMat = ridgeTest(trainX, trainY)
        for k in range(30):
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, array(testY))
            print errorMat[i, k]

    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]

    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX

    print "the best model from Ridge Regression is:\n", unReg
    print "with constant term: ", -1 * sum(multiply(meanX, unReg)) + mean(yMat)
Пример #10
0
def calcWs(alphas, dataArr, classLabels):
    '''
    计算 w值
    :param alphas: 参数
    :param dataArr: 数据集
    :param classLabels: 类标签
    :return:
    '''
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))

    for i in range(m):
        w += multiply(alphas[i] * labelMat[i], X[i, :].T)

    return w
Пример #11
0
def _var(a,
         axis=None,
         dtype=None,
         out=None,
         ddof=0,
         keepdims=False,
         *,
         where=True):
    arr = asanyarray(a)

    rcount = _count_reduce_items(arr, axis, keepdims=keepdims, where=where)
    # Make this warning show up on top.
    if ddof >= rcount if where is True else umr_any(ddof >= rcount):
        warnings.warn("Degrees of freedom <= 0 for slice",
                      RuntimeWarning,
                      stacklevel=2)

    # Cast bool, unsigned int, and int to float64 by default
    if dtype is None and issubclass(arr.dtype.type, (nt.integer, nt.bool_)):
        dtype = mu.dtype('f8')

    # Compute the mean.
    # Note that if dtype is not of inexact type then arraymean will
    # not be either.
    arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
    # The shape of rcount has to match arrmean to not change the shape of out
    # in broadcasting. Otherwise, it cannot be stored back to arrmean.
    if rcount.ndim == 0:
        # fast-path for default case when where is True
        div = rcount
    else:
        # matching rcount to arrmean when where is specified as array
        div = rcount.reshape(arrmean.shape)
    if isinstance(arrmean, mu.ndarray):
        arrmean = um.true_divide(arrmean,
                                 div,
                                 out=arrmean,
                                 casting='unsafe',
                                 subok=False)
    else:
        arrmean = arrmean.dtype.type(arrmean / rcount)

    # Compute sum of squared deviations from mean
    # Note that x may not be inexact and that we need it to be an array,
    # not a scalar.
    x = asanyarray(arr - arrmean)

    if issubclass(arr.dtype.type, (nt.floating, nt.integer)):
        x = um.multiply(x, x, out=x)
    # Fast-paths for built-in complex types
    elif x.dtype in _complex_to_float:
        xv = x.view(dtype=(_complex_to_float[x.dtype], (2, )))
        um.multiply(xv, xv, out=xv)
        x = um.add(xv[..., 0], xv[..., 1], out=x.real).real
    # Most general case; includes handling object arrays containing imaginary
    # numbers and complex types with non-native byteorder
    else:
        x = um.multiply(x, um.conjugate(x), out=x).real

    ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)

    # Compute degrees of freedom and make sure it is not negative.
    rcount = um.maximum(rcount - ddof, 0)

    # divide by degrees of freedom
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(ret,
                             rcount,
                             out=ret,
                             casting='unsafe',
                             subok=False)
    elif hasattr(ret, 'dtype'):
        ret = ret.dtype.type(ret / rcount)
    else:
        ret = ret / rcount

    return ret
Пример #12
0
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    '''
    简化版 SMO算法实现
    :param dataMatIn: 数据集
    :param classLabels: 类标签
    :param C: 常数 C
    :param toler: 容错率
    :param maxIter: 取消前最大的循环次数
    :return:
    '''
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()

    # 初始化
    b = 0
    m, n = shape(dataMatrix)
    alphas = mat(zeros((m, 1)))
    iter = 0

    while (iter < maxIter):
        # 如果Alpha可以更改进入优化过程
        alphaPairsChanged = 0
        for i in range(m):
            fXi = float(multiply(alphas, labelMat).T *
                        (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])

            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
                    ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                j = selectJrand(i, m)  # 随机选择第二个alpha
                fXj = float(multiply(alphas, labelMat).T *
                            (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()

                # 保证alpha在0和C之间
                if (labelMat[i] != labelMat[j]):
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])

                if L == H:
                    print "L==H"
                    continue

                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - \
                      dataMatrix[i, :] * dataMatrix[i, :].T - \
                      dataMatrix[j, :] * dataMatrix[j, :].T
                if eta >= 0:
                    print "eta>=0"
                    continue

                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)

                if (abs(alphas[j] - alphaJold) < 0.00001):
                    print "j not moving enough"
                    continue
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) \
                              * dataMatrix[i, :] * dataMatrix[i, :].T - \
                     labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i, :] * \
                     dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * \
                              dataMatrix[i, :] * dataMatrix[j, :].T - \
                     labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[j, :] * \
                     dataMatrix[j, :].T

                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alphaPairsChanged += 1
                print "iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)

        if (alphaPairsChanged == 0):
            iter += 1
        else:
            iter = 0
        print "iteration number: %d" % iter
    return b, alphas
Пример #13
0
def average(a, axis=None, weights=None, returned=False):
    """average(a, axis=None weights=None, returned=False)

    Average the array over the given axis.  If the axis is None,
    average over all dimensions of the array.  Equivalent to
    a.mean(axis) and to

      a.sum(axis) / size(a, axis)

    If weights are given, result is:
        sum(a * weights,axis) / sum(weights,axis),
    where the weights must have a's shape or be 1D with length the
    size of a in the given axis. Integer weights are converted to
    Float.  Not specifying weights is equivalent to specifying
    weights that are all 1.

    If 'returned' is True, return a tuple: the result and the sum of
    the weights or count of values. The shape of these two results
    will be the same.

    Raises ZeroDivisionError if appropriate.  (The version in MA does
    not -- it returns masked values).

    """
    if axis is None:
        a = array(a).ravel()
        if weights is None:
            n = add.reduce(a)
            d = len(a) * 1.0
        else:
            w = array(weights).ravel() * 1.0
            n = add.reduce(multiply(a, w))
            d = add.reduce(w)
    else:
        a = array(a)
        ash = a.shape
        if ash == ():
            a.shape = (1,)
        if weights is None:
            n = add.reduce(a, axis)
            d = ash[axis] * 1.0
            if returned:
                d = ones(n.shape) * d
        else:
            w = array(weights, copy=False) * 1.0
            wsh = w.shape
            if wsh == ():
                wsh = (1,)
            if wsh == ash:
                n = add.reduce(a*w, axis)
                d = add.reduce(w, axis)
            elif wsh == (ash[axis],):
                ni = ash[axis]
                r = [newaxis]*ni
                r[axis] = slice(None, None, 1)
                w1 = eval("w["+repr(tuple(r))+"]*ones(ash, float)")
                n = add.reduce(a*w1, axis)
                d = add.reduce(w1, axis)
            else:
                raise ValueError, 'averaging weights have wrong shape'

    if not isinstance(d, ndarray):
        if d == 0.0:
            raise ZeroDivisionError, 'zero denominator in average()'
    if returned:
        return n/d, d
    else:
        return n/d