Example no. 1
import numpy.ma as ma
import statc  # Orange's C statistics module; statc.betai is the incomplete beta function

def attest_ind(a, b, dim=None):
    """Return the pooled-variance t-test statistic on arrays a and b over the
    dim axis, together with the two-tailed p-value.
    """
    x1, x2 = ma.mean(a, dim), ma.mean(b, dim)
    v1, v2 = ma.var(a, dim), ma.var(b, dim)
    n1, n2 = (a.shape[dim], b.shape[dim]) if dim is not None else (a.size, b.size)
    df = float(n1 + n2 - 2)
    svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df   # pooled variance
    t = (x1 - x2) / ma.sqrt(svar * (1.0 / n1 + 1.0 / n2))
    if t.ndim == 0:
        # scalar case: p = betai(df/2, 1/2, df/(df + t^2))
        return (t, statc.betai(0.5 * df, 0.5, df / (df + t ** 2))
                if t is not ma.masked and df / (df + t ** 2) <= 1.0 else ma.masked)
    else:
        prob = [statc.betai(0.5 * df, 0.5, df / (df + tsq))
                if tsq is not ma.masked and df / (df + tsq) <= 1.0 else ma.masked
                for tsq in t * t]
        return t, prob
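Both branches convert the statistic to a two-tailed p-value through the incomplete beta function, p = betai(df/2, 1/2, df/(df + t^2)). A minimal usage sketch follows; it assumes Orange's statc module is importable, and the sample values are invented for illustration (masked entries are ignored by the group statistics):

import numpy.ma as ma

# hypothetical data: two small samples, with one missing value masked out
a = ma.masked_invalid([5.1, 4.9, 5.4, float("nan"), 5.0])
b = ma.array([6.2, 6.0, 5.8, 6.4])

t, p = attest_ind(a, b)   # dim=None pools each sample into a scalar test
print t, p                # a large |t| and a small p suggest different means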
Example no. 2
def aF_oneway(*args, **kwargs):
    """One-way ANOVA on masked arrays; returns the F statistic and p-value.
    Assumes numpy.ma imported as ma and Orange's statc module, as above.
    """
    dim = kwargs.get("dim", None)
    arrays = args
    means = [ma.mean(a, dim) for a in arrays]
    vars = [ma.var(a, dim) for a in arrays]
    # per-group counts of non-masked values along dim
    lens = [ma.sum(ma.array(ma.ones(a.shape), mask=ma.asarray(a).mask), dim) for a in arrays]
    alldata = ma.concatenate(arrays, dim if dim is not None else 0)
    bign = ma.sum(ma.array(ma.ones(alldata.shape), mask=alldata.mask), dim)
    sstot = ma.sum(alldata ** 2, dim) - (ma.sum(alldata, dim) ** 2) / bign
    ssbn = ma.sum([(ma.sum(a, dim) ** 2) / L for a, L in zip(arrays, lens)], dim)
    ssbn -= ma.sum(alldata, dim) ** 2 / bign   # between-groups sum of squares
    sswn = sstot - ssbn                        # within-groups sum of squares
    dfbn = dfnum = float(len(args) - 1)        # between-groups degrees of freedom
    dfwn = bign - len(args)                    # within-groups degrees of freedom
    F = (ssbn / dfbn) / (sswn / dfwn)
    if F.ndim == 0 and dfwn.ndim == 0:
        return (F, statc.betai(0.5 * dfwn, 0.5 * dfnum, dfwn / float(dfwn + dfnum * F))
                if F is not ma.masked and 0.0 <= dfwn / float(dfwn + dfnum * F) <= 1.0
                else ma.masked)
    else:
        prob = [statc.betai(0.5 * dfden, 0.5 * dfnum, dfden / float(dfden + dfnum * f))
                if f is not ma.masked and 0.0 <= dfden / float(dfden + dfnum * f) <= 1.0
                else ma.masked
                for dfden, f in zip(dfwn, F)]
        return F, prob
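A usage sketch under the same assumptions (numpy.ma as ma, Orange's statc available); the three groups are invented for illustration:

g1 = ma.array([49.0, 50.2, 48.7, 51.1])
g2 = ma.array([54.3, 55.0, 53.8, 54.9])
g3 = ma.array([50.5, 49.8, 51.2, 50.1])

F, p = aF_oneway(g1, g2, g3)   # no dim keyword: each group reduces to a scalar
print F, p                     # a large F with a small p suggests the group means differ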
Example no. 3
    # assumes module-level imports (numpy, orange, statc; from numpy import dot, sqrt;
    # from numpy.linalg import inv) and a stepwise() feature-selection helper defined
    # alongside this class
    def __call__(self, data, weight=None):
        if self.use_attributes is not None:
            new_domain = orange.Domain(self.use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        if self.stepwise and self.stepwise_before:
            use_attributes = stepwise(data, add_sig=self.add_sig, remove_sig=self.remove_sig)
            new_domain = orange.Domain(use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        # continuization (replaces discrete with continuous attributes)
        continuizer = orange.DomainContinuizer()
        continuizer.multinomialTreatment = continuizer.FrequentIsBase
        continuizer.zeroBased = True
        domain0 = continuizer(data)
        data = data.translate(domain0)

        if self.stepwise and not self.stepwise_before:
            use_attributes = stepwise(data, weight, add_sig=self.add_sig, remove_sig=self.remove_sig)
            new_domain = orange.Domain(use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        # missing values handling (impute missing)
        imputer = orange.ImputerConstructor_model()
        imputer.learnerContinuous = orange.MajorityLearner()
        imputer.learnerDiscrete = orange.MajorityLearner()
        imputer = imputer(data)
        data = imputer(data)

        # conversion to numpy
        A, y, w = data.toNumpy()        # weights ??
        if A is None:
            n = len(data)
            m = 0
        else:
            n, m = numpy.shape(A)
     
        if self.beta0:
            if A is None:
                X = numpy.ones([len(data), 1])
            else:
                X = numpy.insert(A, 0, 1, axis=1)  # adds a column of ones for the intercept
        else:
            X = A

        # set weights (diagonal weight matrix built from the given meta attribute)
        W = numpy.identity(len(data))
        if weight:
            for di, d in enumerate(data):
                W[di, di] = float(d[weight])

        # weighted least squares; the pseudo-inverse adds some robustness, since a
        # plain inverse can fail when X.T*W*X is singular
        D = dot(dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W)
        beta = dot(D, y)

        yEstimated = dot(X, beta)  # fitted values
        # some descriptive statistics
        muY, sigmaY = numpy.mean(y), numpy.std(y)
        muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

        # model statistics
        SST, SSR = numpy.sum((y - muY) ** 2), numpy.sum((yEstimated - muY) ** 2)
        SSE, RSquare = SST - SSR, SSR / SST   # RSquare is the coefficient of determination
        R = numpy.sqrt(RSquare)               # multiple correlation coefficient
        RAdjusted = 1 - (1 - RSquare) * (n - 1) / (n - m - 1)
        F = (SSR / m) / (SSE / (n - m - 1))   # F statistic: MSR / MSE

        sigmaSquare = SSE / (n - m - 1)

        # standard errors of the estimated coefficients
        errCoeff = sqrt(sigmaSquare * inv(dot(X.T, X)).diagonal())

        # t statistics and two-tailed significance
        t = beta / errCoeff
        df = n - 2
        significance = []
        for tt in t:
            try:
                significance.append(statc.betai(df * 0.5, 0.5, df / (df + tt * tt)))
            except Exception:
                significance.append(1.0)
 
        # standardized coefficients
        if m > 0:
            stdCoeff = (sqrt(covX.diagonal()) / sigmaY) * beta
        else:
            stdCoeff = (sqrt(covX) / sigmaY) * beta
 
        model = {'descriptives': {'meanX': muX, 'covX': covX, 'meanY': muY, 'sigmaY': sigmaY},
                 'model': {'estCoeff': beta, 'stdErrorEstimation': errCoeff},
                 'model summary': {'TotalVar': SST, 'ExplVar': SSR, 'ResVar': SSE, 'R': R,
                                   'RAdjusted': RAdjusted, 'F': F, 't': t, 'sig': significance}}
        return LinearRegression(statistics=model, domain=data.domain, name=self.name,
                                beta0=self.beta0, imputer=imputer)
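The __call__ above is the learner's fitting entry point. A sketch of a typical invocation in Orange 2.x follows; the class name LinearRegressionLearner is an assumption (the snippet shows only its __call__), and "housing" is an arbitrary regression dataset:

import orange

data = orange.ExampleTable("housing")   # any regression dataset
learner = LinearRegressionLearner()     # hypothetical name for the class defining __call__ above
model = learner(data)                   # continuization, imputation, then the WLS fit
# the returned LinearRegression carries the statistics dictionary built above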