def ks_stat(data1, data2):
    """Return the largest gap between the ECDF of *data1* and a target CDF.

    The target CDF is obtained by evaluating ``dcst.ecdf_formal`` with
    *data2* at the x-values returned by ``dcst.ecdf(data1)``.  Two gaps
    are considered at every ECDF point: the ECDF value minus the target,
    and the target minus the ECDF value shifted up by one step of size
    ``1 / len(data1)``.  The maximum over all of these gaps is returned.
    """
    ecdf_x, ecdf_y = dcst.ecdf(data1)
    target = dcst.ecdf_formal(ecdf_x, data2)

    # Gap measured from the upper (concave) corner of each ECDF step.
    gap_above = ecdf_y - target
    # Gap measured from the lower (convex) corner of each ECDF step.
    gap_below = target - ecdf_y + 1 / len(data1)

    return np.max((gap_above, gap_below))
Exemplo n.º 2
0
def test_ecdf_formal_custom():
    """Check ``dcst.ecdf_formal`` on hand-picked inputs.

    Covers scalar evaluation points on sorted and unsorted samples, the
    error raised for NaN evaluation points, and evaluation points at or
    beyond the sample maximum (including infinity).
    """
    # (evaluation point, sample, expected ECDF value); every case carries
    # its own list literal so no sample object is shared between calls.
    scalar_cases = (
        (0.1, [0, 1, 2, 3], 0.25),
        (-0.1, [0, 1, 2, 3], 0.0),
        (0.1, [3, 2, 0, 1], 0.25),
        (-0.1, [3, 2, 0, 1], 0.0),
        (2, [3, 2, 0, 1], 0.75),
        (1, [3, 2, 0, 1], 0.5),
        (3, [3, 2, 0, 1], 1.0),
        (0, [3, 2, 0, 1], 0.25),
    )
    for point, sample, expected in scalar_cases:
        assert dcst.ecdf_formal(point, sample) == expected

    # NaN among the evaluation points must be rejected.
    with pytest.raises(RuntimeError) as raised:
        dcst.ecdf_formal([np.nan, np.inf], [0, 1, 2, 3])
    raised.match("Input cannot have NaNs.")

    # Points above the sample maximum, including +inf, evaluate to 1.
    expected_tail = np.array([1.0, 1.0])
    observed_tail = dcst.ecdf_formal([3.1, np.inf], [3, 2, 0, 1])
    assert np.allclose(expected_tail, observed_tail, atol=atol)
Exemplo n.º 3
0
    def predict(self, data=None, digits=3):
        """
            Prediction based on IDR model fit

            Parameters
            ----------
            data : pd.DataFrame, optional
                containing variables with which to predict. The default is
                None, in which case the predictions for the fitted data are
                rebuilt from ``self.ecdf`` and ``self.indices``.
            digits : integer value, optional
                number of decimal places for predictive CDF.
                The default is 3.

            Returns
            -------
            object of class idrpredict, built from
            predictions : list of objects of class predictions_idr:
                points : where predictive CDF has jumps
                ecdf : estimated CDF evaluated at points
                lower : bounds for estimated CDF (out-of-sample predictions)
                upper : bounds for estimated CDF (out-of-sample predictions)
            incomparables : result of ``np.where`` giving the indices of all
                predictions for which the climatological forecast is returned
                because the forecast variables are not comparable to the
                training data. None when ``data`` is None or when the fit
                has a single covariate.

            Raises
            ------
            ValueError
                if ``data`` is not a pandas data frame, or if it lacks any of
                the columns of ``self.X``.

            """

        cdf = self.ecdf.copy()
        thresholds = self.thresholds.copy()
        order_indices = []
        preds = []

        # --- In-sample predictions: one CDF per row of self.ecdf ---------
        if data is None:
            indices = self.indices
            for i in range(indices.shape[0]):
                edf = np.round(cdf[i, :], digits)
                # Keep only the thresholds at which the rounded CDF jumps.
                sel = np.hstack([edf[0] > 0, np.diff(edf) > 0])
                tmp = predictions_idr(ecdf=edf[sel],
                                      points=thresholds[sel],
                                      lower=[],
                                      upper=[])
                # Every index listed in indices[i] shares this prediction.
                for j in indices[i]:
                    order_indices.append(j)
                    preds.append(tmp)
            # Restore the order given by the collected indices.
            preds_rearanged = [preds[k] for k in np.argsort(order_indices)]
            idr_predictions = idrpredict(predictions=preds_rearanged,
                                         incomparables=None)
            return idr_predictions

        if not isinstance(data, pd.DataFrame):
            raise ValueError("data must be a pandas data frame")
        X = self.X.copy()
        if not all(elem in data.columns for elem in X.columns):
            raise ValueError("some variables of idr fit are missing in data")
        data = data.copy()
        data = prepareData(data[X.columns],
                           groups=self.groups,
                           orders=self.orders)
        nVar = data.shape[1]

        # --- Single covariate: interpolate between neighbouring fits -----
        if nVar == 1:
            X = np.array(X[X.columns[0]])
            x = np.array(data[data.columns[0]])
            # NOTE(review): bisect and np.interp both assume X is sorted in
            # increasing order -- confirm against how self.X is built.
            ranks = np.arange(1, X.shape[0] + 1)
            # Index of the nearest training value below each x (clamped to
            # the first training value for points left of the range).
            smaller = np.array([bisect.bisect_left(X, a) for a in x])
            smaller = np.where(smaller == 0, 1, smaller) - 1
            # Fractional position of x between its lower and upper
            # neighbour, used as the weight of the upper neighbour.
            wg = np.interp(x, X, ranks, left=1,
                           right=X.shape[0]) - ranks[smaller.astype(int)]
            greater = smaller + (wg > 0).astype(int)
            ws = 1 - wg
            l = np.round(cdf[greater.astype(int), :], digits)
            u = np.round(cdf[smaller.astype(int), :], digits)

            def fun_preds(l, u, ws, wg):
                # Previous-threshold values, with 0 before the first one.
                ls = np.insert(l[:-1], 0, 0)
                us = np.insert(u[:-1], 0, 0)
                # Keep thresholds where either bound jumps (boolean OR).
                ind = (ls < l) + (us < u)
                l = l[ind]
                u = u[ind]
                est = np.round(np.multiply(l, wg) + np.multiply(u, ws),
                               digits)
                return predictions_idr(ecdf=est,
                                       points=thresholds[ind],
                                       lower=l,
                                       upper=u)

            preds = list(map(fun_preds, l, u, list(ws), list(wg)))
            idr_predictions = idrpredict(predictions=preds, incomparables=None)
            return idr_predictions

        # --- Several covariates: bounds from comparable training points --
        nPoints = neighbor_points(data, X, order_X=self.constraints)
        smaller = nPoints[0]
        greater = nPoints[1]
        # A prediction point is incomparable when it has neither smaller
        # nor greater neighbours among the training points.
        incomparables = np.array(list(map(len, smaller))) + np.array(
            list(map(len, greater))) == 0

        if any(incomparables):
            # Fall back to the ECDF of all observations in self.y.
            y = self.y
            edf = np.round(dcst.ecdf_formal(thresholds, y.explode()), digits)
            sel = edf > 0
            edf = edf[sel]
            points = thresholds[sel]
            # After rounding, several trailing values may equal 1.  Keep
            # everything up to and including the first of them and drop
            # the redundant tail.  (Comparing the whole index array to a
            # scalar raised "truth value of an array is ambiguous".)
            ones = np.flatnonzero(edf == 1)
            if ones.size > 0 and ones[0] < len(edf) - 1:
                keep = ones[0] + 1
                points = points[:keep]
                edf = edf[:keep]
            tmp = predictions_idr(ecdf=edf,
                                  points=points,
                                  lower=edf,
                                  upper=edf)
            for i in np.where(incomparables)[0]:
                preds.append(tmp)
                order_indices.append(i)

        for i in np.where(~incomparables)[0]:
            if smaller[i].size > 0 and greater[i].size == 0:
                # Only smaller neighbours: they give the upper bound, the
                # lower bound is zero.
                upper = np.round(
                    np.amin(cdf[smaller[i].astype(int), :], axis=0), digits)
                sel = np.hstack([upper[0] != 0, np.diff(upper) != 0])
                upper = upper[sel]
                lower = np.zeros(len(upper))
                estimCDF = upper
            elif smaller[i].size == 0 and greater[i].size > 0:
                # Only greater neighbours: they give the lower bound, the
                # upper bound is one.
                lower = np.round(
                    np.amax(cdf[greater[i].astype(int), :], axis=0), digits)
                sel = np.hstack([lower[0] != 0, np.diff(lower) != 0])
                lower = lower[sel]
                upper = np.ones(len(lower))
                estimCDF = lower
            else:
                # Both kinds of neighbours: average the two bounds.
                lower = np.round(
                    np.amax(cdf[greater[i].astype(int), :], axis=0), digits)
                upper = np.round(
                    np.amin(cdf[smaller[i].astype(int), :], axis=0), digits)
                # Keep thresholds where either bound jumps (boolean OR).
                sel = np.hstack(
                    [lower[0] != 0, np.diff(lower) != 0]) + np.hstack(
                        [upper[0] != 0, np.diff(upper) != 0])
                lower = lower[sel]
                upper = upper[sel]
                estimCDF = np.round(0.5 * (lower + upper), digits)

            tmp = predictions_idr(ecdf=estimCDF,
                                  points=thresholds[sel],
                                  lower=lower,
                                  upper=upper)
            order_indices.append(i)
            preds.append(tmp)

        # Put the predictions back into the row order of ``data``.
        preds_rearanged = [preds[k] for k in np.argsort(order_indices)]
        idr_predictions = idrpredict(predictions=preds_rearanged,
                                     incomparables=np.where(incomparables))
        return idr_predictions
Exemplo n.º 4
0
def test_ecdf_formal(x, data):
    """Compare ``dcst.ecdf_formal`` with a ``np.searchsorted`` reference.

    The reference value is the number of sorted *data* entries that are
    less than or equal to *x* (right-sided search) divided by the sample
    size.  NaNs in matching positions are treated as equal.
    """
    ordered = np.sort(data)
    expected = np.searchsorted(ordered, x, side="right") / len(data)
    observed = dcst.ecdf_formal(x, data)
    assert np.allclose(observed, expected, atol=atol, equal_nan=True)