Example #1
    def _ks_2samp(self, data1, data2):
        import numpy as np
        from numpy import asarray
        from scipy.stats import kstwobign

        data1, data2 = map(asarray,
                           (data1, data2))  # convert input sequences to arrays

        n1 = len(data1)  # number of elements in data1
        n2 = len(data2)  # number of elements in data2

        # searchsorted below assumes data1 and data2 are already sorted;
        # uncomment if the caller does not guarantee that
        #data1 = np.sort(data1)
        #data2 = np.sort(data2)

        data_all = np.concatenate(
            [data1, data2]
        )  # all observed values; the two ECDFs are compared at these points

        cdf1 = np.searchsorted(data1, data_all, side='right') / n1
        cdf2 = np.searchsorted(data2, data_all, side='right') / n2

        d = np.max(np.absolute(cdf1 - cdf2))
        # Note: d is the absolute, not signed, distance
        en = np.sqrt(n1 * n2 / float(n1 + n2))

        try:
            prob = kstwobign.sf((en + 0.12 + 0.11 / en) * d)
        except Exception:
            prob = 1.0
        return prob
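The term (en + 0.12 + 0.11 / en) * d is the Stephens (1970) small-sample correction that maps the finite-sample KS distance onto the asymptotic kstwobign distribution. Since the method above belongs to a class, here is a minimal standalone sketch of the same calculation (the function name ks_2samp_asymptotic is chosen only for illustration), checked against scipy.stats.ks_2samp:

import numpy as np
from scipy.stats import ks_2samp, kstwobign


def ks_2samp_asymptotic(data1, data2):
    """Asymptotic two-sample KS p-value, mirroring the method above."""
    data1, data2 = np.sort(data1), np.sort(data2)
    n1, n2 = len(data1), len(data2)
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / n1
    cdf2 = np.searchsorted(data2, data_all, side='right') / n2
    d = np.max(np.abs(cdf1 - cdf2))
    en = np.sqrt(n1 * n2 / (n1 + n2))
    return kstwobign.sf((en + 0.12 + 0.11 / en) * d)


rng = np.random.default_rng(0)
a, b = rng.normal(size=200), rng.normal(size=300)
print(ks_2samp_asymptotic(a, b))  # asymptotic approximation
print(ks_2samp(a, b).pvalue)      # SciPy's reference value, comparable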
Example #2
def ks2d2s(x1, y1, x2, y2, nboot=None, extra=False):
    '''Two-dimensional Kolmogorov-Smirnov test on two samples.
    Parameters
    ----------
    x1, y1 : ndarray, shape (n1, )
        Data of sample 1.
    x2, y2 : ndarray, shape (n2, )
        Data of sample 2. Size of two samples can be different.
    extra: bool, optional
        If True, KS statistic is also returned. Default is False.
    Returns
    -------
    p : float
        Two-tailed p-value.
    D : float, optional
        KS statistic. Returned if keyword `extra` is True.
    Notes
    -----
    This is the two-sided K-S test. Small p-values mean that the two samples
    are significantly different. Note that the p-value is only an approximation,
    as the analytic distribution is unknown. The approximation is accurate
    enough when N > ~20 and p-value < ~0.20 or so. When p-value > 0.20, the
    value may not be accurate, but it certainly implies that the two samples
    are not significantly different. (cf. Press 2007)
    References
    ----------
    Peacock, J.A. 1983, Two-Dimensional Goodness-of-Fit Testing in Astronomy,
    Monthly Notices of the Royal Astronomical Society, vol. 202, pp. 615-627
    Fasano, G. and Franceschini, A. 1987, A Multidimensional Version of the
    Kolmogorov-Smirnov Test, Monthly Notices of the Royal Astronomical Society,
    vol. 225, pp. 155-170
    Press, W.H. et al. 2007, Numerical Recipes, section 14.8
    '''
    assert (len(x1) == len(y1)) and (len(x2) == len(y2))
    n1, n2 = len(x1), len(x2)
    D = avgmaxdist(x1, y1, x2, y2)

    if nboot is None:
        sqen = np.sqrt(n1 * n2 / (n1 + n2))
        r1 = pearsonr(x1, y1)[0]
        r2 = pearsonr(x2, y2)[0]
        r = np.sqrt(1 - 0.5 * (r1**2 + r2**2))
        d = D * sqen / (1 + r * (0.25 - 0.75 / sqen))
        p = kstwobign.sf(d)
    else:
        n = n1 + n2
        x = np.concatenate([x1, x2])
        y = np.concatenate([y1, y2])
        d = np.empty(nboot, 'f')
        for i in range(nboot):
            idx = random.choice(n, n, replace=True)  # `random` here is numpy.random
            ix1, ix2 = idx[:n1], idx[n1:]
            #ix1 = random.choice(n, n1, replace=True)
            #ix2 = random.choice(n, n2, replace=True)
            d[i] = avgmaxdist(x[ix1], y[ix1], x[ix2], y[ix2])
        p = np.sum(d > D).astype('f') / nboot
    if extra:
        return p, D
    else:
        return p
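The snippet relies on module-level imports (numpy as np, random from numpy, pearsonr and kstwobign from scipy.stats) and on a helper avgmaxdist, defined elsewhere in the source module, that computes the Fasano-Franceschini statistic. Below is a simplified, illustrative sketch of that helper plus a usage call; the original helper may handle points on quadrant boundaries differently:

import numpy as np
from numpy import random
from scipy.stats import pearsonr, kstwobign


def avgmaxdist(x1, y1, x2, y2):
    """Illustrative sketch of the Fasano-Franceschini 2D KS statistic: the
    largest difference in quadrant-occupancy fractions, scanning quadrant
    origins over the points of each sample and averaging the two maxima."""
    def quadrant_fracs(ox, oy, x, y):
        right, upper = x > ox, y > oy
        return np.array([np.mean(right & upper), np.mean(~right & upper),
                         np.mean(~right & ~upper), np.mean(right & ~upper)])

    def maxdist(ax, ay):
        return max(np.max(np.abs(quadrant_fracs(px, py, x1, y1) -
                                 quadrant_fracs(px, py, x2, y2)))
                   for px, py in zip(ax, ay))

    return 0.5 * (maxdist(x1, y1) + maxdist(x2, y2))


rng = np.random.RandomState(42)
x1, y1 = rng.standard_normal(100), rng.standard_normal(100)
x2, y2 = rng.standard_normal(150) + 0.5, rng.standard_normal(150)

print(ks2d2s(x1, y1, x2, y2))                         # asymptotic p-value
print(ks2d2s(x1, y1, x2, y2, nboot=500, extra=True))  # bootstrap p-value and D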
Example #3
    def Prob(self):
        # KS probability function: `en` (effective-sample-size factor) and `d`
        # (KS distance) are expected to have been computed before this is called.
        try:
            from scipy.stats import kstwobign
            prob = kstwobign.sf((en + .12 + .11 / en) * d)
        except Exception:
            prob = 1.0

        self.prob = prob
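For context, en and d are typically computed beforehand, as in the other snippets on this page; a minimal sketch using the same names:

import numpy as np

data1 = np.sort(np.random.normal(size=200))
data2 = np.sort(np.random.normal(size=300))
n1, n2 = len(data1), len(data2)

data_all = np.concatenate([data1, data2])
cdf1 = np.searchsorted(data1, data_all, side='right') / n1
cdf2 = np.searchsorted(data2, data_all, side='right') / n2

d = np.max(np.abs(cdf1 - cdf2))         # KS distance
en = np.sqrt(n1 * n2 / float(n1 + n2))  # effective-sample-size factor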
Example #5
def multi_ks2samp(X, Y, alphas, gamma=None):
    # Two-sample KS test in higher dimensions using QOCSVM quantile sets;
    # QOCSVM and kstwobign are assumed to be imported/defined at module level.
    m, n = X.shape[0], Y.shape[0]
    quants = QOCSVM(alphas, gamma=gamma)
    quants.fit(X)
    result1 = quants.transform(X)
    result2 = quants.transform(Y)

    F1 = result1.mean(axis=0)
    F2 = result2.mean(axis=0)
    max_delta = np.max(np.abs(F1 - F2))
    teststat = np.sqrt((n * m) / (n + m)) * max_delta
    pval = kstwobign.sf(teststat)
    return pval, teststat, max_delta
Example #6
def ks_2samp(a, b, wa=None, wb=None):
    '''
    Compute the Kolmogorov-Smirnov statistic on 2 samples.
    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution.
    Weights for each sample are accepted. If no weights are provided, then
    the function :func:`scipy.stats.ks_2samp` is called instead.

    :param a: first sample.
    :type a: numpy.ndarray
    :param b: second sample.
    :type b: numpy.ndarray
    :param wa: set of weights for "a". Same length as "a".
    :type wa: numpy.ndarray or None.
    :param wb: set of weights for "b". Same length as "b".
    :type wb: numpy.ndarray or None.
    :returns: Test statistic and two-tailed p-value.
    :rtype: float, float
    '''
    if wa is None and wb is None:
        return scipy_ks_2samp(a, b)

    a, cwa, na = _ks_2samp_values(a, wa)
    b, cwb, nb = _ks_2samp_values(b, wb)

    m = np.concatenate([a, b])

    cdfa = cwa[np.searchsorted(a, m, side='right')]
    cdfb = cwb[np.searchsorted(b, m, side='right')]

    d = np.max(np.abs(cdfa - cdfb))

    en = np.sqrt(na*nb/float(na + nb))
    try:
        prob = kstwobign.sf((en + 0.12 + 0.11/en)*d)
    except Exception:
        prob = 1.

    return d, prob
Example #7
def ks_2samp_w(data1, data2, weights1, weights2):
    """
    Reimplementation of ks_2samp from scipy/stats that allows weighted samples.
    From:
    https://stackoverflow.com/questions/40044375/how-to-calculate-the-kolmogorov-smirnov-statistic-between-two-weighted-samples
    
    NOT QUITE SURE IT WORKS. HELPERS ARE WELCOME
    """

    from scipy.stats import kstwobign

    n1 = np.sum(weights1)
    n2 = np.sum(weights2)
    ix1 = np.argsort(data1)
    ix2 = np.argsort(data2)
    data1 = data1[ix1]
    data2 = data2[ix2]
    weights1 = weights1[ix1]
    weights2 = weights2[ix2]
    # data1 and data2 are already sorted by the argsort above
    data_all = np.concatenate([data1, data2])
    cwei1 = np.hstack([0, np.cumsum(weights1) / np.sum(weights1)])
    cwei2 = np.hstack([0, np.cumsum(weights2) / np.sum(weights2)])
    cdf1we = cwei1[np.searchsorted(data1, data_all, side='right')]
    cdf2we = cwei2[np.searchsorted(data2, data_all, side='right')]
    d = np.max(np.absolute(cdf1we - cdf2we))
    # Note: d is the absolute, not signed, distance
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens (1970) small-sample correction, as used by the classic ks_2samp
        prob = kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except Exception:
        prob = 1.0

    return d, prob
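A usage sketch (assumes numpy is imported as np at module level, as in the snippet). With unit weights the result should be close to scipy.stats.ks_2samp:

import numpy as np
from scipy.stats import ks_2samp as scipy_ks_2samp

rng = np.random.default_rng(1)
x = rng.normal(size=500)
y = rng.normal(loc=0.2, size=400)
wx = np.ones(500)   # unit weights reduce to the ordinary two-sample test
wy = np.ones(400)

print(ks_2samp_w(x, y, wx, wy))  # (D, p) from the weighted implementation
print(scipy_ks_2samp(x, y))      # SciPy's reference, should be comparable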
Example #8
def insertion_p_value(indexes, nlive, batch=0):
    """Compute the p-value from insertion indexes, assuming constant nlive.

    Note that this function doesn't use scipy.stats.kstest as the latter
    assumes continuous distributions.

    For more detail, see https://arxiv.org/abs/2006.03371

    For a rolling test, you should provide the optional parameter batch!=0. In
    this case the test computes the p value on consecutive batches of size
    nlive * batch, selects the smallest one and adjusts for multiple
    comparisons using a Bonferroni correction.

    Parameters
    ----------
    indexes: array-like
        list of insertion indexes, sorted by death contour

    nlive: int
        number of live points

    batch: float
        batch size in units of nlive for a rolling p-value

    Returns
    -------
    ks_result: dict
        Kolmogorov-Smirnov test results:
            D: Kolmogorov-Smirnov statistic
            sample_size: sample size
            p-value: p-value
            # if batch != 0
            iterations: bounds of batch with minimum p-value
            nbatches: the number of batches in total
            uncorrected p-value: p-value without Bonferroni correction
    """
    if batch == 0:
        bins = np.arange(-0.5, nlive + 0.5, 1.)
        empirical_pmf = np.histogram(indexes, bins=bins, density=True)[0]
        empirical_cmf = np.cumsum(empirical_pmf)
        uniform_cmf = np.arange(1., nlive + 1., 1.) / nlive

        D = abs(empirical_cmf - uniform_cmf).max()
        sample_size = len(indexes)
        K = D * np.sqrt(sample_size)

        ks_result = {}
        ks_result["D"] = D
        ks_result["sample_size"] = sample_size
        ks_result["p-value"] = kstwobign.sf(K)
        return ks_result
    else:
        batch = int(batch * nlive)
        batches = [indexes[i:i + batch] for i in range(0, len(indexes), batch)]
        ks_results = [insertion_p_value(c, nlive) for c in batches]
        ks_result = min(ks_results, key=lambda t: t["p-value"])
        index = ks_results.index(ks_result)

        ks_result["iterations"] = (index * batch, (index + 1) * batch)
        ks_result["nbatches"] = n = len(batches)
        ks_result["uncorrected p-value"] = p = ks_result["p-value"]
        ks_result["p-value"] = 1. - (1. - p)**n
        if ks_result["p-value"] == 0.:
            ks_result["p-value"] = p * n
        return ks_result
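A usage sketch (the snippet assumes numpy as np and kstwobign from scipy.stats are imported at module level). Under the null hypothesis of a correct nested sampling run, insertion indexes are uniform on {0, ..., nlive - 1}:

import numpy as np
from scipy.stats import kstwobign

nlive = 500
indexes = np.random.randint(0, nlive, size=20 * nlive)  # simulated null-case indexes

print(insertion_p_value(indexes, nlive))           # single global test
print(insertion_p_value(indexes, nlive, batch=1))  # rolling test with Bonferroni correction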
Example #9
def ks_2samp(data1, data2):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples.
    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.
    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a continuous
        distribution, sample sizes can be different
    Returns
    -------
    D : float
        KS statistic
    p-value : float
        two-tailed p-value
    Notes
    -----
    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample K-S test, the distribution is
    assumed to be continuous.
    This is the two-sided test, one-sided tests are not implemented.
    The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.
    If the K-S statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples
    are the same.
    Examples
    --------
    >>> from scipy import stats
    >>> np.random.seed(12345678)  #fix random seed to get the same result
    >>> n1 = 200  # size of first sample
    >>> n2 = 300  # size of second sample
    For a different distribution, we can reject the null hypothesis since the
    pvalue is below 1%:
    >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
    >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    >>> stats.ks_2samp(rvs1, rvs2)
    (0.20833333333333337, 4.6674975515806989e-005)
    For a slightly different distribution, we cannot reject the null hypothesis
    at a 10% or lower alpha since the p-value at 0.144 is higher than 10%
    >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
    >>> stats.ks_2samp(rvs1, rvs3)
    (0.10333333333333333, 0.14498781825751686)
    For an identical distribution, we cannot reject the null hypothesis since
    the p-value is high, 41%:
    >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
    >>> stats.ks_2samp(rvs1, rvs4)
    (0.07999999999999996, 0.41126949729859719)
    """
    data1, data2 = map(asarray, (data1, data2))
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0 * n2)
    darray = cdf1 - cdf2
    d = np.max(np.absolute(darray))
    # d is returned signed: negative if the largest deviation occurs where
    # cdf2 exceeds cdf1, positive otherwise
    if d == -np.min(darray):
        d = -d
        jamfri = np.min(np.where(darray == np.min(darray))[0])
    else:
        jamfri = np.min(np.where(darray == darray.max())[0])
    # tau is the data value at which the maximum ECDF deviation occurs
    tau = data_all[jamfri]
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # the p-value uses the magnitude of the deviation
        prob = kstwobign.sf((en + 0.12 + 0.11 / en) * abs(d))
    except Exception:
        prob = 1.0
    return d, prob, tau
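A usage sketch (this variant additionally returns tau, the data value at which the maximum ECDF deviation occurs; the snippet assumes numpy as np, asarray from numpy and kstwobign from scipy.stats at module level):

import numpy as np
from numpy import asarray
from scipy.stats import kstwobign

rvs1 = np.random.normal(loc=0.0, scale=1.0, size=200)
rvs2 = np.random.normal(loc=0.5, scale=1.5, size=300)

d, prob, tau = ks_2samp(rvs1, rvs2)
print(d, prob, tau)  # signed KS distance, asymptotic p-value, location of the largest gap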
Example #10
def ks_2samp(a, b, aw=None, bw=None):
    """
    Computes the Kolmogorov-Smirnov (KS) statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.

    Parameters
    ----------
    a, b : Sequence of 1D ndarrays
        Two arrays of sample observations assumed to be drawn from a continuous
        distribution, sample sizes can be different.
    aw, bw: Sequence of 1D ndarrays, optional
        The weights of each observation in a, b. Must be the same length as the
        associated array of observations. If omitted or None, every measurement
        will be assigned an equal weight.

    Returns
    -------
    D : float
        KS statistic
    p-value : float
        Two-tailed p-value

    Notes
    -----
    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample KS test, the distribution is
    assumed to be continuous.

    This is the two-sided test, one-sided tests are not implemented. The test
    uses the two-sided asymptotic KS distribution.

    If the KS statistic is small or the p-value is high, then we cannot reject
    the hypothesis that the distributions of the two samples are the same.

    This function accounts for weights using the recommendations found in [1].

    Convergence is improved in the large-sample KS distribution by using the
    form found by [2].

    References
    ----------
    [1] J. Monahan, "Numerical Methods of Statistics" 2nd Ed., 2011

    [2] M. A. Stephens "Use of the Kolmogorov-Smirnov, Cramer-Von Mises and
    Related Statistics Without Extensive Tables", Journal of the Royal
    Statistical Society, Series B (Methodological), Vol. 32, No. 1., pp.
    115-122, 1970
    """

    # Methodology for weighted Kolmogorov-Smirnov test taken from Numerical
    # Methods of Statistics - J. Monahan

    ab = np.sort(np.concatenate((a, b)))

    D = np.max(np.abs(ecdf(a, aw)(ab) - ecdf(b, bw)(ab)))

    n1 = len(a) if aw is None else np.sum(aw)**2 / np.sum(aw**2)
    n2 = len(b) if bw is None else np.sum(bw)**2 / np.sum(bw**2)

    en = np.sqrt(n1 * n2 / float(n1 + n2))

    p = kstwobign.sf((en + 0.12 + 0.11 / en) * D)  # Stephens (1970)

    return D, p
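The snippet depends on an ecdf helper that returns a callable (weighted) empirical CDF; it is defined elsewhere in the source module. One plausible implementation, for illustration only, plus a usage call:

import numpy as np
from scipy.stats import kstwobign


def ecdf(x, weights=None):
    """Weighted empirical CDF: returns a callable F with F(t) equal to the
    normalized weight of observations <= t. (Illustrative; the helper in the
    source module may differ in details.)"""
    x = np.asarray(x, dtype=float)
    weights = np.ones_like(x) if weights is None else np.asarray(weights, dtype=float)
    order = np.argsort(x)
    xs = x[order]
    cw = np.concatenate([[0.0], np.cumsum(weights[order]) / np.sum(weights)])

    def cdf(t):
        return cw[np.searchsorted(xs, t, side='right')]

    return cdf


rng = np.random.default_rng(3)
a, b = rng.normal(size=400), rng.normal(loc=0.1, size=300)
aw = rng.uniform(0.5, 1.5, size=400)  # per-observation weights for sample a
print(ks_2samp(a, b, aw=aw))          # (D, p) using Kish effective sample sizes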
Example #11
    def calibrateModelDRO(self, sigma_n_grid, batch_size=16, parallel=True):
        n1 = self.data_feature.shape[0]
        data_feature_sorted = self.data_feature.copy()
        data_feature_sorted.sort(axis=0)
        empirical_quantile = np.array([np.arange(0., 1 - 1e-15, 1 / n1)] *
                                      self.encode_length).T
        # self.plotFeatureDistribution(data_feature_sorted)

        scaler = MinMaxScaler()
        simulate_quantile_list = []

        for sigma_n in sigma_n_grid:
            time_start = time.time()
            mid_prices_dist = self.generateTimeSeriesDistribution(
                sigma_n, batch_size, parallel)
            mid_prices_dist = np.array([
                scaler.fit_transform(mid_prices)
                for mid_prices in mid_prices_dist
            ])

            feature_dist = self.encode(mid_prices_dist[:, :, 0])

            feature_dist.sort(axis=0)
            n2 = len(feature_dist)
            simulate_quantile = np.full_like(empirical_quantile, 0.)
            for i in range(self.encode_length):
                simulate_quantile[:, i] = np.searchsorted(
                    feature_dist[:, i],
                    data_feature_sorted[:, i],
                    side='right') / n2
            simulate_quantile_list.append(simulate_quantile)

            loss = np.max(np.abs(empirical_quantile - simulate_quantile))
            loss_argmax = np.argmax(
                np.abs(empirical_quantile - simulate_quantile))
            arg_x, arg_y = (loss_argmax // self.encode_length,
                            loss_argmax % self.encode_length)
            time_end = time.time()
            print("sigma_n {} finished with time {}, loss is {} at {}.".format(
                sigma_n, time_end - time_start, loss, (arg_x, arg_y)))
            logprint(
                "sigma_n {} finished with time {}, loss is {} at {}.\n".format(
                    sigma_n, time_end - time_start, loss, (arg_x, arg_y)))
            # self.plotFeatureDistribution(feature_dist)
            #  self.plotQuantileHeatmap(empirical_quantile, simulate_quantile)

        simulate_quantile_list = np.array(simulate_quantile_list)

        print("Quantile calculation finished. Start optimization.")
        logprint("Quantile calculation finished. Start optimization.\n")
        m = gurobipy.Model("DRO")

        q = m.addVar(vtype=GRB.CONTINUOUS, name='q')
        W = dict()
        W_sum = 0
        quantile_avg = dict()
        m.addConstr(q >= 0, name="positive_q")
        for i in range(len(sigma_n_grid)):
            W[i] = m.addVar(vtype=GRB.CONTINUOUS, name='W' + str(i))
            m.addConstr(W[i] >= 0, "positive_W" + str(i))
            W_sum += W[i]
            for j in range(n1):
                for k in range(self.encode_length):
                    if (j, k) in quantile_avg:
                        quantile_avg[(
                            j, k)] += W[i] * simulate_quantile_list[i][j][k]
                    else:
                        quantile_avg[(
                            j, k)] = W[i] * simulate_quantile_list[i][j][k]

        m.addConstr(W_sum == 1, name="sum_prob")
        for j in range(n1):
            for k in range(self.encode_length):
                m.addConstr(empirical_quantile[j][k] - q / np.sqrt(n1 * n2 /
                                                                   (n1 + n2))
                            <= quantile_avg[(j, k)],
                            name="qCons1_" + str(j) + str(k))
                m.addConstr(empirical_quantile[j][k] + q / np.sqrt(n1 * n2 /
                                                                   (n1 + n2))
                            >= quantile_avg[(j, k)],
                            name="qCons2_" + str(j) + str(k))

        m.setObjective(q, GRB.MINIMIZE)
        m.optimize()
        print("Optimization finished.")
        logprint("Optimization finished.\n")

        W_optimal = [W[i].x for i in W]
        plt.plot(sigma_n_grid, W_optimal)
        plt.xlabel(r"$\sigma_n^2$")
        plt.ylabel("Weights")
        plt.savefig("weights_l_{}_b_{}.png".format(self.lambda_a, batch_size))
        plt.show()

        return m.objVal, kstwobign.sf(m.objVal), W_optimal
Example #12
    def calibrateModelDRO(self, sigma_n_grid, batch_size=16, parallel=True):
        n1 = len(self.fundamental_returns)
        fundamental_returns_sorted = np.sort(self.fundamental_returns)
        fund_quantile = np.searchsorted(fundamental_returns_sorted,
                                        self.fundamental_returns,
                                        side='right') / n1
        quantile_dict = dict()
        for sigma_n in sigma_n_grid:
            time_start = time.time()
            dist_sim = self.generateReturnDistribution(sigma_n, batch_size,
                                                       parallel)
            dist_sim = np.sort(dist_sim)
            n2 = len(dist_sim)
            quantile_dict[sigma_n] = np.searchsorted(
                dist_sim, self.fundamental_returns, side='right') / n2
            time_end = time.time()
            print("sigma_n {} finished with total time {}, loss {}.".format(
                sigma_n, time_end - time_start,
                np.max(quantile_dict[sigma_n] - fund_quantile)))
            logprint(
                "sigma_n {} finished with total time {}, loss {}.\n".format(
                    sigma_n, time_end - time_start,
                    np.max(quantile_dict[sigma_n] - fund_quantile)))

        print("Quantile calculation finished. Start optimization")
        m = Model("DRO")

        q = m.addVar(vtype=GRB.CONTINUOUS, name='q')
        W = dict()
        W_sum = 0
        quantile_avg = dict()
        m.addConstr(q >= 0, name="positive_q")
        for i in range(len(sigma_n_grid)):
            W[i] = m.addVar(vtype=GRB.CONTINUOUS, name='W' + str(i))
            m.addConstr(W[i] >= 0, "positive_W" + str(i))
            W_sum += W[i]
            for j in range(len(fund_quantile)):
                if j in quantile_avg:
                    quantile_avg[j] += W[i] * quantile_dict[sigma_n_grid[i]][j]
                else:
                    quantile_avg[j] = W[i] * quantile_dict[sigma_n_grid[i]][j]

        m.addConstr(W_sum == 1, name="sum_prob")
        for j in range(len(fund_quantile)):
            m.addConstr(
                fund_quantile[j] - q / np.sqrt(n1 * n2 /
                                               (n1 + n2)) <= quantile_avg[j],
                name="qCons1_" + str(j))
            m.addConstr(
                fund_quantile[j] + q / np.sqrt(n1 * n2 /
                                               (n1 + n2)) >= quantile_avg[j],
                name="qCons2_" + str(j))

        m.setObjective(q, GRB.MINIMIZE)
        m.optimize()
        print("Optimization finished.")

        W_optimal = [W[i].x for i in W]
        plt.plot(sigma_n_grid, W_optimal)
        plt.xlabel(r"$\sigma_n^2$")
        plt.ylabel("Weights")
        plt.savefig(log_file + "_weights.png")
        plt.show()

        return m.objVal, kstwobign.sf(m.objVal), W_optimal