Example #1
# Imports used by this snippet (module paths assumed from the nonparaecon package referenced
# later in this file; the original excerpt starts mid-script)
import numpy as np
from nonparaecon import kde, nptests


# Data-generating process; the function header is reconstructed from the call below
def datagenproc(x):
    #y = 20 + x ** 2 * np.cos(0.5 * x) * int(x < 0) + 10 * np.sin(x) * np.cos(x) * x * int(0 <= x)
    y = -x
    return y


y = [datagenproc(i) for i in x]
ynoise = [datagenproc(i) + 2 * np.random.randn(1)[0] for i in x]
#ynoise = np.random.randn(len(x))

# correlation coeff
print('r_xy=\n', np.corrcoef(np.array([x, ynoise]).T, rowvar=0))

# get Silvermans opt bandwidth
h_x = kde.kde_pdf(x=None,
                  sampledata=np.array(x),
                  kerneltype='epanechnikov',
                  biascorrected=False,
                  getsilverman=True)
h_y = kde.kde_pdf(x=None,
                  sampledata=np.array(y),
                  kerneltype='epanechnikov',
                  biascorrected=False,
                  getsilverman=True)

# check independence
tstat = nptests.nptests_ahmadli(xdata=x,
                                ydata=y,
                                bandwidthx=h_x,
                                bandwidthy=h_y,
                                kerneltype='epanechnikov')
print('tstat=', tstat)
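
# A minimal contrast sketch, assuming the same kde and nptests API as in the calls above: run the
# test on independent draws, where the statistic should stay small. The variable names here
# (xindep, yindep, h_xi, h_yi, tstat_indep) are new and purely illustrative.
xindep = np.random.randn(100)
yindep = np.random.randn(100)
h_xi = kde.kde_pdf(x=None, sampledata=xindep, kerneltype='epanechnikov',
                   biascorrected=False, getsilverman=True)
h_yi = kde.kde_pdf(x=None, sampledata=yindep, kerneltype='epanechnikov',
                   biascorrected=False, getsilverman=True)
tstat_indep = nptests.nptests_ahmadli(xdata=xindep, ydata=yindep,
                                      bandwidthx=h_xi, bandwidthy=h_yi,
                                      kerneltype='epanechnikov')
print('tstat under independence=', tstat_indep)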
Example #2
sampleMulti = np.random.multivariate_normal(mu, sigma, size=50) ** 2

# Use kde_pdf to predict the density based on these sample data in a range. Also predict confidence intervals for the
# univariate and compute the true density in the range.
support = np.arange(0, 40, 0.8)
supportx1 = np.arange(-4, 4, 0.2)
supportx2 = np.arange(-3, 3, 0.2)
# Predefine the lists
fhatUni = list()
ci_low = list()
ci_high = list()
fUni = list()
# Univariate
for x in support:
    # Call the function with CI mode
    hat, low, high = kde.kde_pdf(x, sampleUni, 'epanechnikov', kernelorder=2, biascorrected=True,
                                 correctboundarybias=False, flowbound=0, bandwidth=None, confidint=True)
    fhatUni.append(hat)
    ci_low.append(low)
    ci_high.append(high)
    # Compute the true density
    fUni.append(stats.chi2.pdf(x=x, df=20))
print('\nUnivariate estimation is done.\n')
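# A quick hedged check using only the lists built above: the share of grid points where the
# true chi2(20) density falls inside the estimated confidence band [ci_low, ci_high].
coverage = np.mean([lo <= truth <= hi for lo, hi, truth in zip(ci_low, ci_high, fUni)])
print('Share of grid points with the true density inside the CI:', coverage)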
# Multivariate
fhatMulti = [[kde.kde_pdf(np.array([x1, x2]), sampleMulti, 'epanechnikov',
                          correctboundarybias=False, flowbound=np.array([0, 0]),
                          bandwidth=np.array([0.5, 0.5]), confidint=True)
              for x1 in supportx1] for x2 in supportx2]
fMulti = [[stats.multivariate_normal.pdf([x1, x2], mean=mu, cov=sigma) for x1 in supportx1] for x2 in supportx2]
print('Multivariate estimation is done.\n')

with open(r'C:\Users\Máté\Dropbox\CEU\2017 Spring\Nonparametric\Nonparametric_ps2\bweight_age_white_20K.txt', 'r')\
          as myfile:
    filein = myfile.readlines()
# break the lines and convert into np array
rawdata = np.array([float(line.split()[0]) for line in filein])
# weight
weight_w = rawdata[0:int(len(rawdata) / 2)]
# age
age_w = rawdata[int(len(rawdata) / 2):]

# Size
n = len(age_b)

# Get Silverman's bandwidth for Gaussian and Epa kernels
# gaussian
h_gau_weight_b = kde.kde_pdf(x=None, sampledata=weight_b, kerneltype='gaussian', kernelorder=2, getsilverman=True)
h_gau_weight_w = kde.kde_pdf(x=None, sampledata=weight_w, kerneltype='gaussian', kernelorder=2, getsilverman=True)
h_gau_age_b = kde.kde_pdf(x=None, sampledata=age_b, kerneltype='gaussian', kernelorder=2, getsilverman=True)
h_gau_age_w = kde.kde_pdf(x=None, sampledata=age_w, kerneltype='gaussian', kernelorder=2, getsilverman=True)
# epa
h_epa_weight_b = kde.kde_pdf(x=None, sampledata=weight_b, kerneltype='epanechnikov', kernelorder=2, getsilverman=True)
h_epa_weight_w = kde.kde_pdf(x=None, sampledata=weight_w, kerneltype='epanechnikov', kernelorder=2, getsilverman=True)
h_epa_age_b = kde.kde_pdf(x=None, sampledata=age_b, kerneltype='epanechnikov', kernelorder=2, getsilverman=True)
h_epa_age_w = kde.kde_pdf(x=None, sampledata=age_w, kerneltype='epanechnikov', kernelorder=2, getsilverman=True)
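
# For intuition, a minimal sketch of the Gaussian reference rule that Silverman's bandwidth is
# usually based on, h = 0.9 * min(std, IQR / 1.34) * n ** (-1 / 5); the library's getsilverman
# flag may apply a kernel-specific constant instead, so this helper is only a rough cross-check.
def silverman_reference(sample):
    sample = np.asarray(sample, dtype=float)
    iqr = np.subtract(*np.percentile(sample, [75, 25]))
    return 0.9 * min(sample.std(), iqr / 1.34) * len(sample) ** (-1 / 5)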

# Compute the test statistics for different kernels and scaled bandwidths
# summary to print as table at the end
matrixtoprint = np.empty((3, 4))
matrixindex = 0
subsamplesize = 100
for scale in [1 / 3, 1, 3]:
weight_w_train_st = (weight_w_train - weight_w_train.mean()) / weight_w_train.std()
# test
age_b_test_st = (age_b_test - age_b_test.mean()) / age_b_test.std()
age_w_test_st = (age_w_test - age_w_test.mean()) / age_w_test.std()
weight_b_test_st = (weight_b_test - weight_b_test.mean()) / weight_b_test.std()
weight_w_test_st = (weight_w_test - weight_w_test.mean()) / weight_w_test.std()

# Time of preparing data
start_time_lscv_b = time.time()
print('Preparing data: %s seconds\n' % (start_time_lscv_b - start_time))

# ###################### TRAINING ##############################
# Bandwidth
kerneltype = 'epanechnikov'
h_age_b = kde.kde_pdf(x=None,
                      sampledata=age_b_train_st,
                      kerneltype=kerneltype,
                      getsilverman=True)
h_age_w = kde.kde_pdf(x=None,
                      sampledata=age_w_train_st,
                      kerneltype=kerneltype,
                      getsilverman=True)
h_weight_b = kde.kde_pdf(x=None,
                         sampledata=weight_b_train_st,
                         kerneltype=kerneltype,
                         getsilverman=True)
h_weight_w = kde.kde_pdf(x=None,
                         sampledata=weight_w_train_st,
                         kerneltype=kerneltype,
                         getsilverman=True)
print('Silverman\'s bandwidth, blacks, age and weight:', [h_age_b, h_weight_b])
print('Silverman\'s bandwidth, whites, age and weight:', [h_age_w, h_weight_w])
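
# The timer above is labelled lscv, so a least-squares cross-validation step presumably follows
# in the full script. As a hedged sketch of the LSCV criterion for one candidate bandwidth h,
# using the leave-one-out options of kde.kde_pdf shown later in this file and a Riemann-sum
# approximation of the integral term (the helper name and the grid argument are illustrative):
def lscv_criterion(h, data, grid, kerneltype='epanechnikov'):
    step = grid[1] - grid[0]
    # integral of fhat squared, approximated on the grid
    fhat_grid = np.array([kde.kde_pdf(x=g, sampledata=data, kerneltype=kerneltype,
                                      bandwidth=h, biascorrected=False) for g in grid])
    term1 = step * np.sum(fhat_grid ** 2)
    # leave-one-out average of fhat at the sample points
    term2 = np.mean([kde.kde_pdf(x=data[i], sampledata=data, kerneltype=kerneltype,
                                 bandwidth=h, biascorrected=False,
                                 leaveoneout=True, leftoutindex=i) for i in range(len(data))])
    return term1 - 2 * term2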
Example #5
# For each x
# index
i_x = 0
# loop
for x in [1, 1.5, 2]:
    fx = stats.chi2.pdf(x=x, df=1)
    # for each sample size
    # index
    i_n = 0
    # loop
    for n in [50, 100, 250, 500]:
        print('Currently working on x=', x, ' and n=', n, '\n')
        # get Silverman's bandwidth, using the average of 40 samples of size n
        h_s = kde.kde_pdf(
            x=None,
            sampledata=np.mean(
                [np.random.chisquare(df=1, size=n) for i in range(40)], 0),
            kerneltype='epanechnikov',
            getsilverman=True)
        # iteration
        for m in range(M):
            # draw sample
            sample = np.random.chisquare(df=1, size=n)
            # estimate fhat and ci for Silverman and rescaled Silverman bandwidth
            # (for the coverage probability only the CI is needed, not fhat; fhat is needed only for
            # the rescaled bias computation)
            fhatt, ci_low, ci_high = kde.kde_pdf(x=x,
                                                 sampledata=sample,
                                                 kerneltype='epanechnikov',
                                                 bandwidth=None,
                                                 correctboundarybias=False,
                                                 biascorrected=False,
                                                 confidint=True)
Example #6
# True f(x) for f~uniform[0,1]
f = 1
# Seed random
np.random.seed([0])

# For each x
# index
i = 0
# loop
for x in [0, 0.05, 0.1, 0.5, 0.6]:
    # iteration
    for m in range(M):
        # draw sample of size n
        sample = np.random.rand(n)
        # compute the kernel density estimation, given the sample, for (bandwidth, kerneltype) tuples
        fhat_m[m, 0:4] = [kde.kde_pdf(x=x, sampledata=sample, bandwidth=0.1,
                                      kerneltype='epanechnikov', biascorrected=False),
                          kde.kde_pdf(x=x, sampledata=sample, bandwidth=0.25,
                                      kerneltype='epanechnikov', biascorrected=False),
                          kde.kde_pdf(x=x, sampledata=sample, bandwidth=0.1,
                                      kerneltype='gaussian', biascorrected=False),
                          kde.kde_pdf(x=x, sampledata=sample, bandwidth=0.25,
                                      kerneltype='gaussian', biascorrected=False)
                          ]
        # mean over (bandwidth,kerneltype) tuples
        fhat_m[m, 4] = np.mean(fhat_m[m, 0:4])
    # average over iterations
    Efhat[i, :] = np.mean(fhat_m, 0)
    # Compute bias
    bias[i, :] = Efhat[i,:] - f
    # increase row index for the next x
    i = i + 1
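
# A short note on the expected pattern (assuming symmetric kernels and no boundary correction):
# for the uniform[0, 1] density roughly half of the kernel mass falls outside the support at
# x = 0, so E[fhat(0)] is about f / 2 and the bias there is about -f / 2, while points well
# inside the support, such as x = 0.5, should show bias close to zero.
print('Rough theoretical bias at the boundary x=0:', -f / 2)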
Example #7
    filein = myfile.readlines()
# break lines and convert into numpy array
bw_white = np.array([float(line.split()[0]) for line in filein])
# black
with open(r'C:\Users\Máté\Dropbox\CEU\2017 Spring\Nonparametric\Nonparametric_ps1\bweight_black_20K.txt', 'r') \
        as myfile:
    filein = myfile.readlines()
# break lines and convert into numpy array
bw_black = np.array([float(line.split()[0]) for line in filein])

# Part (a) #
# kernel density estimation for whites, Epa kernel, Silverman's bandwidth, plot
# domain
support = np.arange(0, bw_white.max() + 0.3 * bw_white.std(), 100)
# estimation
fhat_white_s = [kde.kde_pdf(x=x, sampledata=bw_white, kerneltype='epanechnikov', bandwidth=None, bandwidthscale=None,
                            biascorrected=False, correctboundarybias=False) for x in support]
# plot
kde.kde_plot(fhat=fhat_white_s, ismultiple=False, fsupport=support, plottitle='KDE of white birthweight, Epa. with '
             'Silverman\'s bandwidth', xlabel='gram', ylabel=r'$\hat{f}(x)$', savemode=True,
             filepath=r'C:\Users\Máté\Dropbox\CEU\2017 Spring\Nonparametric\Nonparametric_ps1\Problem1_a',
             viewmode=True)

# Part (b) #
# kernel density estimation for whites, Epa kernel, Silverman's bandwidth times 1/5 and 5, plot
# estimation
fhat_white_s_1over5 = [kde.kde_pdf(x=x, sampledata=bw_white, kerneltype='epanechnikov',
                                   bandwidth=None, bandwidthscale=1 / 5,
                                   biascorrected=False, correctboundarybias=False) for x in support]
fhat_white_s_5 = [kde.kde_pdf(x=x, sampledata=bw_white, kerneltype='epanechnikov',
                              bandwidth=None, bandwidthscale=5,
                              biascorrected=False, correctboundarybias=False) for x in support]
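
# A small sanity-check sketch using only the objects above: each estimate should integrate to
# roughly one over the support (the grid step is 100 grams), up to truncation of the tails.
for label, fh in [('Silverman', fhat_white_s), ('Silverman * 1/5', fhat_white_s_1over5),
                  ('Silverman * 5', fhat_white_s_5)]:
    print(label, 'integrates to approximately', 100 * np.sum(fh))

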
def nptests_ahmadli(xdata,
                    ydata,
                    bandwidthx,
                    bandwidthy,
                    kerneltype,
                    kernelorder=2,
                    subsamplesize=None,
                    getpvalue=False):
    """
    Ahmad & Li nonparametric test of statistical independence of two random variables, which can be multivariate.
    :param xdata: sample data on x, n*d_x sized, n: number of observations, d_x number of variables in x
    :param ydata: sample data on y, n*d_y sized, n: number of observations, d_y number of variables in y
    :param bandwidthx: scalar or list of length d_x, bandwidths to use in the kernel. If scalar, the same bandwidth
     is used for all variables in x
    :param bandwidthy: scalar or list of length d_y, bandwidths to use in the kernel. If scalar, the same bandwidth
     is used for all variables in y
    :param kerneltype: string, a name of kernel from kernels.py
    :param kernelorder: order of the kernel
    :param subsamplesize: if given the test uses only subsamplesize randomly chosen elements of the original sample
    :param getpvalue: if True, p-value is returned as well
    :return: teststat, the test statistic; if getpvalue is True, the p-value is returned as well
    """
    # Import dependencies
    import numpy as np
    from nonparaecon.kde import kde_pdf
    from nonparaecon import kernels

    # Subsample to use
    if not (subsamplesize is None):
        # seed random
        np.random.seed([0])
        # permute observations and keep required size
        permindex = np.random.permutation(len(ydata))
        xdata = xdata[permindex[0:subsamplesize]]
        ydata = ydata[permindex[0:subsamplesize]]

    # Get sizes
    n = len(xdata)
    try:
        d_x = np.size(xdata, 1)
    except:
        d_x = 1
    try:
        d_y = np.size(ydata, 1)
    except:
        d_y = 1
    #print('Perceived  number of variables in x and y respectively: ', d_x, d_y)

    # Joint array
    try:
        xyarray = np.array(np.concatenate([xdata, ydata], 1))
    except:
        xyarray = np.array([xdata, ydata]).T

    # Expand bandwidths to list if scalar
    if np.isscalar(bandwidthx):
        bandwidthx_exp = [bandwidthx] * d_x
    else:
        bandwidthx_exp = bandwidthx
    if np.isscalar(bandwidthy):
        bandwidthy_exp = [bandwidthy] * d_y
    else:
        bandwidthy_exp = bandwidthy
    # join
    bandwidthxy = bandwidthx_exp + bandwidthy_exp

    # Components of Itilde
    itilde1 = np.array([(n - 1) / n * kde_pdf(x=xyarray[i],
                                              sampledata=xyarray,
                                              kerneltype=kerneltype,
                                              bandwidth=bandwidthxy,
                                              kernelorder=kernelorder,
                                              biascorrected=False,
                                              leaveoneout=True,
                                              leftoutindex=i)
                        for i in range(n)]).mean()
    itilde2 = np.array([(n - 1) / n * kde_pdf(x=xdata[i], sampledata=xdata, kerneltype=kerneltype,
                                bandwidth=bandwidthx, kernelorder=kernelorder,
                                biascorrected=False, leaveoneout=True, leftoutindex=i) * \
                        (n - 1) / n * kde_pdf(x=ydata[j], sampledata=ydata, kerneltype=kerneltype,
                                bandwidth=bandwidthy, kernelorder=kernelorder,
                                biascorrected=False, leaveoneout=True, leftoutindex=j)
                        for i in range(n) for j in range(n)]).mean()
    itilde3 = -2 * np.array([(n - 1) / n * kde_pdf(x=xdata[i], sampledata=xdata, kerneltype=kerneltype,
                                     bandwidth=bandwidthx, kernelorder=kernelorder, biascorrected=False,
                                     leaveoneout=True, leftoutindex=i) * \
                             (n - 1) / n * kde_pdf(x=ydata[i], sampledata=ydata, kerneltype=kerneltype,
                                     bandwidth=bandwidthy, kernelorder=kernelorder, biascorrected=False,
                                     leaveoneout=True, leftoutindex=i) for i in range(n)]).mean()

    # Numerator
    itilde = itilde1 + itilde2 + itilde3
    bandwidthproduct = np.product(np.array(bandwidthxy))
    numerator = bandwidthproduct * itilde

    # Denominator of the studentized statistic
    # k: double sum over distinct pairs (i, j) of squared product kernels
    # get kernel
    kernel = getattr(kernels, kerneltype)
    # univariate x, univariate y
    if d_x == 1 and d_y == 1:
        k = np.array([kernel((xdata[i] - xdata[j]) / bandwidthx, kernelorder) ** 2 * \
                      kernel((ydata[i] - ydata[j]) / bandwidthy, kernelorder) ** 2
                      for j in range(n) for i in range(n) if j != i]).sum()
    # multivariate x, univariate y
    elif d_x > 1 and d_y == 1:
        k = np.array([np.product([kernel((xdata[i, d] - xdata[j, d]) / bandwidthx_exp[d], kernelorder)
                                  for d in range(d_x)]) ** 2 *\
                      kernel((ydata[i] - ydata[j]) / bandwidthy, kernelorder) ** 2
                      for j in range(n) for i in range(n) if j != i]).sum()
    # univariate x, multivariate y
    elif d_x == 1 and d_y > 1:
        k = np.array([kernel((xdata[i] - xdata[j]) / bandwidthx, kernelorder) ** 2 *\
                      np.product([kernel((ydata[i, d] - ydata[j, d]) / bandwidthy_exp[d], kernelorder)
                                  for d in range(d_y)]) ** 2
                      for j in range(n) for i in range(n) if j != i]).sum()
    # multivariate x, multivariate y
    else:
        k = np.array([np.product([kernel((xdata[i, d] - xdata[j, d]) / bandwidthx_exp[d], kernelorder)
                                  for d in range(d_x)]) ** 2 *\
                      np.product([kernel((ydata[i, d] - ydata[j, d]) / bandwidthy_exp[d], kernelorder)
                                  for d in range(d_y)]) ** 2
                      for j in range(n) for i in range(n) if j != i]).sum()

    denominator = np.sqrt(2 * k)
    # Test statistics
    teststat = n**2 * numerator / denominator
    if not getpvalue:
        return teststat